In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from loading import load_data, load_test
from preprocessing import preprocess_x, preprocess_y

In [None]:
# Load the raw training data via the project loader.
# NOTE(review): presumably `data_x_orig` holds the features and `data_y_orig`
# the targets -- confirm against loading.py.
data_x_orig, data_y_orig = load_data() # loads from ".\data" by default
# Load the raw test set; it is kept separate from the training data.
test_x_orig = load_test() # this is for submission purposes

In [None]:
# Look at missing data: fraction of missing values per column
# (0.0 = complete, 1.0 = entirely missing).
# `isna().mean()` divides the number of missing cells by the total number of
# rows. Dividing by `count()` (the non-missing cells) would instead give the
# odds of missing, and `inf` for a column that is entirely missing.
s = data_x_orig.isna().mean()
# Show only the columns that have at least one missing value, worst first.
s[s > 0].sort_values(ascending=False)

In [None]:
# Run the project preprocessing helpers on the raw frames.
# The raw inputs are kept under their `*_orig` names; the results get new names.
data_x = preprocess_x(data_x_orig)
# NOTE(review): preprocess_y also receives the raw features -- presumably it
# needs them to derive or align the targets; confirm in preprocessing.py.
data_y = preprocess_y(data_x_orig, data_y_orig)

In [None]:
# Map every value of the "Trader" column to an integer id; the resulting
# `groups` array has one entry per row of the raw feature frame.
from sklearn.preprocessing import LabelEncoder

trader_names = data_x_orig["Trader"].to_numpy()

# Keep the fitted encoder around so ids can be mapped back to trader names.
trader_encoder = LabelEncoder()
groups = trader_encoder.fit_transform(trader_names)

In [None]:
# Convert the preprocessed features and targets to plain numpy arrays.
X, y = data_x.to_numpy(), data_y.to_numpy()

## Model Selection

In [None]:
from scipy import stats
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier

# Search space: a list of dicts, one per candidate classifier head.
# Each dict swaps the pipeline's "clf" step for the given estimator and says
# which of that estimator's hyper-parameters ("clf__<name>") to sample.
params = [
    {
        # NOTE(review): with the default outlier_label=None, prediction raises
        # when a sample has no neighbour within the radius -- small radii may
        # produce failed fits; consider setting outlier_label.
        "clf": [RadiusNeighborsClassifier()],
        "clf__radius": stats.loguniform(1, 1e2)
    }, {
        "clf": [ExtraTreesClassifier(max_features="sqrt")],
        # n_estimators must be an integer. loguniform draws floats, which the
        # estimator rejects, so sample integers instead. randint's upper bound
        # is exclusive, hence 201 to keep 200 reachable.
        "clf__n_estimators": stats.randint(50, 201)
    }, {
        "clf": [SGDClassifier()],
        "clf__class_weight": [None, "balanced"]
    }
]

In [None]:
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Processing chain, applied to the data in the order listed:
# scaling, then dimensionality reduction, then the classifier head.
pipeline_steps = [
    ("standardisation", StandardScaler()),
    ("reduce_dim", PCA(n_components='mle')),
    # Default head; the "clf" step name is what the search space refers to.
    ("clf", SGDClassifier()),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Split the dataset into training and validation folds by trader: GroupKFold
# keeps all rows sharing a group id (here, a trader id) in the same fold.
# Pass the splitter object itself rather than the generator returned by
# `.split(X, y, groups)`: a generator is consumed by the first `fit`, which
# would leave `search` with zero folds on any re-fit.
gkf = GroupKFold(n_splits=5)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=10,
    cv=gkf,
    scoring="balanced_accuracy", # TODO is this the best score ? Shouldn't we implement our own method ?
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    # Fix the seed used to sample candidate parameters so a re-run draws the
    # same candidates. The classifiers themselves are still unseeded.
    random_state=0,
)

# `groups` is forwarded to the GroupKFold splitter to build the folds.
search.fit(X, y, groups=groups)