In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from loading import load_data, load_test
from preprocessing import preprocess_x, preprocess_y

In [2]:
data_x_orig, data_y_orig = load_data() # loads from ".\data" by default
test_x_orig = load_test() # this is for submission purposes

In [3]:
# look at missing data
s = data_x_orig.isna().sum()/data_x_orig.count()
s[data_x_orig.isna().any()].sort_values(ascending=False)

OMR                0.818498
min_dt_TV1_TV3     0.313622
mean_dt_TV1_TV3    0.313622
med_dt_TV1_TV3     0.313622
min_dt_TV1_TV2     0.312334
mean_dt_TV1_TV2    0.312334
med_dt_TV1_TV2     0.312334
min_dt_TV1_TV4     0.311878
mean_dt_TV1_TV4    0.311878
med_dt_TV1_TV4     0.311878
min_dt_TV1         0.041705
mean_dt_TV1        0.041705
med_dt_TV1         0.041705
OTR                0.016167
dtype: float64

In [4]:
data_x = preprocess_x(data_x_orig)
data_y = preprocess_y(data_x_orig, data_y_orig)

In [5]:
# give ids to traders
from sklearn.preprocessing import LabelEncoder

trader_encoder = LabelEncoder()
groups = trader_encoder.fit_transform(data_x_orig["Trader"].to_numpy())

In [6]:
# send everything to numpy arrays
X = data_x.to_numpy()
y = data_y.to_numpy()

## Model Selection

In [7]:
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier

# build a list of dicts that says which classifier heads to test, and what params to test on them
params = [
    {
        "clf": [KNeighborsClassifier()], 
        "clf__n_neighbors": stats.randint(1, 16)
    },
    {
        "clf": [ExtraTreesClassifier(max_features="sqrt")],
        "clf__n_estimators": stats.loguniform(50,200)
    },
    {
        "clf": [SGDClassifier()],
        "clf__class_weight": [None, "balanced"]
    }
]

In [8]:
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# these estimators will be appled sequentially to the data:
pipe = Pipeline([
    ("standardisation", StandardScaler()),
    ("reduce_dim", PCA(n_components='mle')),
    ("clf", SGDClassifier())
])

In [17]:
#split dataset into training and validation by trader
gkf = GroupKFold(n_splits=8).split(X, y, groups)

search = RandomizedSearchCV(
    n_iter=100,
    cv=gkf,
    estimator=pipe,
    param_distributions=params,
    scoring="balanced_accuracy", # TODO is this the best score ? Shouldn't we implement our own method ?
    n_jobs=12,
    pre_dispatch="2*n_jobs",
    verbose=2
)

search.fit(X, y)
search.best_score_

Fitting 8 folds for each of 100 candidates, totalling 800 fits
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:   34.0s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  1.7min
[Parallel(n_jobs=12)]: Done 341 tasks      | elapsed:  3.9min
[Parallel(n_jobs=12)]: Done 624 tasks      | elapsed:  8.5min
[Parallel(n_jobs=12)]: Done 800 out of 800 | elapsed: 10.7min finished


0.7240291157494414

In [19]:
search.best_params_

{'clf': SGDClassifier(class_weight='balanced'),
 'clf__class_weight': 'balanced'}