In [1]:
import numpy as np
import pandas as pd
from loading import load_data, load_test
from preprocessing import preprocess_x, preprocess_y

In [2]:
data_x_orig, data_y_orig = load_data() # loads from ".\data" by default
test_x_orig = load_test() # this is for submission purposes

In [3]:
# look at NaN data
s = data_x_orig.isna().sum()/data_x_orig.count()
s[data_x_orig.isna().any()].sort_values(ascending=False)

OMR                0.818498
min_dt_TV1_TV3     0.313622
mean_dt_TV1_TV3    0.313622
med_dt_TV1_TV3     0.313622
min_dt_TV1_TV2     0.312334
mean_dt_TV1_TV2    0.312334
med_dt_TV1_TV2     0.312334
min_dt_TV1_TV4     0.311878
mean_dt_TV1_TV4    0.311878
med_dt_TV1_TV4     0.311878
min_dt_TV1         0.041705
mean_dt_TV1        0.041705
med_dt_TV1         0.041705
OTR                0.016167
dtype: float64

In [4]:
data_x = preprocess_x(data_x_orig)
data_y = preprocess_y(data_x_orig, data_y_orig)

In [5]:
# give ids to traders
from sklearn.preprocessing import LabelEncoder

trader_encoder = LabelEncoder()
groups = trader_encoder.fit_transform(data_x_orig["Trader"].to_numpy())

In [6]:
# send everything to numpy arrays
X = data_x.to_numpy()
y = data_y.to_numpy()

## Model Selection

In [7]:
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

# build a list of dicts that says which classifier heads to test, and what params to test on them
search_params = [
    # # already tested, worse than SGDClassifier(class_weight='balanced', loss='log'):
    # {
    #     "clf": [KNeighborsClassifier()], 
    #     "clf__n_neighbors": np.arange(3,16),
    #     "clf__weights": ['uniform', 'distance']
    # },
    # {
    #     "clf": [ExtraTreesClassifier(max_features="sqrt")],
    #     "clf__n_estimators": np.linspace(50,200, num=50, dtype = int)
    # },
    # {
    #     "clf": [SGDClassifier()],
    #     "clf__class_weight": [None, "balanced"],
    #     "clf__loss": ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron','squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
    # },
    {
        "clf": [GradientBoostingClassifier(max_features='sqrt')],
        "clf__loss": ['deviance', 'exponential'],
        "clf__learning_rate": np.logspace(-3,0, num = 15),
    },
    {
        "clf": [SGDClassifier(class_weight='balanced', loss='log')]
    }
]

In [8]:
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# these estimators will be appled sequentially to the data:
pipe = Pipeline([
    ("standardisation", StandardScaler()),
    # ("reduce_dim", PCA(n_components='mle')), # unnecessary, MLE only removes the last dimension 
    ("clf", SGDClassifier())
])

In [9]:
#split dataset into training and validation by trader
gkf = GroupKFold(n_splits=8).split(X, y, groups)

search = GridSearchCV(
    pipe,
    search_params,
    #n_iter=100,
    cv=gkf,
    scoring="f1_micro",
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    verbose=10
)

search.fit(X, y)
search.best_score_

Fitting 8 folds for each of 31 candidates, totalling 248 fits


0.7317939512566107

In [10]:
search.best_params_

{'clf': GradientBoostingClassifier(learning_rate=0.22758459260747887,
                            max_features='sqrt'),
 'clf__learning_rate': 0.22758459260747887,
 'clf__loss': 'deviance'}

## Postprocessing

In [11]:
X_test = preprocess_x(test_x_orig).to_numpy()
y_pred = search.predict(X_test)

In [12]:
from preprocessing import classes
id_to_classes = dict(enumerate(list(classes)))

In [13]:
traders = test_x_orig["Trader"]
results_raw = pd.Series(y_pred, index=test_x_orig["Trader"]).replace(id_to_classes)
grouped = results_raw.groupby("Trader")
counts = grouped.value_counts().unstack(level=1).fillna(0)
ratios = counts.div(counts.sum(axis="columns"), axis = "index")

In [14]:
# implement the rule
hft_threshold = 0.85
mix_threshold = 0.5

results = pd.Series(["NON HFT" for _ in range(len(counts))], index=counts.index, name ="type")
results[ratios["MIX"] > mix_threshold] = "MIX"
results[ratios["HFT"] > hft_threshold] = "HFT"
results

Trader
Adelaide            NON HFT
Alana               NON HFT
Alcmene             NON HFT
Alice                   HFT
Alices Sister       NON HFT
                     ...   
Monstro                 MIX
Morgana                 MIX
The Doorknob        NON HFT
The Doorman             HFT
The Magic Mirror        MIX
Name: type, Length: 85, dtype: object

In [15]:
results.to_csv("results.csv")

## Additional experiments

In [81]:
# number of components found by the MLE approach:
pca = search.best_estimator_.get_params()["reduce_dim"]
pca.n_components_

34