In [1]:
import numpy as np
import pandas as pd
import sys
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid

sys.path.append("../")
import mlflow
from src.models.train_models import run_ml_flow_experiment

import warnings
warnings.filterwarnings('ignore')

from config.definitions import ROOT_DIR, RAW_DATA_PATH, TRAIN_PATH, TEST_PATH

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
TARGET = 'result'
DROP_COLS = ['index', 'white', 'black', 'date', 'id', 'name', 'start_date', 'end_date', 'time_control']
CATEGORICAL_PROCESSED = ['white_latin', 'black_latin']

params = {}
params['lr'] = list(ParameterGrid({'class_weight':[None, 'balanced'],'penalty':['l2','none'], 'C':[0.1,1, 2]}))
params['rf'] = list(ParameterGrid({'class_weight':[None, 'balanced'], 'n_estimators':[50, 100],'min_samples_leaf':[1,50], 'max_depth':[None, 100]}))
params['lgb'] = list(ParameterGrid({'class_weight':[None, 'balanced'], 'n_estimators':[500, 1000], 'num_leaves':[31, 100]}))

## Experiment 1 - scale features and fit the model

In [4]:
# load the data and do a train/test split
with open('../data/processed/df_processed.pickle', 'rb') as handle:
    df_processed = pickle.load(handle)

df_processed[TARGET] = (df_processed[TARGET] * 2).astype(int)

X_train, X_test, y_train, y_test = train_test_split(df_processed.drop(TARGET,axis=1), df_processed[TARGET], test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

In [5]:
NUM_FEATURES = X_train.drop(DROP_COLS+CATEGORICAL_PROCESSED,axis=1).columns.to_list()

In [6]:
minmax_transformer = Pipeline(steps=[('minmax', MinMaxScaler())])
minmax_transformer.fit(X_train[NUM_FEATURES])

with open('../models/utils/minmax_transformer.pickle', 'wb') as f:
    pickle.dump(minmax_transformer, f)

In [7]:
X_train_scaled = minmax_transformer.transform(X_train[NUM_FEATURES])
X_val_scaled = minmax_transformer.transform(X_val[NUM_FEATURES])
X_test_scaled = minmax_transformer.transform(X_test[NUM_FEATURES])

In [8]:
## track experiments on http://localhost:5000/
# !mlflow ui

In [9]:
run_ml_flow_experiment(X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled, 
                       y_train=y_train, y_val=y_val, y_test=y_test, 
                       params=params, 
                       experiment_name='minmax_experiment')

# try lr with only several features that are highly correlated with target
corr = df_processed.corr()
lr_feat = corr[(abs(corr['result'])>0.03) & (abs(corr['result'])<0.8)]['result'].index.to_list()
lr_feat_idx = [df_processed[NUM_FEATURES].columns.to_list().index(x) for x in lr_feat]
print('features: ' ,lr_feat)

run_ml_flow_experiment(X_train=X_train_scaled[:,lr_feat_idx], 
                       X_val=X_val_scaled[:,lr_feat_idx], 
                       X_test=X_test_scaled[:,lr_feat_idx], 
                       y_train=y_train, 
                       y_val=y_val, 
                       y_test=y_test, 
                       params=params, 
                       experiment_name='minmax_experiment', run_section='lr', model_desc=' only corr features')

model: logistic regression, accuracy:  0.42941649437169765 f1:  0.3832390940833052 
 params:  {'C': 0.1, 'class_weight': None, 'penalty': 'l2'}
model: logistic regression, accuracy:  0.42924419940271075 f1:  0.3833753273520439 
 params:  {'C': 0.1, 'class_weight': None, 'penalty': 'none'}
model: logistic regression, accuracy:  0.39490006891798757 f1:  0.3927807317179144 
 params:  {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
model: logistic regression, accuracy:  0.3961635653572249 f1:  0.3941598116762142 
 params:  {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'none'}
model: logistic regression, accuracy:  0.42953135768435563 f1:  0.38362482354027955 
 params:  {'C': 1, 'class_weight': None, 'penalty': 'l2'}
model: logistic regression, accuracy:  0.42924419940271075 f1:  0.3833753273520439 
 params:  {'C': 1, 'class_weight': None, 'penalty': 'none'}
model: logistic regression, accuracy:  0.39622099701355384 f1:  0.3942131837999628 
 params:  {'C': 1, 'class_weight': 'ba

In [10]:
# initial metric is a bit better than random guessing
best_runs = mlflow.search_runs(experiment_names=['minmax_experiment'], order_by=["metrics.f1 DESC"])
best_runs[['run_id','tags.mlflow.runName', 'metrics.recall','metrics.precision','metrics.f1','metrics.accuracy', 'params.n_estimators','params.class_weight', 'params.num_leaves', 'params.min_samples_leaf', 'params.max_depth', 'params.C', 'params.penalty']].head()

Unnamed: 0,run_id,tags.mlflow.runName,metrics.recall,metrics.precision,metrics.f1,metrics.accuracy,params.n_estimators,params.class_weight,params.num_leaves,params.min_samples_leaf,params.max_depth,params.C,params.penalty
0,a31b565442d54f30bd9cf3336de90fc0,lgb__7,0.438574,0.434519,0.434521,0.436883,1000,balanced,100.0,,,,
1,0000928522aa46e5bf62fef453937fa3,lgb__5,0.438574,0.434519,0.434521,0.436883,500,balanced,100.0,,,,
2,3332640326ca49e289d3ccfb37b69e82,lgb__6,0.438212,0.433629,0.43403,0.436021,1000,balanced,31.0,,,,
3,fdcd3260657c4bc8b7ef2409164052d9,lgb__4,0.438212,0.433629,0.43403,0.436021,500,balanced,31.0,,,,
4,f491535da6634f3f825c7b2c674849ad,rf__15,0.431573,0.427801,0.426558,0.4289,100,balanced,,50.0,100.0,,


## Experiment 2 - adding polynomial features

In [11]:
POLY_FEATURES = ['white_rank_est', 'black_rank_est', 'white_rank_chg', 'black_rank_chg', 'rank_diff']
poly_transformer = Pipeline(steps=[('polynomial', PolynomialFeatures(2))])

minmax_poly_tranformer = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num_scaler', minmax_transformer , NUM_FEATURES),
            ('poly', poly_transformer, POLY_FEATURES)
        ])

minmax_poly_tranformer.fit(X_train[NUM_FEATURES])

with open('../models/utils/minmax_poly_tranformer.pickle', 'wb') as f:
    pickle.dump(minmax_poly_tranformer, f)

In [12]:
X_train_scaled = minmax_poly_tranformer.transform(X_train[NUM_FEATURES])
X_val_scaled = minmax_poly_tranformer.transform(X_val[NUM_FEATURES])
X_test_scaled = minmax_poly_tranformer.transform(X_val[NUM_FEATURES])

In [13]:
# due to num of features, logreg will be skipped
run_ml_flow_experiment(X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled, 
                       y_train=y_train, y_val=y_val, y_test=y_test, 
                       params=params, 
                       experiment_name='minmax_poly_experiment', run_section=['rf','lgb'])

model: random forest  accuracy:  0.3426946933149552 f1:  0.3328599013512152 
 params:  {'class_weight': None, 'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 50}
model: random forest  accuracy:  0.3440156214105215 f1:  0.33410437716872615 
 params:  {'class_weight': None, 'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 100}
model: random forest  accuracy:  0.36905582356995176 f1:  0.3188357741352484 
 params:  {'class_weight': None, 'max_depth': None, 'min_samples_leaf': 50, 'n_estimators': 50}
model: random forest  accuracy:  0.36693085228577993 f1:  0.3156672977610953 
 params:  {'class_weight': None, 'max_depth': None, 'min_samples_leaf': 50, 'n_estimators': 100}
model: random forest  accuracy:  0.3426946933149552 f1:  0.3328599013512152 
 params:  {'class_weight': None, 'max_depth': 100, 'min_samples_leaf': 1, 'n_estimators': 50}
model: random forest  accuracy:  0.3440156214105215 f1:  0.33410437716872615 
 params:  {'class_weight': None, 'max_depth': 100, 'min_

In [14]:
# adding new features hurts the accuracy
best_runs = mlflow.search_runs(experiment_names=['minmax_poly_experiment'], order_by=["metrics.f1 DESC"])
best_runs[['run_id','tags.mlflow.runName', 'metrics.recall','metrics.precision','metrics.f1','metrics.accuracy', 'params.n_estimators','params.class_weight', 'params.num_leaves', 'params.min_samples_leaf', 'params.max_depth']].head()

Unnamed: 0,run_id,tags.mlflow.runName,metrics.recall,metrics.precision,metrics.f1,metrics.accuracy,params.n_estimators,params.class_weight,params.num_leaves,params.min_samples_leaf,params.max_depth
0,8c2372a3924a4ba08d48c459f97cfa87,lgb__7,0.339184,0.338478,0.337208,0.342924,1000,balanced,100.0,,
1,b1ded2ee9274487ea3ad25eb0959a63a,lgb__5,0.339184,0.338478,0.337208,0.342924,500,balanced,100.0,,
2,0154ddbb4bfe47738217c704d36186ce,lgb__6,0.338057,0.337439,0.33659,0.341833,1000,balanced,31.0,,
3,97351c3cb35846218adcbd2bc9e947d1,lgb__4,0.338057,0.337439,0.33659,0.341833,500,balanced,31.0,,
4,9b27a5e124b644ee84b6881afd72802d,rf__14,0.337936,0.336713,0.335606,0.339708,50,balanced,,50.0,100.0


## Experiment 3 - adding encoding for categorical features

In [15]:
te = Pipeline(steps=[('target_enc', TargetEncoder(CATEGORICAL_PROCESSED, return_df=False))])

minmax_te_tranformer = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num_scaler', minmax_transformer , NUM_FEATURES),
            ('te', te, CATEGORICAL_PROCESSED)
        ])

minmax_te_tranformer.fit(X_train[NUM_FEATURES + CATEGORICAL_PROCESSED], y_train)

with open('../models/utils/minmax_te_tranformer.pickle', 'wb') as f:
    pickle.dump(minmax_te_tranformer, f)

In [16]:
X_train_scaled = minmax_te_tranformer.transform(X_train[NUM_FEATURES + CATEGORICAL_PROCESSED])
X_val_scaled = minmax_te_tranformer.transform(X_val[NUM_FEATURES + CATEGORICAL_PROCESSED])
X_test_scaled = minmax_te_tranformer.transform(X_test[NUM_FEATURES + CATEGORICAL_PROCESSED])

In [None]:
run_ml_flow_experiment(X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled, 
                       y_train=y_train, y_val=y_val, y_test=y_test, 
                       params=params, 
                       experiment_name='minmax_te_experiment')

In [24]:
# much better results on test set obtained by encoding players
best_runs = mlflow.search_runs(experiment_names=['minmax_te_experiment'], order_by=["metrics.f1 DESC", "metrics.accuracy DESC"])
best_runs[['run_id','tags.mlflow.runName', 'metrics.recall','metrics.precision','metrics.f1','metrics.accuracy', 'params.n_estimators','params.class_weight', 'params.min_samples_leaf', 'params.max_depth']].head()

Unnamed: 0,run_id,tags.mlflow.runName,metrics.recall,metrics.precision,metrics.f1,metrics.accuracy,params.n_estimators,params.class_weight,params.min_samples_leaf,params.max_depth
0,5e3e90d2e5664b71a42344ffb7940040,lgb__6,0.513309,0.503608,0.506779,0.505054,1000,balanced,,
1,28b188013ed8421c8e968bfdcef49231,lgb__4,0.513309,0.503608,0.506779,0.505054,500,balanced,,
2,6281cf07127b4418b6c6a90efadf1919,rf__7,0.498475,0.533558,0.506752,0.514186,100,,50.0,100.0
3,45564be8c6844f6e81182316c66cce42,rf__3,0.498475,0.533558,0.506752,0.514186,100,,50.0,
4,56e7b27425854e85b8daae74f37e5828,lgb__3,0.498176,0.535345,0.506368,0.51499,1000,,,


## Experiment 4 - combine poly with encoding for categorical features

In [None]:
te = Pipeline(steps=[('target_enc', TargetEncoder(CATEGORICAL_PROCESSED, return_df=False))])

minmax_poly_ohe_tranformer = ColumnTransformer(
        remainder='passthrough',
        transformers=[
            ('num_scaler', minmax_transformer , NUM_FEATURES),
            ('poly', poly_transformer, NUM_FEATURES),
            ('te', te, CATEGORICAL_PROCESSED)
        ])

minmax_poly_ohe_tranformer.fit(X_train[NUM_FEATURES + CATEGORICAL_PROCESSED], y_train)

X_train_scaled = minmax_poly_ohe_tranformer.transform(X_train[NUM_FEATURES + CATEGORICAL_PROCESSED])
X_val_scaled = minmax_poly_ohe_tranformer.transform(X_val[NUM_FEATURES + CATEGORICAL_PROCESSED])
X_test_scaled = minmax_poly_ohe_tranformer.transform(X_test[NUM_FEATURES + CATEGORICAL_PROCESSED])

with open('../models/utils/minmax_poly_te_tranformer.pickle', 'wb') as f:
    pickle.dump(minmax_poly_ohe_tranformer, f)

run_ml_flow_experiment(X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled, 
                       y_train=y_train, y_val=y_val, y_test=y_test, 
                       params=params, 
                       experiment_name='minmax_poly_te_experiment', run_section=['rf', 'lgb'])

In [20]:
# no improvement 
best_runs = mlflow.search_runs(experiment_names=['minmax_poly_te_experiment'], order_by=["metrics.f1 DESC"])
best_runs[['run_id','tags.mlflow.runName', 'metrics.recall','metrics.precision','metrics.f1','metrics.accuracy', 'params.n_estimators','params.class_weight', 'params.min_samples_leaf', 'params.max_depth']].head()

Unnamed: 0,run_id,tags.mlflow.runName,metrics.recall,metrics.precision,metrics.f1,metrics.accuracy,params.n_estimators,params.class_weight,params.min_samples_leaf,params.max_depth
0,56b495c7b49b4892b991dc9610457e59,lgb__2,0.498383,0.535802,0.506756,0.514588,1000,,,
1,91ea0dc1a4cf404ca724933ded87d661,lgb__0,0.498383,0.535802,0.506756,0.514588,500,,,
2,f7c6de7ee81a446fa591ab63c49984dc,lgb__6,0.512321,0.502787,0.505959,0.504652,1000,balanced,,
3,af2fb3e3575a4f95b6906e1c9395b4cd,lgb__4,0.512321,0.502787,0.505959,0.504652,500,balanced,,
4,959c5460bc6a45c497d930737a0a9c1a,lgb__3,0.497301,0.535239,0.505708,0.513669,1000,,,


## Get best run and predict test data

In [22]:
best_runs = mlflow.search_runs(experiment_names=['minmax_te_experiment'], order_by=["metrics.f1 DESC"])
best_runs[['run_id','tags.mlflow.runName', 'metrics.recall','metrics.precision','metrics.f1','metrics.accuracy', 'params.n_estimators','params.class_weight', 'params.min_samples_leaf', 'params.max_depth']].head()
# best run from Experiment 3
mlflow.get_run(best_runs['run_id'].iloc[0]).data.to_dictionary()['params']

{'class_weight': 'balanced',
 'model': 'lightgbm',
 'num_leaves': '100',
 'n_estimators': '1000'}

In [32]:
# for easier implementation we'll use rf that had simmilar performance as lgb model
# format parameters, chain into pipeline and save
p = mlflow.get_run('6281cf07127b4418b6c6a90efadf1919').data.to_dictionary()['params']
p.pop('model')
p.pop('class_weight')
for i in p.keys():
    try:
        p[i] = int(p[i])
    except:
        pass

X = pd.concat([X_train[NUM_FEATURES + CATEGORICAL_PROCESSED], X_val[NUM_FEATURES + CATEGORICAL_PROCESSED]])
y = pd.concat([y_train, y_val])

from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([('preprocess', minmax_te_tranformer), ('rf', RandomForestClassifier(random_state=1, n_jobs=4, **p))])

pipeline.fit(X, y)
with open('../models/minmax_te_model.pickle', 'wb') as f:
    pickle.dump({'model':pipeline, 'features':NUM_FEATURES + CATEGORICAL_PROCESSED}, f)


In [33]:
# feature importance
pd.Series(dict(zip(NUM_FEATURES + CATEGORICAL_PROCESSED, pipeline.steps[1][1].feature_importances_))).sort_values()

black_game_n          0.011187
white_game_n          0.013375
tour_days_passed      0.016534
tours                 0.017287
tour_length_days      0.019693
tour_round            0.020010
black_rank_est        0.026612
black_rank_chg        0.029905
white_rank_est        0.032642
white_rank_chg        0.038830
tour_game             0.041629
rank_diff             0.073964
time_control_rapid    0.078017
white_latin           0.282161
black_latin           0.298154
dtype: float64

In [None]:
from src.models.predict import predict_new_cases, update_json_w_pred
pred = predict_new_cases(model_name='minmax_te', data_path="test")

In [68]:
update_json_w_pred(df=pred, rel_path="test")