In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
import os
import seaborn as sns
import numpy as np
from joblib import dump, load

In [2]:
# Fix the random seeds up front so every later stochastic step is repeatable.
np.random.seed(2308723)
random_state = 84738

# Where the training CSV and the serialized models live, relative to this notebook.
data_folder = "../data"
model_folder = "../models"
train_file = "train_data.csv"

# Read the training set into a DataFrame.
train_data = pd.read_csv(
    os.path.join(data_folder, train_file)
)

In [3]:
# Every column except the last is a feature; the last column is the target.
X_train, y_train = (
    train_data.values[:, :-1],
    train_data.values[:, -1],
)

In [4]:
from sklearn.decomposition import PCA

# Show the dimensions of the raw feature matrix.
print(X_train.shape)

# Project the unscaled features onto their first two principal components.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)

(14318, 8)


In [5]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean/variance on the whole training matrix, then
# standardize that same matrix with the fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [8]:
# Load every serialized estimator found in model_folder into a dict keyed by
# the file name up to its first dot.
# NOTE(review): joblib's load() unpickles arbitrary Python objects, so
# model_folder must only ever contain files from a trusted source.
models = {}
# os.listdir returns entries in arbitrary order; sorting keeps the dict order
# (and the order of the ensemble combinations built from it) stable across
# machines and runs.
for model_file in sorted(os.listdir(model_folder)):
    model_filepath = os.path.join(model_folder, model_file)
    # Skip sub-directories and hidden files (e.g. .gitkeep, .DS_Store): they
    # are not serialized models and would make load() raise.
    if model_file.startswith(".") or not os.path.isfile(model_filepath):
        continue
    model_name = model_file.split(".")[0]
    model = load(model_filepath)
    models[model_name] = model

In [15]:
from sklearn.model_selection import cross_validate

def get_cv_scores(model_name, use_scaled=True):
    """Cross-validate one of the loaded models on the training set.

    Parameters
    ----------
    model_name : str
        Key into the notebook-level ``models`` dict.
    use_scaled : bool, default True
        If True, evaluate on ``X_train_scaled``; if False, evaluate on the raw
        ``X_train``. Models whose name contains ``"rf"`` are always evaluated
        on the raw ``X_train``, regardless of this flag.

    Returns
    -------
    dict
        The result of ``cross_validate`` with 5 folds and the ``"f1"`` and
        ``"accuracy"`` scorers (per-fold arrays under ``"test_f1"`` and
        ``"test_accuracy"``, plus timing entries).

    Raises
    ------
    KeyError
        If ``model_name`` is not a key of ``models``.

    Notes
    -----
    ``X_train_scaled`` comes from a scaler fitted on the whole of ``X_train``,
    so the held-out fold of each split has already contributed to the scaling
    statistics.
    """
    # Honour use_scaled (it was previously accepted but ignored). The local is
    # named `features` so it does not shadow the notebook-level `train_data`.
    if not use_scaled or "rf" in model_name:
        features = X_train
    else:
        features = X_train_scaled
    model = models[model_name]
    cv_scores = cross_validate(model, features, y_train, scoring=["f1", "accuracy"], verbose=1, n_jobs=-1, cv=5)
    return cv_scores

In [16]:
# Show the names under which the loaded models were registered.
models.keys()

dict_keys(['adab_clf', 'log_reg', 'rf_best', 'svc', 'xgb'])

In [19]:
# Spot-check the full cross-validation result (timings and per-fold scores) for the "xgb" model.
get_cv_scores("xgb")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   33.1s finished


{'fit_time': array([16.60644817, 16.55846334, 16.63048053, 16.92456365, 14.01868105]),
 'score_time': array([0.31401443, 0.33800697, 0.39705086, 0.3509872 , 0.21794176]),
 'test_f1': array([0.90485437, 0.87698413, 0.90019569, 0.89788054, 0.87058824]),
 'test_accuracy': array([0.98289106, 0.97835196, 0.98219274, 0.98148795, 0.97694726])}

In [20]:
# Report the mean cross-validated F1 of every loaded model.
for model_name in models:
    cv_score = get_cv_scores(model_name)["test_f1"].mean()
    print(f"{model_name} has cv f1_score {cv_score}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   43.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


adab_clf has cv f1_score 0.8872826382437538


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


log_reg has cv f1_score 0.8791640563340133


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   46.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


rf_best has cv f1_score 0.885586915895256


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


svc has cv f1_score 0.8861500309538975
xgb has cv f1_score 0.8901005930851124


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   30.8s finished


# Voting Classifier

In [37]:
from itertools import combinations
from sklearn.ensemble import VotingClassifier

# Each entry is a list of (name, estimator) pairs describing one ensemble.
estimators_combinations = []
# Maps "name_a + name_b + ..." to the per-fold F1 scores of that ensemble.
voting_perf_dict = {}

# Ensembles of 3 and of 5 base models (presumably odd sizes so that a hard
# majority vote cannot tie -- confirm).
for nb_estimators in (3, 5):
    for estimator_names in combinations(models.keys(), nb_estimators):
        estimators_combinations.append([(name, models[name]) for name in estimator_names])

for combination in estimators_combinations:
    combination_names = " + ".join(name for name, _ in combination)
    voting_clf = VotingClassifier(combination, voting='hard')
    # Pass X_train_scaled directly rather than rebinding `train_data`: that
    # name holds the DataFrame loaded at the top of the notebook, and
    # overwriting it with an array breaks any re-run of the earlier cells.
    # All members, including "rf_best", see the scaled features here.
    cv_scores = cross_validate(voting_clf, X_train_scaled, y_train, scoring=["f1", "accuracy"], verbose=1, n_jobs=-1, cv=5)
    voting_perf_dict[combination_names] = cv_scores['test_f1']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   47.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5

In [38]:
# Per-fold F1 scores of each voting ensemble.
voting_perf_dict

{'adab_clf + log_reg + rf_best': array([0.90039841, 0.87626775, 0.89068826, 0.9015748 , 0.87250996]),
 'adab_clf + log_reg + svc': array([0.90140845, 0.87169043, 0.89205703, 0.90335306, 0.87374749]),
 'adab_clf + log_reg + xgb': array([0.90551181, 0.87726358, 0.89820359, 0.8984375 , 0.87301587]),
 'adab_clf + rf_best + svc': array([0.9       , 0.8699187 , 0.88617886, 0.90118577, 0.8742515 ]),
 'adab_clf + rf_best + xgb': array([0.9015748 , 0.87272727, 0.896     , 0.89980354, 0.87524752]),
 'adab_clf + svc + xgb': array([0.90513834, 0.87323944, 0.89378758, 0.90410959, 0.87475149]),
 'log_reg + rf_best + svc': array([0.90180361, 0.87346939, 0.89655172, 0.90373281, 0.872     ]),
 'log_reg + rf_best + xgb': array([0.8972332 , 0.87626775, 0.89738431, 0.90019569, 0.87475149]),
 'log_reg + svc + xgb': array([0.90180361, 0.87676768, 0.89655172, 0.90196078, 0.872     ]),
 'rf_best + svc + xgb': array([0.8972332 , 0.87044534, 0.89738431, 0.90196078, 0.87475149]),
 'adab_clf + log_reg + rf_best +

In [40]:
# Replace each ensemble's per-fold F1 array with its mean, in place.
for key, fold_scores in list(voting_perf_dict.items()):
    voting_perf_dict[key] = np.mean(fold_scores)

In [41]:
# Mean cross-validated F1 of each voting ensemble.
voting_perf_dict

{'adab_clf + log_reg + rf_best': 0.8882878354542969,
 'adab_clf + log_reg + svc': 0.8884512914137137,
 'adab_clf + log_reg + xgb': 0.89048647166856,
 'adab_clf + rf_best + svc': 0.8863069657465171,
 'adab_clf + rf_best + xgb': 0.8890706273950262,
 'adab_clf + svc + xgb': 0.8902052863571482,
 'log_reg + rf_best + svc': 0.8895115057075434,
 'log_reg + rf_best + xgb': 0.889166488332932,
 'log_reg + svc + xgb': 0.8898167584867525,
 'rf_best + svc + xgb': 0.8883550253825991,
 'adab_clf + log_reg + rf_best + svc + xgb': 0.8885524729493752}

# Stacking Classifier

In [43]:
from sklearn.ensemble import StackingClassifier

# Each entry is a list of (name, estimator) pairs describing one ensemble.
estimators_combinations = []
# Maps "name_a + name_b + ..." to the mean cross-validated F1 of that ensemble.
stacking_perf_dict = {}

# Every subset of the loaded models with at least two members.
for nb_estimators in [2, 3, 4, 5]:
    for estimator_names in combinations(models.keys(), nb_estimators):
        estimators_combinations.append([(name, models[name]) for name in estimator_names])

print(f'{len(estimators_combinations)} combinations to go')

for i, combination in enumerate(estimators_combinations):
    combination_names = " + ".join(name for name, _ in combination)
    print(f'combination {i+1} is {combination_names}')
    # final_estimator and cv are left at the library defaults. Per the
    # scikit-learn docs the stacker runs its own internal cross-validation on
    # each outer training fold, which is why these runs are slow.
    stacking_clf = StackingClassifier(combination)
    # Pass X_train_scaled directly rather than rebinding `train_data`: that
    # name holds the DataFrame loaded at the top of the notebook, and
    # overwriting it with an array breaks any re-run of the earlier cells.
    cv_scores = cross_validate(stacking_clf, X_train_scaled, y_train, scoring=["f1", "accuracy"], verbose=1, n_jobs=-1, cv=5)
    stacking_perf_dict[combination_names] = np.mean(cv_scores['test_f1'])

26 combinations to go
combination 1 is adab_clf + log_reg


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 2 is adab_clf + rf_best


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 3 is adab_clf + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 4 is adab_clf + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 5 is log_reg + rf_best


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 6 is log_reg + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 7 is log_reg + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 8 is rf_best + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 9 is rf_best + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 10 is svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 11 is adab_clf + log_reg + rf_best


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 12 is adab_clf + log_reg + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 13 is adab_clf + log_reg + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 14 is adab_clf + rf_best + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 15 is adab_clf + rf_best + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 16 is adab_clf + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 17 is log_reg + rf_best + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 18 is log_reg + rf_best + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 19 is log_reg + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 20 is rf_best + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 21 is adab_clf + log_reg + rf_best + svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 22 is adab_clf + log_reg + rf_best + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 23 is adab_clf + log_reg + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 24 is adab_clf + rf_best + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 25 is log_reg + rf_best + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


combination 26 is adab_clf + log_reg + rf_best + svc + xgb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  9.8min finished


In [44]:
# Mean cross-validated F1 of each stacking ensemble.
stacking_perf_dict

{'adab_clf + log_reg': 0.882403703466452,
 'adab_clf + rf_best': 0.8856326910908354,
 'adab_clf + svc': 0.8838541904246288,
 'adab_clf + xgb': 0.8863949654360166,
 'log_reg + rf_best': 0.8832441939662494,
 'log_reg + svc': 0.881914854267168,
 'log_reg + xgb': 0.8885371170427085,
 'rf_best + svc': 0.8833301471911014,
 'rf_best + xgb': 0.8862320066473058,
 'svc + xgb': 0.8868883572447643,
 'adab_clf + log_reg + rf_best': 0.883595173891232,
 'adab_clf + log_reg + svc': 0.8848273776121849,
 'adab_clf + log_reg + xgb': 0.8847415232425506,
 'adab_clf + rf_best + svc': 0.884270462568351,
 'adab_clf + rf_best + xgb': 0.8856546793321112,
 'adab_clf + svc + xgb': 0.8858265979440484,
 'log_reg + rf_best + svc': 0.8835678562940773,
 'log_reg + rf_best + xgb': 0.8861101919665675,
 'log_reg + svc + xgb': 0.8868520093267145,
 'rf_best + svc + xgb': 0.8857601666881573,
 'adab_clf + log_reg + rf_best + svc': 0.8833800793572377,
 'adab_clf + log_reg + rf_best + xgb': 0.8854010108203838,
 'adab_clf + log

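## Conclusion: no ensemble clearly outperforms the standalone XGBoost model

XGBoost alone reaches a mean cross-validated F1-score of 0.8901. Two hard-voting ensembles edge past it only marginally (`adab_clf + log_reg + xgb`: 0.8905; `adab_clf + svc + xgb`: 0.8902), a gap far smaller than the fold-to-fold spread (roughly 0.87–0.90), and none of the stacking results shown above exceeds it.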