In [1]:
# -*- coding: UTF-8 -*-
"""Prediction of survival at the Titanic with GradientBoosting classifier"""
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score, RepeatedKFold, RepeatedStratifiedKFold, cross_validate
from sklearn import metrics
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,  HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
def exec_time(start:float, end:float) -> float:
    """Measures code runtime, returns value in milliseconds rounded to 4 decimal places."""
    diff_time = (end - start) * 1000
    return round(diff_time, 4)

In [3]:
def get_metrics(y_test, y_pred) -> map:
    """Calculates the main model metrics - accuracy, balanced-Accuracy, recall, precision`, f1_score and returns them as a map object"""
    accuracy = metrics.accuracy_score(y_test, y_pred)
    balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f_1 = metrics.f1_score(y_test, y_pred)
    roc_auc = metrics.roc_auc_score(y_test, y_pred)
    metrics = map(lambda element: round(element, 4), (accuracy, balanced_accuracy, precision, recall, f_1, roc_auc))
    return metrics

In [5]:
# Define test classification dataset
X, y = make_classification(n_samples=5000, n_features=20, n_informative=5, n_redundant=2, random_state=12)
# summarize the dataset
print(X.shape, y.shape)
# Split for train/test data as 80% and 20 %
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)
X, y

(5000, 20) (5000,)


(array([[ 1.79818937e+00,  4.53015277e+00,  4.22403615e-03, ...,
          1.02597489e+00,  4.27126411e+00,  6.61644026e-01],
        [ 6.93138828e-01, -1.76468266e+00,  2.30285684e-01, ...,
         -1.59501947e+00,  1.76354883e+00, -4.27768545e-01],
        [ 6.58166152e-01, -3.19425213e+00,  8.05843890e-02, ...,
         -6.28951404e-01, -2.99132020e+00, -9.92600787e-02],
        ...,
        [-8.78901958e-01,  2.19683613e+00, -1.71063685e+00, ...,
          1.65315937e-01,  1.89641658e+00, -5.91270778e-01],
        [ 1.89973102e+00, -1.31447362e+00, -5.56524415e-01, ...,
         -1.63908172e+00, -1.10436920e+00,  1.47530500e+00],
        [-5.20090832e-01, -2.64398607e+00, -1.22005814e+00, ...,
         -1.82798431e-01, -2.30324798e+00,  4.37861623e-01]]),
 array([1, 1, 1, ..., 0, 1, 1]))

In [6]:
# instance the classification models
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
hist_gb = HistGradientBoostingClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
catgb = CatBoostClassifier(verbose=0, n_estimators=200)
mlp = MLPClassifier(max_iter=1000)

models = (rf, gb, hist_gb, xgb, lgb, catgb, mlp)

models

(RandomForestClassifier(),
 GradientBoostingClassifier(),
 HistGradientBoostingClassifier(),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, gamma=None,
               gpu_id=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, n_estimators=100, n_jobs=None,
               num_parallel_tree=None, predictor=None, random_state=None,
               reg_alpha=None, reg_lambda=None, ...),
 LGBMClassifier(),
 <catboost.core.CatBoostClassifier at 0x17b2a4670>,
 MLPClassifier(max_iter=1000))

In [7]:
def make_cross_validation(estimator, X_train, y_train, cv=5):
    scorings = ('accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall',  'roc_auc')
    scores = cross_validate(estimator, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorings)
    scores.items()
    final_metrics = dict()
    for key, item in sorted(scores.items()):
        # print(f'{key} is {item.mean().round(4)}')
        final_metrics[key] = item.mean().round(4)
    return final_metrics

In [8]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=12)
data = []
for mdl in models:
    report_dict = make_cross_validation(mdl, X_train, y_train, cv=cv)
    data.append(report_dict)
df_report = pd.DataFrame(data, index=models)
df_report

Unnamed: 0,fit_time,score_time,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_roc_auc
RandomForestClassifier(),1.4515,0.0542,0.9268,0.9268,0.9273,0.9286,0.9263,0.9753
GradientBoostingClassifier(),2.9569,0.0105,0.9279,0.9279,0.9282,0.9322,0.9245,0.9762
HistGradientBoostingClassifier(),0.6874,0.0327,0.9339,0.9339,0.9344,0.9361,0.9328,0.981
"XGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, gamma=None,\n gpu_id=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n max_leaves=None, min_child_weight=None, missing=nan,\n monotone_constraints=None, n_estimators=100, n_jobs=None,\n num_parallel_tree=None, predictor=None, random_state=None,\n reg_alpha=None, reg_lambda=None, ...)",2.3399,0.0187,0.9334,0.9335,0.9338,0.9369,0.9309,0.9795
LGBMClassifier(),0.3436,0.04,0.9344,0.9344,0.9349,0.9364,0.9335,0.9809
<catboost.core.CatBoostClassifier object at 0x17b2a4670>,2.4475,0.0318,0.9389,0.9389,0.9392,0.9429,0.9357,0.982
MLPClassifier(max_iter=1000),9.0588,0.007,0.924,0.924,0.9245,0.9261,0.9231,0.9754


In [9]:
def make_fitting(estimator, X_train, y_train):
    estimator.fit(X_train, y_train)
    return estimator

In [10]:
def make_prediction(estimator, X_test):
    y_pred = estimator.predict(X_test)
    return y_pred

In [11]:
for mdl in models:
    mdl.fit(X_train, y_train)
    # estimator = make_fitting(mdl, X_train, y_train)
    # y_pred = make_prediction(estimator, X_test)
    y_pred = mdl.predict(X_test)
    accaracy = metrics.accuracy_score(y_test, y_pred)
    bal_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1_score = metrics.f1_score(y_test, y_pred)
    roc_auc = metrics.roc_auc_score(y_test, y_pred)
    # mult_nb_balanced_accuracy, mult_nb_precision, mult_nb_recall, mult_nb_f1_score = get_metrics(y_test, y_pred)
    print(f'\nTesting the best model: {mdl}')
    print(f"Accuracy: {accaracy.round(4)}, Balanced Accuracy: {bal_accuracy.round(4)}, Precision {precision.round(4)},  Recall {recall.round(4)}, F1_score {f1_score.round(4)}")


Testing the best model: RandomForestClassifier()
Accuracy: 0.946, Balanced Accuracy: 0.946, Precision 0.9419,  Recall 0.9458, F1_score 0.9439

Testing the best model: GradientBoostingClassifier()
Accuracy: 0.937, Balanced Accuracy: 0.937, Precision 0.9317,  Recall 0.9375, F1_score 0.9346

Testing the best model: HistGradientBoostingClassifier()
Accuracy: 0.945, Balanced Accuracy: 0.9446, Precision 0.9493,  Recall 0.9354, F1_score 0.9423

Testing the best model: XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, mono