In [4]:
# -*- coding: UTF-8 -*-
"""Comparison performance of the most popular classifiers with applying different scaling technique on the Titanic dataset."""
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from titanic_data_cleaning import train_df_clean, test_df_clean
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn import metrics

In [5]:
# Train/test split of the train.csv data
X = train_df_clean.drop('Survived', axis=1).values
y = train_df_clean['Survived'].values

# Split for train/test data as 90% and 20 %
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Get the data for prediction from the test.csv data
X_for_prediction = test_df_clean.values

In [6]:
def make_scaling(scaler, X_train, X_test):
    """Perform feature scaling technique.
    
    Example:
    >>>scaler = StandardScaler()
    >>> X_train, X_test = make_scaling(scaler, X_train, X_test)
    """
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [7]:
# StandardScaler or Z-Score Normalization
scaler = StandardScaler()
X_train_s, X_test_s = make_scaling(scaler, X_train, X_test)

In [8]:
# MinMaxScaling(commonly used normalization technique)
min_max_scaler = MinMaxScaler()
X_train_n, X_test_n = make_scaling(min_max_scaler, X_train, X_test)

In [9]:
# Instances of the classification models
lr = LogisticRegression()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
hist_gb = HistGradientBoostingClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
catgb = CatBoostClassifier(verbose=0, n_estimators=200)
mlp = MLPClassifier(max_iter=1500, solver='adam')



models = (lr, rf, gb, hist_gb, xgb, lgb, catgb, mlp)
models_tuple = ('LogisticRegression',
                'RandomForestClassifier',
                'GradientBoostingClassifier',
                'HistGradientBoostingClassifier',
                'XGBClassifier',
                'LGBMClassifier',
                'CatBoostClassifier',
                'MLPClassifier',
               )
models

(LogisticRegression(),
 RandomForestClassifier(),
 GradientBoostingClassifier(),
 HistGradientBoostingClassifier(),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, gamma=None,
               gpu_id=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, n_estimators=100, n_jobs=None,
               num_parallel_tree=None, predictor=None, random_state=None,
               reg_alpha=None, reg_lambda=None, ...),
 LGBMClassifier(),
 <catboost.core.CatBoostClassifier at 0x2792236f250>,
 MLPClassifier(max_iter=1500))

In [10]:
def make_cross_validation(estimator, X_train, y_train, cv=5):
    """Perform cross validation and teturn metrics:
    accuracy, balanced_accuracy, f1, precision, recall, roc_auc"""
    scorings = ('accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall', 'roc_auc')
    scores = cross_validate(estimator, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorings)
    final_metrics = dict()
    for key, item in sorted(scores.items()):
        final_metrics[key] = item.mean().round(4)
    return final_metrics

In [11]:
# Using raw data without scaling
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=12)
data_cross_val = []
for mdl in models:
    report_dict = make_cross_validation(mdl, X_train, y_train, cv=cv)
    data_cross_val.append(report_dict)
df_report = pd.DataFrame(data_cross_val, index=models_tuple)
df_report

Unnamed: 0,fit_time,score_time,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_roc_auc
LogisticRegression,0.0176,0.005,0.8017,0.7817,0.724,0.7482,0.705,0.8546
RandomForestClassifier,0.1504,0.0296,0.825,0.8047,0.7539,0.7863,0.727,0.8659
GradientBoostingClassifier,0.0791,0.0052,0.8416,0.8172,0.7716,0.8297,0.7239,0.8683
HistGradientBoostingClassifier,0.1955,0.0098,0.8253,0.803,0.7522,0.7928,0.7178,0.8687
XGBClassifier,0.0562,0.008,0.8168,0.7975,0.7448,0.77,0.7232,0.864
LGBMClassifier,0.0337,0.0136,0.8261,0.8045,0.7538,0.7919,0.7216,0.87
CatBoostClassifier,0.3045,0.0067,0.8402,0.8147,0.7682,0.8301,0.7171,0.8681
MLPClassifier,0.5796,0.0048,0.7938,0.7759,0.7165,0.7297,0.7073,0.8588


In [12]:
# Using data after standardization with StandardScaler
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=12)
data_cross_val = []
for mdl in models:
    report_dict = make_cross_validation(mdl, X_train_s, y_train, cv=cv)
    data_cross_val.append(report_dict)
df_report = pd.DataFrame(data_cross_val, index=models_tuple)
df_report

Unnamed: 0,fit_time,score_time,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_roc_auc
LogisticRegression,0.0032,0.0047,0.8025,0.7831,0.7259,0.7474,0.7088,0.8549
RandomForestClassifier,0.1675,0.03,0.823,0.8028,0.7515,0.7813,0.7254,0.8659
GradientBoostingClassifier,0.0869,0.0058,0.8416,0.8172,0.7716,0.8297,0.7239,0.8684
HistGradientBoostingClassifier,0.2126,0.0103,0.825,0.8026,0.7517,0.7926,0.7171,0.868
XGBClassifier,0.0509,0.0077,0.8166,0.7972,0.7445,0.7694,0.7232,0.8639
LGBMClassifier,0.0297,0.007,0.8264,0.8049,0.7543,0.7923,0.7224,0.8694
CatBoostClassifier,0.2579,0.0066,0.8402,0.8147,0.7682,0.8301,0.7171,0.8682
MLPClassifier,1.968,0.0046,0.8166,0.7911,0.7356,0.7886,0.6936,0.85


In [13]:
# Using data after normalization with MinMaxScaler
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=12)
data_cross_val = []
for mdl in models:
    report_dict = make_cross_validation(mdl, X_train_n, y_train, cv=cv)
    data_cross_val.append(report_dict)
df_report = pd.DataFrame(data_cross_val, index=models_tuple)
df_report

Unnamed: 0,fit_time,score_time,test_accuracy,test_balanced_accuracy,test_f1,test_precision,test_recall,test_roc_auc
LogisticRegression,0.0061,0.0048,0.8042,0.7851,0.7282,0.7496,0.7119,0.8561
RandomForestClassifier,0.1526,0.0305,0.823,0.8026,0.7513,0.7831,0.7247,0.8669
GradientBoostingClassifier,0.0821,0.0053,0.841,0.8163,0.7705,0.8299,0.7217,0.8689
HistGradientBoostingClassifier,0.201,0.0099,0.825,0.8026,0.7517,0.7926,0.7171,0.8678
XGBClassifier,0.0495,0.0076,0.8168,0.7975,0.7448,0.77,0.7232,0.8641
LGBMClassifier,0.0288,0.0084,0.8261,0.8045,0.7538,0.7919,0.7216,0.87
CatBoostClassifier,0.2612,0.006,0.8402,0.8147,0.7682,0.8301,0.7171,0.8681
MLPClassifier,0.7752,0.0049,0.809,0.7778,0.717,0.794,0.658,0.8567


In [14]:
def test_evaluation(estimator, X_train, y_train, X_test, y_test) -> dict:
    """Calculates the main model metrics - 
    accuracy, balanced-Accuracy, recall, precision, f1_score on the test set
    and returns them as a map object."""
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred).round(4)
    balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred).round(4)
    precision = metrics.precision_score(y_test, y_pred).round(4)
    f_1 = metrics.f1_score(y_test, y_pred).round(4)
    recall = metrics.recall_score(y_test, y_pred).round(4)
    roc_auc = metrics.roc_auc_score(y_test, y_pred).round(4)

    final_metrics = {'accuracy': accuracy,
                     'balanced_accuracy': balanced_accuracy,
                     'f1': f_1,
                     'precision': precision,
                     'recall': recall,
                     'roc_auc': roc_auc,
                    }
    # Use dict comprehension for round dict values
    final_metrics = {key:round(val, 4) for key, val in final_metrics.items()} 
    return final_metrics

In [15]:
# Evaluate the models on the test set with raw data
data_test = []
for mdl in models:
    report_dict_test = test_evaluation(mdl, X_train, y_train, X_test, y_test)
    data_test.append(report_dict_test)
df_report_test = pd.DataFrame(data_test, index=models_tuple)
df_report_test

Unnamed: 0,accuracy,balanced_accuracy,f1,precision,recall,roc_auc
LogisticRegression,0.7877,0.7821,0.7532,0.7733,0.7342,0.7821
RandomForestClassifier,0.7933,0.7884,0.7613,0.7763,0.7468,0.7884
GradientBoostingClassifier,0.7877,0.7768,0.7397,0.806,0.6835,0.7768
HistGradientBoostingClassifier,0.7821,0.7758,0.7451,0.7703,0.7215,0.7758
XGBClassifier,0.7709,0.7618,0.7248,0.7714,0.6835,0.7618
LGBMClassifier,0.7933,0.7884,0.7613,0.7763,0.7468,0.7884
CatBoostClassifier,0.7765,0.7654,0.726,0.791,0.6709,0.7654
MLPClassifier,0.8045,0.7971,0.7682,0.8056,0.7342,0.7971


In [16]:
# Evaluate the models on the test set using data after standardization with StandardScaler
data_test = []
for mdl in models:
    report_dict_test = test_evaluation(mdl, X_train_s, y_train, X_test_s, y_test)
    data_test.append(report_dict_test)
df_report_test = pd.DataFrame(data_test, index=models_tuple)
df_report_test

Unnamed: 0,accuracy,balanced_accuracy,f1,precision,recall,roc_auc
LogisticRegression,0.7821,0.7771,0.7484,0.7632,0.7342,0.7771
RandomForestClassifier,0.7877,0.7834,0.7564,0.7662,0.7468,0.7834
GradientBoostingClassifier,0.7877,0.7768,0.7397,0.806,0.6835,0.7768
HistGradientBoostingClassifier,0.7821,0.7758,0.7451,0.7703,0.7215,0.7758
XGBClassifier,0.7709,0.7618,0.7248,0.7714,0.6835,0.7618
LGBMClassifier,0.7933,0.7884,0.7613,0.7763,0.7468,0.7884
CatBoostClassifier,0.7765,0.7654,0.726,0.791,0.6709,0.7654
MLPClassifier,0.7654,0.7554,0.7162,0.7681,0.6709,0.7554


In [17]:
# Evaluate the models on the test set using data after standardization with MinMaxScaler
data_test = []
for mdl in models:
    report_dict_test = test_evaluation(mdl, X_train_n, y_train, X_test_n, y_test)
    data_test.append(report_dict_test)
df_report_test = pd.DataFrame(data_test, index=models_tuple)
df_report_test

Unnamed: 0,accuracy,balanced_accuracy,f1,precision,recall,roc_auc
LogisticRegression,0.7877,0.7834,0.7564,0.7662,0.7468,0.7834
RandomForestClassifier,0.7821,0.7758,0.7451,0.7703,0.7215,0.7758
GradientBoostingClassifier,0.7877,0.7768,0.7397,0.806,0.6835,0.7768
HistGradientBoostingClassifier,0.7821,0.7758,0.7451,0.7703,0.7215,0.7758
XGBClassifier,0.7709,0.7618,0.7248,0.7714,0.6835,0.7618
LGBMClassifier,0.7933,0.7884,0.7613,0.7763,0.7468,0.7884
CatBoostClassifier,0.7765,0.7654,0.726,0.791,0.6709,0.7654
MLPClassifier,0.7877,0.7741,0.7324,0.8254,0.6582,0.7741


1) Therefore, we could see that using **StandardScaler** doesn't affect on boosting algorithms like: *GradientBoostingClassifier, HistGradientBoostingClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier*. Moreover **StandardScaler** lead to falling metrics for *Random Forest*, *MLPClassifier*, and *LogisticRegression*.

2) Using **MinMaxScaler** as normalization technique also doesn't affect on boosting algorithms like: *GradientBoostingClassifier, HistGradientBoostingClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier*. It slightly improves only the *LogisticRegression* algorithm but leads to falling metrics for *Random Forest* and *MLPClassifier* as compared to data without scaling. However, falling metrics for *MLPClassifier* was not so drastic as during using **StandardScaler**, but for *Random Forest* it was.