In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from feature_engine.outliers import Winsorizer
from sklearn.model_selection import KFold, cross_validate

In [2]:
# Load the classification dataset
dft = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\train.csv")
dfo = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\bank-full.csv", delimiter=';')
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})
# df = pd.concat([dft, dfo])
y = dfo['y']
dfo = dfo.drop('y', axis=1)

In [3]:
dfo['default'] = pd.get_dummies(dfo['default'], drop_first=True, dtype=int)
dfo['housing'] = pd.get_dummies(dfo['housing'], drop_first=True, dtype=int)
dfo['loan'] = pd.get_dummies(dfo['loan'], drop_first=True, dtype=int)
dfo['contact'] = dfo['contact'].map({'telephone': 'cellular', 'unknown' : 'unknown', 'telephone': 'telephone'})
dfo['contact'] = pd.get_dummies(dfo['contact'], drop_first=True, dtype=int)

In [10]:
import lightgbm as lgb
from catboost import CatBoostClassifier

In [5]:
X = dfo.select_dtypes(include=np.number)

In [11]:
# a. Specify the feature to apply transformations on
# You can change 'Age' to any other numerical column name from your dataframe
feature_to_transform = 'duration'

# b. Define the classification models to evaluate
models = {
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0)
}

# c. Define the transformation pipelines to test on the specified feature
# The ColumnTransformer applies the specified pipeline to the 'feature_to_transform'
# and leaves the other features untouched (remainder='passthrough').
transformation_pipelines = {
    'Log Transformation': ColumnTransformer(
        transformers=[
            ('transformer', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('log', FunctionTransformer(np.log1p))
            ]), [feature_to_transform])
        ],
        remainder='passthrough'
    ),
    'Square Root Transformation': ColumnTransformer(
        transformers=[
            ('transformer', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('sqrt', FunctionTransformer(np.sqrt))
            ]), [feature_to_transform])
        ],
        remainder='passthrough'
    ),
    'Winsorization (0-95%)': ColumnTransformer(
        transformers=[
            ('transformer', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('winsorizer', Winsorizer(capping_method='quantiles', tail='right', fold=0.05))
            ]), [feature_to_transform])
        ],
        remainder='passthrough'
    )
}

# d. Define the classification scoring metrics
scoring_metrics = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_weighted',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted'
}

# e. Define the cross-validation strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
# This DataFrame will hold all results for final comparison
all_results = pd.DataFrame()

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")
    model_results = {}

    # a. "Feature Dropped" Evaluation
    # Dynamically drops the feature specified in cell 3
    X_dropped = X.drop(columns=[feature_to_transform])
    feature_dropped_scores = cross_validate(
        model, X_dropped, y, cv=cv_strategy,
        scoring=scoring_metrics, return_train_score=True
    )
    model_results['Feature Dropped'] = {
        'Train F1-Score': feature_dropped_scores['train_f1_score'].mean(),
        'CV F1-Score': feature_dropped_scores['test_f1_score'].mean(),
        'CV Accuracy': feature_dropped_scores['test_accuracy'].mean()
    }
    print(f"--- feature dropped: {model_name} ---")
    # b. "Imputation Only" Baseline Evaluation
    # Applies imputation only to the specified feature, others are passed through
    baseline_preprocessor = ColumnTransformer(
        transformers=[
            ('imputer', SimpleImputer(strategy='median'), [feature_to_transform])
        ],
        remainder='passthrough'
    )
    baseline_pipeline = Pipeline(steps=[
        ('preprocessor', baseline_preprocessor),
        ('model', model)
    ])
    baseline_scores = cross_validate(
        baseline_pipeline, X, y, cv=cv_strategy,
        scoring=scoring_metrics, return_train_score=True
    )
    model_results['Baseline (Imputation Only)'] = {
        'Train F1-Score': baseline_scores['train_f1_score'].mean(),
        'CV F1-Score': baseline_scores['test_f1_score'].mean(),
        'CV Accuracy': baseline_scores['test_accuracy'].mean()
    }
    print(f"--- imputation only: {model_name} ---")
    # c. Transformation Pipelines Evaluation
    # Iterates through the ColumnTransformer pipelines defined in cell 3
    for tech_name, preprocessor in transformation_pipelines.items():
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(
            full_pipeline, X, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )
        model_results[tech_name] = {
            'Train F1-Score': scores['train_f1_score'].mean(),
            'CV F1-Score': scores['test_f1_score'].mean(),
            'CV Accuracy': scores['test_accuracy'].mean()
        }
        print(f"--- tranformation: {tech_name} ---")
    # d. Consolidate and store results
    temp_df = pd.DataFrame.from_dict(model_results, orient='index')
    temp_df['Model'] = model_name
    all_results = pd.concat([all_results, temp_df])

# e. Final processing for the results table
all_results.reset_index(inplace=True)
all_results.rename(columns={'index': 'Preprocessing Technique'}, inplace=True)
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)


--- Evaluating Model: LightGBM ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 698
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190
[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003504 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 699
[LightGBM] [Info] Number of data points in the train set: 36169, number of used 



[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118306 -> initscore=-2.008573
[LightGBM] [Info] Start training from score -2.008573




[LightGBM] [Info] Number of positive: 4184, number of negative: 31985
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115679 -> initscore=-2.033999
[LightGBM] [Info] Start training from score -2.033999




[LightGBM] [Info] Number of positive: 4269, number of negative: 31900
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118029 -> initscore=-2.011226
[LightGBM] [Info] Start training from score -2.011226




[LightGBM] [Info] Number of positive: 4226, number of negative: 31943
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116840 -> initscore=-2.022697
[LightGBM] [Info] Start training from score -2.022697




--- imputation only: LightGBM ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 953
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190




[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118306 -> initscore=-2.008573
[LightGBM] [Info] Start training from score -2.008573




[LightGBM] [Info] Number of positive: 4184, number of negative: 31985
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115679 -> initscore=-2.033999
[LightGBM] [Info] Start training from score -2.033999




[LightGBM] [Info] Number of positive: 4269, number of negative: 31900
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118029 -> initscore=-2.011226
[LightGBM] [Info] Start training from score -2.011226




[LightGBM] [Info] Number of positive: 4226, number of negative: 31943
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116840 -> initscore=-2.022697
[LightGBM] [Info] Start training from score -2.022697




--- tranformation: Log Transformation ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003059 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 953
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190




[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118306 -> initscore=-2.008573
[LightGBM] [Info] Start training from score -2.008573




[LightGBM] [Info] Number of positive: 4184, number of negative: 31985
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004992 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115679 -> initscore=-2.033999
[LightGBM] [Info] Start training from score -2.033999




[LightGBM] [Info] Number of positive: 4269, number of negative: 31900
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118029 -> initscore=-2.011226
[LightGBM] [Info] Start training from score -2.011226




[LightGBM] [Info] Number of positive: 4226, number of negative: 31943
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116840 -> initscore=-2.022697
[LightGBM] [Info] Start training from score -2.022697




--- tranformation: Square Root Transformation ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 953
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190




[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118306 -> initscore=-2.008573
[LightGBM] [Info] Start training from score -2.008573




[LightGBM] [Info] Number of positive: 4184, number of negative: 31985
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115679 -> initscore=-2.033999
[LightGBM] [Info] Start training from score -2.033999




[LightGBM] [Info] Number of positive: 4269, number of negative: 31900
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 952
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.118029 -> initscore=-2.011226
[LightGBM] [Info] Start training from score -2.011226




[LightGBM] [Info] Number of positive: 4226, number of negative: 31943
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 954
[LightGBM] [Info] Number of data points in the train set: 36169, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116840 -> initscore=-2.022697
[LightGBM] [Info] Start training from score -2.022697




--- tranformation: Winsorization (0-95%) ---
--- Evaluating Model: CatBoost ---
--- feature dropped: CatBoost ---
--- imputation only: CatBoost ---
--- tranformation: Log Transformation ---
--- tranformation: Square Root Transformation ---
--- tranformation: Winsorization (0-95%) ---


In [13]:
# Reorder columns for a more logical presentation
final_columns_order = [
    'Model',
    'Preprocessing Technique',
    'CV F1-Score',
    'CV Accuracy',
    'Train F1-Score',
    'Generalization'
]
all_results = all_results[final_columns_order]

all_results

Unnamed: 0,Model,Preprocessing Technique,CV F1-Score,CV Accuracy,Train F1-Score,Generalization
1,LightGBM,Baseline (Imputation Only),0.891245,0.900246,0.913464,0.975676
2,LightGBM,Log Transformation,0.891245,0.900246,0.913464,0.975676
3,LightGBM,Square Root Transformation,0.891245,0.900246,0.913464,0.975676
4,LightGBM,Winsorization (0-95%),0.890039,0.899383,0.911915,0.97601
8,CatBoost,Square Root Transformation,0.889514,0.899449,0.924754,0.961893
6,CatBoost,Baseline (Imputation Only),0.889514,0.899449,0.924754,0.961893
7,CatBoost,Log Transformation,0.889514,0.899449,0.924754,0.961893
9,CatBoost,Winsorization (0-95%),0.888676,0.898432,0.9233,0.9625
0,LightGBM,Feature Dropped,0.858809,0.888479,0.87436,0.982215
5,CatBoost,Feature Dropped,0.858723,0.88755,0.884103,0.971293
