In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_validate

In [2]:
# Load the classification dataset.
# The data directory is defined once and can be overridden through the BANK_DATA_DIR
# environment variable; the default is the directory that was hard-coded in this notebook.
DATA_DIR = Path(os.environ.get(
    "BANK_DATA_DIR",
    r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset",
))
# NOTE(review): dft is not used by any cell visible here; it is still loaded in case later cells need it.
dft = pd.read_csv(DATA_DIR / "train.csv")
dfo = pd.read_csv(DATA_DIR / "bank-full.csv", delimiter=';')

# Encode the target as 0/1. Series.map turns any value missing from the dict into NaN,
# so fail right here instead of deep inside cross-validation if an unexpected label appears.
dfo['y'] = dfo['y'].map({'no': 0, 'yes': 1})
assert dfo['y'].notna().all(), "unexpected label in column 'y' (expected only 'no' / 'yes')"
# df = pd.concat([dft, dfo])
y = dfo['y']

# Remove the target together with the columns that are not used as features.
dfo = dfo.drop(columns=['y', 'month', 'day', 'job', 'marital'])


In [3]:
# Show how many rows fall into each education category
dfo['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [4]:
# Two-level columns -> one 0/1 indicator each. get_dummies creates one column per category and
# drop_first=True removes the first of them, leaving a single indicator column to assign back
# (presumably the 'yes' indicator of a 'no'/'yes' column - verify against the data).
for binary_col in ['default', 'housing', 'loan']:
    dfo[binary_col] = pd.get_dummies(dfo[binary_col], drop_first=True, dtype=int)

# 'poutcome' is left untouched here: it is the feature encoded by the ColumnTransformer
# pipelines defined later in the notebook.

# Education -> ordinal code following the explicit category order below.
# NOTE(review): 'unknown' is ranked between 'secondary' and 'tertiary'; confirm this is intended.
from sklearn.preprocessing import OrdinalEncoder
category_order = ['primary', 'secondary', 'unknown', 'tertiary']
ordinal_encoder = OrdinalEncoder(categories=[category_order])
dfo['education'] = ordinal_encoder.fit_transform(dfo[['education']])

# Contact -> 0/1 indicator of an unknown contact type.
# The previous mapping listed the key 'telephone' twice, so Python kept only the last entry and
# every 'cellular' row was mapped to NaN. All three categories are now mapped explicitly:
# 'telephone' is merged into 'cellular', which leaves exactly two groups for get_dummies.
contact_groups = {'cellular': 'cellular', 'telephone': 'cellular', 'unknown': 'unknown'}
dfo['contact'] = dfo['contact'].map(contact_groups)
dfo['contact'] = pd.get_dummies(dfo['contact'], drop_first=True, dtype=int)

In [5]:
# Feature matrix for the experiments below: an independent copy of the preprocessed frame
X = dfo.copy()

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Ask scikit-learn transformers for pandas output; ColumnValueMapper.transform below relies on X.columns
set_config(transform_output="pandas")
# Transformer that rewrites categorical values through a user-supplied lookup table
class ColumnValueMapper(BaseEstimator, TransformerMixin):
    """
    Replace values in the first column of a DataFrame according to `mapping`.

    Parameters
    ----------
    mapping : dict
        Lookup of ``old value -> new value`` handed to ``Series.replace``.
        Values that are not keys of the lookup pass through untouched.
    """
    def __init__(self, mapping):
        self.mapping = mapping

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose first column has been passed through the mapping."""
        mapped = X.copy()
        # Only the first column is rewritten; any other columns are copied as they are
        target = mapped.columns[0]
        replaced = mapped[target].replace(self.mapping)
        mapped[target] = replaced
        return mapped

    def get_feature_names_out(self, input_features=None):
        """Column names are not changed, so the given names are returned as-is."""
        return input_features

# a. Specify the feature to apply transformations on
# (the evaluation loop drops this column for the baseline run and feeds it to each encoder otherwise)
feature_to_transform = 'poutcome'

# b. Define the classification models to evaluate
# Both models use a fixed random_state; verbose=0 is passed to CatBoost (presumably to silence its training log).
models = {
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0)
}

# c. Define the transformation pipelines to test on the specified feature
def _feature_encoder(*encoding_steps):
    """
    Build a ColumnTransformer that encodes only `feature_to_transform`.

    The selected column is first imputed with its most frequent value and then passed
    through `encoding_steps` (``(name, transformer)`` tuples, applied in the given order).
    All remaining columns are passed through unchanged.
    """
    return ColumnTransformer(
        transformers=[
            ('encoder', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                *encoding_steps,
            ]), [feature_to_transform])
        ],
        remainder='passthrough'
    )


# Collapse the outcome of the previous campaign into "success" ('1') versus everything else ('0').
# The key was previously misspelled as 'succes', so it never matched the 'success' category
# and that value was passed on unmapped.
poutcome_binary_map = {'failure': '0', 'other': '0', 'unknown': '0', 'success': '1'}

transformation_pipelines = {
    'OneHotEncoder': _feature_encoder(
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)),
    ),
    'OrdinalEncoder': _feature_encoder(
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ),
    'ManualMap_OneHotEncoder': _feature_encoder(
        ('mapper', ColumnValueMapper(mapping=poutcome_binary_map)),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ),
}

# d. Define the classification scoring metrics
# Keys are the labels under which cross_validate reports each score (the evaluation loop reads
# e.g. 'test_f1_score' and 'train_f1_score'); values are scikit-learn scorer names.
# NOTE(review): the *_weighted scorers average per-class scores by class support, so with an
# imbalanced target they mostly reflect the majority class - confirm this is the metric wanted.
scoring_metrics = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_weighted',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted'
}

# e. Define the cross-validation strategy
# 5 shuffled folds with a fixed seed, so every model/preprocessing combination sees the same splits.
# NOTE(review): KFold does not stratify on the target; consider StratifiedKFold if class balance per fold matters.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

In [7]:
def _mean_cv_scores(cv_output):
    """Average the per-fold arrays returned by cross_validate into the reported summary columns."""
    return {
        'Train F1-Score': cv_output['train_f1_score'].mean(),
        'CV F1-Score': cv_output['test_f1_score'].mean(),
        'CV Accuracy': cv_output['test_accuracy'].mean(),
    }


# One record per (model, preprocessing technique). The records are collected in a plain list and
# turned into a DataFrame once at the end, instead of growing a DataFrame with pd.concat per model.
result_rows = []

# Baseline input without the feature under study; it does not depend on the model,
# so it is built once outside the loop.
X_dropped = X.drop(columns=[feature_to_transform])

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")

    # a. "Feature Dropped" evaluation: the model is fitted on X_dropped directly, no preprocessing
    feature_dropped_scores = cross_validate(
        model, X_dropped, y, cv=cv_strategy,
        scoring=scoring_metrics, return_train_score=True
    )
    result_rows.append({
        'Preprocessing Technique': 'Feature Dropped',
        **_mean_cv_scores(feature_dropped_scores),
        'Model': model_name,
    })
    print(f"--- feature dropped: {model_name} ---")

    # b. Transformation pipelines evaluation: each ColumnTransformer from
    # `transformation_pipelines` is chained in front of the model and cross-validated on the full X
    for tech_name, preprocessor in transformation_pipelines.items():
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(
            full_pipeline, X, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )
        result_rows.append({
            'Preprocessing Technique': tech_name,
            **_mean_cv_scores(scores),
            'Model': model_name,
        })
        print(f"--- tranformation: {tech_name} ---")

# c. Build the results table for final comparison
all_results = pd.DataFrame(result_rows)
# Ratio of cross-validated to training F1; values well below 1 indicate a larger train/CV gap
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)


--- Evaluating Model: LightGBM ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190
[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 926
[LightGBM] [Info] Number of data points in the train set: 36169, number of used 

In [8]:
# Present the identifying columns first, followed by the scores
final_columns_order = [
    'Model', 'Preprocessing Technique',
    'CV F1-Score', 'CV Accuracy',
    'Train F1-Score', 'Generalization',
]
all_results = all_results.loc[:, final_columns_order]

# Display the reordered table
all_results

Unnamed: 0,Model,Preprocessing Technique,CV F1-Score,CV Accuracy,Train F1-Score,Generalization
3,LightGBM,ManualMap_OneHotEncoder,0.893398,0.902856,0.912014,0.979588
2,LightGBM,OrdinalEncoder,0.893398,0.902922,0.912159,0.979432
1,LightGBM,OneHotEncoder,0.892881,0.902258,0.912397,0.97861
7,CatBoost,ManualMap_OneHotEncoder,0.891275,0.901197,0.923285,0.96533
6,CatBoost,OrdinalEncoder,0.891137,0.901219,0.923935,0.964502
5,CatBoost,OneHotEncoder,0.890415,0.900511,0.923801,0.96386
0,LightGBM,Feature Dropped,0.889643,0.89956,0.909068,0.978632
4,CatBoost,Feature Dropped,0.887939,0.898542,0.91994,0.965214


In [9]:
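# Result rows pasted from earlier runs, kept for reference only (not executable code).
# Columns appear to be: Model, Preprocessing Technique, CV F1-Score, CV Accuracy,
# Train F1-Score, Generalization; a leading number, where present, is the DataFrame row index.
#
# First pasted table:
#   LightGBM  ManualMap_OneHotEncoder  0.893398  0.902856  0.912014  0.979588
#   6  LightGBM  OrdinalEncoder  0.893398  0.902922  0.912159  0.979432
#   5  LightGBM  OneHotEncoder  0.892881  0.902258  0.912397  0.978610
#
# Second pasted table:
#   LightGBM  OrdinalEncoder  0.893530  0.902856  0.911901  0.979854
#   7  LightGBM  ManualMap_OneHotEncoder  0.893398  0.902856  0.912014  0.979588
#   5  LightGBM  OneHotEncoder  0.892760  0.902325  0.912347  0.978531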

SyntaxError: invalid syntax (1450552359.py, line 1)