In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output="pandas")

from sklearn.model_selection import KFold, cross_validate

In [2]:
# Load the classification dataset
dft = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\train.csv")
dfo = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\bank-full.csv", delimiter=';')
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})
# df = pd.concat([dft, dfo])
y = dfo['y']
dfo = dfo.drop('y', axis=1)
dfo = dfo.drop(['month', 'day', 'job','marital'], axis=1)


In [4]:
dfo['default'] = pd.get_dummies(dfo['default'], drop_first=True, dtype=int)
dfo['housing'] = pd.get_dummies(dfo['housing'], drop_first=True, dtype=int)
dfo['loan'] = pd.get_dummies(dfo['loan'], drop_first=True, dtype=int)
dfo['poutcome'] = dfo['poutcome'].map({'failure': '0', 'other' : '0', 'unknown' : '0', 'succes' : '1'})
from sklearn.preprocessing import OrdinalEncoder
category_order = ['primary', 'secondary', 'unknown', 'tertiary']
ordinal_encoder = OrdinalEncoder(categories=[category_order])
dfo['education'] = ordinal_encoder.fit_transform(dfo[['education']])
dfo['contact'] = dfo['contact'].map({'telephone': 'cellular', 'unknown' : 'unknown', 'telephone': 'telephone'})
dfo['contact'] = pd.get_dummies(dfo['contact'], drop_first=True, dtype=int)

In [5]:
X = dfo.copy()
X.columns

Index(['age', 'education', 'default', 'balance', 'housing', 'loan', 'contact',
       'duration', 'campaign', 'pdays', 'previous', 'poutcome'],
      dtype='object')

In [6]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MultiInteractionFeatureAdder(BaseEstimator, TransformerMixin):
    """
    A custom transformer that adds new features by multiplying existing ones.
    """
    def __init__(self, interactions_to_add):
        self.interactions_to_add = interactions_to_add

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("MultiInteractionFeatureAdder requires a pandas DataFrame as input.")
        
        X_copy = X.copy()
        for interaction_tuple in self.interactions_to_add:
            new_col_name = '_x_'.join(interaction_tuple)
            X_copy[new_col_name] = X_copy[list(interaction_tuple)].prod(axis=1)
        return X_copy

class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    A custom transformer that drops specified columns from a DataFrame.
    """
    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("ColumnDropper requires a pandas DataFrame as input.")
        return X.drop(columns=self.cols_to_drop)


In [7]:
import itertools
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



# a. Define the classification models to evaluate
models = {
    "LightGBM": LGBMClassifier(random_state=42)
}

# b. !!! ACTION REQUIRED !!!
# Fill this list with the best interaction terms you found in the last experiment.
# The format is a list of tuples.
shortlist_of_interactions = [
    ('duration', 'campaign', 'pdays'),            
    ('duration', 'previous'),                    
    ('campaign', 'pdays', 'previous'),   
    ('campaign', 'pdays')  
]
# c. Define all transformation pipelines
# c. Initialize pipelines and add the baseline model for comparison
transformation_pipelines = {}
transformation_pipelines['Baseline_Original_Features'] = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# d. Create pipelines for all combinations of the shortlisted features
# This will test the features individually (r=1) and in combination (r>1).
for r in range(1, len(shortlist_of_interactions) + 1):
    for subset in itertools.combinations(shortlist_of_interactions, r):
        # Create a descriptive name from the feature names in the subset
        name = ' & '.join(['_x_'.join(i) for i in subset])
        pipeline_name = f'Originals_Plus_{name}'
        
        # Create a pipeline that adds all interactions in the current subset
        pipe = Pipeline([
            ('add_interactions', MultiInteractionFeatureAdder(interactions_to_add=list(subset))),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        transformation_pipelines[pipeline_name] = pipe

# e. Define scoring and cross-validation
scoring_metrics = { 'f1_score': 'f1_weighted', 'accuracy': 'accuracy' }
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# This DataFrame will hold all results for final comparison
all_results = pd.DataFrame()

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")
    model_results = {}


    # c. Transformation Pipelines Evaluation
    # Iterates through the ColumnTransformer pipelines defined in cell 3
    for tech_name, preprocessor in transformation_pipelines.items():
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(
            full_pipeline, X, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )
        model_results[tech_name] = {
            'Train F1-Score': scores['train_f1_score'].mean(),
            'CV F1-Score': scores['test_f1_score'].mean(),
            'CV Accuracy': scores['test_accuracy'].mean()
        }
        print(f"--- tranformation: {tech_name} ---")
    # d. Consolidate and store results
    temp_df = pd.DataFrame.from_dict(model_results, orient='index')
    temp_df['Model'] = model_name
    all_results = pd.concat([all_results, temp_df])

# e. Final processing for the results table
all_results.reset_index(inplace=True)
all_results.rename(columns={'index': 'Preprocessing Technique'}, inplace=True)
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)


--- Evaluating Model: LightGBM ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 931
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190
[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 932
[LightGBM] [Info] Number of data points in the train set: 36169, number of used 

In [9]:
# Reorder columns for a more logical presentation
final_columns_order = [
    'Model',
    'Preprocessing Technique',
    'CV F1-Score',
    'CV Accuracy',
    'Train F1-Score',
    'Generalization'
]
all_results = all_results[final_columns_order]

all_results

Unnamed: 0,Model,Preprocessing Technique,CV F1-Score,CV Accuracy,Train F1-Score,Generalization
1,LightGBM,Originals_Plus_duration_x_campaign_x_pdays,0.890248,0.899892,0.910073,0.978216
13,LightGBM,Originals_Plus_duration_x_campaign_x_pdays & c...,0.890058,0.899825,0.909961,0.978128
9,LightGBM,Originals_Plus_duration_x_previous & campaign_...,0.889944,0.899781,0.910392,0.977539
12,LightGBM,Originals_Plus_duration_x_campaign_x_pdays & d...,0.889717,0.899693,0.91092,0.976723
7,LightGBM,Originals_Plus_duration_x_campaign_x_pdays & c...,0.889587,0.899449,0.910679,0.976839
2,LightGBM,Originals_Plus_duration_x_previous,0.889474,0.898896,0.910012,0.977431
11,LightGBM,Originals_Plus_duration_x_campaign_x_pdays & d...,0.889453,0.899449,0.911278,0.97605
6,LightGBM,Originals_Plus_duration_x_campaign_x_pdays & c...,0.889351,0.899118,0.910527,0.976743
3,LightGBM,Originals_Plus_campaign_x_pdays_x_previous,0.889326,0.899095,0.90974,0.97756
5,LightGBM,Originals_Plus_duration_x_campaign_x_pdays & d...,0.889258,0.899272,0.910501,0.976669


In [10]:
LightGBM	ManualMap_OneHotEncoder	0.893398	0.902856	0.912014	0.979588
6	LightGBM	OrdinalEncoder	0.893398	0.902922	0.912159	0.979432
5	LightGBM	OneHotEncoder	0.892881	0.902258	0.912397	0.978610

LightGBM	OrdinalEncoder	0.893530	0.902856	0.911901	0.979854
7	LightGBM	ManualMap_OneHotEncoder	0.893398	0.902856	0.912014	0.979588
5	LightGBM	OneHotEncoder	0.892760	0.902325	0.912347	0.978531

SyntaxError: invalid syntax (1450552359.py, line 1)