In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output="pandas")

from sklearn.model_selection import KFold, cross_validate

In [2]:
# Load the classification dataset
dft = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\train.csv")
dfo = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\bank-full.csv", delimiter=';')
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})
# df = pd.concat([dft, dfo])
y = dfo['y']
dfo = dfo.drop('y', axis=1)
dfo = dfo.drop(['month', 'day', 'job','marital'], axis=1)


In [3]:
dfo['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [4]:
dfo['default'] = pd.get_dummies(dfo['default'], drop_first=True, dtype=int)
dfo['housing'] = pd.get_dummies(dfo['housing'], drop_first=True, dtype=int)
dfo['loan'] = pd.get_dummies(dfo['loan'], drop_first=True, dtype=int)
dfo['poutcome'] = dfo['poutcome'].map({'failure': '0', 'other' : '0', 'unknown' : '0', 'succes' : '1'})
from sklearn.preprocessing import OrdinalEncoder
category_order = ['primary', 'secondary', 'unknown', 'tertiary']
ordinal_encoder = OrdinalEncoder(categories=[category_order])
dfo['education'] = ordinal_encoder.fit_transform(dfo[['education']])
dfo['contact'] = dfo['contact'].map({'telephone': 'cellular', 'unknown' : 'unknown', 'telephone': 'telephone'})
dfo['contact'] = pd.get_dummies(dfo['contact'], drop_first=True, dtype=int)

In [11]:
X = dfo.copy()
X.columns

Index(['age', 'education', 'default', 'balance', 'housing', 'loan', 'contact',
       'duration', 'campaign', 'pdays', 'previous', 'poutcome'],
      dtype='object')

In [6]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MultiInteractionFeatureAdder(BaseEstimator, TransformerMixin):
    """
    A custom transformer that adds new features by multiplying existing ones.
    """
    def __init__(self, interactions_to_add):
        self.interactions_to_add = interactions_to_add

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("MultiInteractionFeatureAdder requires a pandas DataFrame as input.")
        
        X_copy = X.copy()
        for interaction_tuple in self.interactions_to_add:
            new_col_name = '_x_'.join(interaction_tuple)
            X_copy[new_col_name] = X_copy[list(interaction_tuple)].prod(axis=1)
        return X_copy

class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    A custom transformer that drops specified columns from a DataFrame.
    """
    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("ColumnDropper requires a pandas DataFrame as input.")
        return X.drop(columns=self.cols_to_drop)


In [7]:
import itertools
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# a. Specify the features to create interactions from
features_to_transform = ['duration', 'campaign', 'pdays', 'previous']

# b. Define the classification models to evaluate
models = {
    "LightGBM": LGBMClassifier(random_state=42)
}

# c. Define all transformation pipelines
transformation_pipelines = {}

# Pipeline Set 1: Baseline model
transformation_pipelines['Baseline_Original_Features'] = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# d. Generate all possible interaction terms from degree 2 to 4
all_interactions = []
for r in range(2, len(features_to_transform) + 1):
    all_interactions.extend(itertools.combinations(features_to_transform, r))

# e. Create pipelines for each experiment type
for interaction_tuple in all_interactions:
    col_names = '_x_'.join(interaction_tuple)
    
    # Pipeline Set 2: Originals + one interaction feature
    pipeline_name_add = f'Originals_Plus_{col_names}'
    pipe_add = Pipeline([
        ('add_interaction', MultiInteractionFeatureAdder(interactions_to_add=[interaction_tuple])),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    transformation_pipelines[pipeline_name_add] = pipe_add
    
    # Pipeline Set 3: Interaction feature REPLACES its source columns
    pipeline_name_replace = f'Interaction_Replaces_{col_names}'
    cols_to_drop = list(interaction_tuple)
    pipe_replace = Pipeline([
        ('add_interaction', MultiInteractionFeatureAdder(interactions_to_add=[interaction_tuple])),
        ('drop_source_cols', ColumnDropper(cols_to_drop=cols_to_drop)),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    transformation_pipelines[pipeline_name_replace] = pipe_replace

# f. Define the classification scoring metrics
scoring_metrics = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_weighted',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted'
}

# g. Define the cross-validation strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# This DataFrame will hold all results for final comparison
all_results = pd.DataFrame()

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")
    model_results = {}


    # c. Transformation Pipelines Evaluation
    # Iterates through the ColumnTransformer pipelines defined in cell 3
    for tech_name, preprocessor in transformation_pipelines.items():
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(
            full_pipeline, X, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )
        model_results[tech_name] = {
            'Train F1-Score': scores['train_f1_score'].mean(),
            'CV F1-Score': scores['test_f1_score'].mean(),
            'CV Accuracy': scores['test_accuracy'].mean()
        }
        print(f"--- tranformation: {tech_name} ---")
    # d. Consolidate and store results
    temp_df = pd.DataFrame.from_dict(model_results, orient='index')
    temp_df['Model'] = model_name
    all_results = pd.concat([all_results, temp_df])

# e. Final processing for the results table
all_results.reset_index(inplace=True)
all_results.rename(columns={'index': 'Preprocessing Technique'}, inplace=True)
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)


In [9]:
# Reorder columns for a more logical presentation
final_columns_order = [
    'Model',
    'Preprocessing Technique',
    'CV F1-Score',
    'CV Accuracy',
    'Train F1-Score',
    'Generalization'
]
all_results = all_results[final_columns_order]

all_results

Unnamed: 0,Model,Preprocessing Technique,CV F1-Score,CV Accuracy,Train F1-Score,Generalization
13,LightGBM,Originals_Plus_duration_x_campaign_x_pdays,0.890248,0.899892,0.910073,0.978216
5,LightGBM,Originals_Plus_duration_x_previous,0.889474,0.898896,0.910012,0.977431
19,LightGBM,Originals_Plus_campaign_x_pdays_x_previous,0.889326,0.899095,0.90974,0.97756
7,LightGBM,Originals_Plus_campaign_x_pdays,0.88893,0.898808,0.910275,0.976551
17,LightGBM,Originals_Plus_duration_x_pdays_x_previous,0.888842,0.898764,0.910062,0.976682
11,LightGBM,Originals_Plus_pdays_x_previous,0.888716,0.89841,0.909529,0.977116
3,LightGBM,Originals_Plus_duration_x_pdays,0.888709,0.898631,0.909915,0.976694
1,LightGBM,Originals_Plus_duration_x_campaign,0.888641,0.898498,0.909982,0.976548
15,LightGBM,Originals_Plus_duration_x_campaign_x_previous,0.888634,0.898587,0.909751,0.976788
9,LightGBM,Originals_Plus_campaign_x_previous,0.888566,0.89852,0.909705,0.976762


Model	Preprocessing Technique	CV F1-Score	CV Accuracy	Train F1-Score	Generalization
13	LightGBM	Originals_Plus_duration_x_campaign_x_pdays	        0.890248	0.899892	0.910073	0.978216
5	LightGBM	Originals_Plus_duration_x_previous	                0.889474	0.898896	0.910012	0.977431
19	LightGBM	Originals_Plus_campaign_x_pdays_x_previous	        0.889326	0.899095	0.909740	0.977560
7	LightGBM	Originals_Plus_campaign_x_pdays	                    0.888930	0.898808	0.910275	0.976551
17	LightGBM	Originals_Plus_duration_x_pdays_x_previous	        0.888842	0.898764	0.910062	0.976682
11	LightGBM	Originals_Plus_pdays_x_previous	                    0.888716	0.898410	0.909529	0.977116
3	LightGBM	Originals_Plus_duration_x_pdays	                    0.888709	0.898631	0.909915	0.976694
1	LightGBM	Originals_Plus_duration_x_campaign	                0.888641	0.898498	0.909982	0.976548
15	LightGBM	Originals_Plus_duration_x_campaign_x_previous	    0.888634	0.898587	0.909751	0.976788
9	LightGBM	Originals_Plus_campaign_x_previous	                0.888566	0.898520	0.909705	0.976762
21	LightGBM	Originals_Plus_duration_x_campaign_x_pdays_x_p...	0.888278	0.898366	0.910007	0.976122
10	LightGBM	Interaction_Replaces_campaign_x_previous	        0.888025	0.898299	0.908752	0.977192
0	LightGBM	Baseline_Original_Features	                        0.887892	0.897835	0.909662	0.976067
8	LightGBM	Interaction_Replaces_campaign_x_pdays           	0.886242	0.896530	0.907265	0.976828
12	LightGBM	Interaction_Replaces_pdays_x_previous	            0.885401	0.895866	0.905707	0.977580
20	LightGBM	Interaction_Replaces_campaign_x_pdays_x_previous	0.885199	0.895933	0.904074	0.979122
4	LightGBM	Interaction_Replaces_duration_x_pdays	            0.878878	0.892261	0.900482	0.976009
2	LightGBM	Interaction_Replaces_duration_x_campaign	        0.864480	0.889164	0.884330	0.977554
6	LightGBM	Interaction_Replaces_duration_x_previous	        0.860828	0.889319	0.878345	0.980057
16	LightGBM	Interaction_Replaces_duration_x_campaign_x_pre...	0.858091	0.888213	0.876155	0.979382
14	LightGBM	Interaction_Replaces_duration_x_campaign_x_pdays	0.853461	0.883745	0.874284	0.976183
18	LightGBM	Interaction_Replaces_duration_x_pdays_x_previous	0.852408	0.885470	0.866899	0.983284
22	LightGBM	Interaction_Replaces_duration_x_campaign_x_pda...	0.849423	0.884873	0.864599	0.982447


In [10]:
LightGBM	ManualMap_OneHotEncoder	0.893398	0.902856	0.912014	0.979588
6	LightGBM	OrdinalEncoder	0.893398	0.902922	0.912159	0.979432
5	LightGBM	OneHotEncoder	0.892881	0.902258	0.912397	0.978610

LightGBM	OrdinalEncoder	0.893530	0.902856	0.911901	0.979854
7	LightGBM	ManualMap_OneHotEncoder	0.893398	0.902856	0.912014	0.979588
5	LightGBM	OneHotEncoder	0.892760	0.902325	0.912347	0.978531

SyntaxError: invalid syntax (1450552359.py, line 1)