In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output="pandas")

from sklearn.model_selection import KFold, cross_validate

In [18]:
# Load the classification dataset
dft = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\train.csv")
dfo = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\bank-full.csv", delimiter=';')
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})
# df = pd.concat([dft, dfo])
y = dfo['y']
dfo = dfo.drop('y', axis=1)
dfo = dfo.drop(['job','marital'], axis=1)


In [19]:
dfo['default'] = pd.get_dummies(dfo['default'], drop_first=True, dtype=int)
dfo['housing'] = pd.get_dummies(dfo['housing'], drop_first=True, dtype=int)
dfo['loan'] = pd.get_dummies(dfo['loan'], drop_first=True, dtype=int)
dfo['poutcome'] = dfo['poutcome'].map({'failure': '0', 'other' : '0', 'unknown' : '0', 'succes' : '1'})
from sklearn.preprocessing import OrdinalEncoder
category_order = ['primary', 'secondary', 'unknown', 'tertiary']
ordinal_encoder = OrdinalEncoder(categories=[category_order])
dfo['education'] = ordinal_encoder.fit_transform(dfo[['education']])
dfo['contact'] = dfo['contact'].map({'telephone': 'cellular', 'unknown' : 'unknown', 'telephone': 'telephone'})
dfo['contact'] = pd.get_dummies(dfo['contact'], drop_first=True, dtype=int)

In [20]:
X = dfo.copy()
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        45211 non-null  int64   
 1   education  45211 non-null  float64 
 2   default    45211 non-null  int64   
 3   balance    45211 non-null  int64   
 4   housing    45211 non-null  int64   
 5   loan       45211 non-null  int64   
 6   contact    45211 non-null  int64   
 7   day        45211 non-null  int64   
 8   month      45211 non-null  category
 9   duration   45211 non-null  int64   
 10  campaign   45211 non-null  int64   
 11  pdays      45211 non-null  int64   
 12  previous   45211 non-null  int64   
 13  poutcome   43700 non-null  category
dtypes: category(2), float64(1), int64(11)
memory usage: 4.2 MB


In [21]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class MultiInteractionFeatureAdder(BaseEstimator, TransformerMixin):
    """
    A custom transformer that adds new features by multiplying existing ones.
    """
    def __init__(self, interactions_to_add):
        self.interactions_to_add = interactions_to_add

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("MultiInteractionFeatureAdder requires a pandas DataFrame as input.")
        
        X_copy = X.copy()
        for interaction_tuple in self.interactions_to_add:
            new_col_name = '_x_'.join(interaction_tuple)
            X_copy[new_col_name] = X_copy[list(interaction_tuple)].prod(axis=1)
        return X_copy

class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    A custom transformer that drops specified columns from a DataFrame.
    """
    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("ColumnDropper requires a pandas DataFrame as input.")
        return X.drop(columns=self.cols_to_drop)


In [22]:
import itertools
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier



# a. Define the classification models to evaluate
models = {
    "LightGBM": LGBMClassifier(random_state=42)
}

# b. !!! ACTION REQUIRED !!!
# Fill this list with the best interaction terms you found in the last experiment.
# The format is a list of tuples.
shortlist_of_interactions = [
    ('duration', 'campaign', 'pdays')
]
# c. Define all transformation pipelines
# c. Initialize pipelines and add the baseline model for comparison
transformation_pipelines = {}
transformation_pipelines['Baseline_Original_Features'] = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# d. Create pipelines for all combinations of the shortlisted features
# This will test the features individually (r=1) and in combination (r>1).
for r in range(1, len(shortlist_of_interactions) + 1):
    for subset in itertools.combinations(shortlist_of_interactions, r):
        # Create a descriptive name from the feature names in the subset
        name = ' & '.join(['_x_'.join(i) for i in subset])
        pipeline_name = f'Originals_Plus_{name}'
        
        # Create a pipeline that adds all interactions in the current subset
        pipe = Pipeline([
            ('add_interactions', MultiInteractionFeatureAdder(interactions_to_add=list(subset))),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        transformation_pipelines[pipeline_name] = pipe

# e. Define scoring and cross-validation
scoring_metrics = { 'f1_score': 'f1_weighted', 'accuracy': 'accuracy' }
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

In [23]:
# This DataFrame will hold all results for final comparison
all_results = pd.DataFrame()

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")
    model_results = {}


    # c. Transformation Pipelines Evaluation
    # Iterates through the ColumnTransformer pipelines defined in cell 3
    for tech_name, preprocessor in transformation_pipelines.items():
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_validate(
            full_pipeline, X, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )
        model_results[tech_name] = {
            'Train F1-Score': scores['train_f1_score'].mean(),
            'CV F1-Score': scores['test_f1_score'].mean(),
            'CV Accuracy': scores['test_accuracy'].mean()
        }
        print(f"--- tranformation: {tech_name} ---")
    # d. Consolidate and store results
    temp_df = pd.DataFrame.from_dict(model_results, orient='index')
    temp_df['Model'] = model_name
    all_results = pd.concat([all_results, temp_df])

# e. Final processing for the results table
all_results.reset_index(inplace=True)
all_results.rename(columns={'index': 'Preprocessing Technique'}, inplace=True)
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)


--- Evaluating Model: LightGBM ---


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 589, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 719, in fit_transform
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 589, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 897, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py", line 436, in fit
    X = self._validate_input(X, in_fit=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py", line 361, in _validate_input
    raise new_ve from None
ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'may'


In [None]:
# Reorder columns for a more logical presentation
final_columns_order = [
    'Model',
    'Preprocessing Technique',
    'CV F1-Score',
    'CV Accuracy',
    'Train F1-Score',
    'Generalization'
]
all_results = pd.DataFrame(all_results[final_columns_order])
pd.set_option("display.max_colwidth", None)
all_results.head(15)

In [None]:
LightGBM	Originals_Plus_duration_x_campaign_x_pdays	0.890248	0.899892	0.910073	0.978216
13	LightGBM	Originals_Plus_duration_x_campaign_x_pdays & balance_x_age & duration_x_pdays_x_balance	0.889545	0.899228	