In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from feature_engine.outliers import Winsorizer
from sklearn.model_selection import KFold, cross_validate

In [2]:
# Load the classification datasets.
# The data directory defaults to the original local checkout but can be redirected
# without editing the notebook by setting the BANK_DATA_DIR environment variable.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    "BANK_DATA_DIR",
    r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset",
))

# train.csv is read with pandas' default delimiter; bank-full.csv is ';'-separated.
dft = pd.read_csv(DATA_DIR / "train.csv")
dfo = pd.read_csv(DATA_DIR / "bank-full.csv", delimiter=';')

# Encode the target as 0/1. Series.map leaves any value other than 'no'/'yes' as NaN.
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})

# Disabled experiment, kept for reference: combine both frames and split off the target.
# df = pd.concat([dft, dfo])
# y = dfo['y']
# dfo = dfo.drop('y', axis=1)

In [3]:
# Binary-encode the two-level columns. With drop_first=True, get_dummies drops the
# first level and leaves a single 0/1 indicator column for the remaining one.
# Assumes each of these columns holds exactly two levels (presumably 'no'/'yes');
# a third level would yield two dummy columns and break the assignment.
for flag_col in ('default', 'housing', 'loan'):
    dfo[flag_col] = pd.get_dummies(dfo[flag_col], drop_first=True, dtype=int)

# Collapse 'contact' to two levels before dummy-encoding by folding 'telephone'
# into 'cellular'. Every expected raw level must appear as a key here: Series.map
# turns any value that is missing from the dict into NaN.
# Assumes the raw levels are cellular / telephone / unknown -- confirm with
# dfo['contact'].unique() on the raw file.
contact_levels = {
    'cellular': 'cellular',
    'telephone': 'cellular',
    'unknown': 'unknown',
}
dfo['contact'] = dfo['contact'].map(contact_levels)
# After drop_first the remaining indicator is 1 for 'unknown' and 0 otherwise.
dfo['contact'] = pd.get_dummies(dfo['contact'], drop_first=True, dtype=int)

In [4]:
# Inspect the column dtypes of dfo after the binary encoding step above.
dfo.dtypes

age           int64
job          object
marital      object
education    object
default       int64
balance       int64
housing       int64
loan          int64
contact       int64
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

In [5]:
import pandas as pd
import numpy as np

# This cell assumes a DataFrame 'dfo' exists with 'month', 'day', and 'y' columns.
# It adds calendar-derived features to dfo and then builds the feature matrix X
# and the target vector y used by the modelling cells.

# Convert the remaining text columns to 'category' dtype for native handling by LightGBM.
# 'y' is skipped so the target is never turned into a categorical.
for col in dfo.select_dtypes(include=['object']).columns:
    if col != 'y':
        dfo[col] = dfo[col].astype('category')

# Month abbreviation -> month number.
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
# Map through plain strings so the result is an integer column rather than a
# categorical, and fail loudly if a month label is not in the mapping.
month_num = dfo['month'].astype(str).map(month_map)
if month_num.isna().any():
    unknown_months = sorted(dfo.loc[month_num.isna(), 'month'].astype(str).unique())
    raise ValueError(f"Unrecognised month labels: {unknown_months}")
dfo['month_num'] = month_num.astype('int64')

# Cap 'day' at the last valid day of its month so that impossible dates such as
# April 31st or February 30th can still be turned into a real date.
days_in_month = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}
dfo['day_corrected'] = np.minimum(dfo['day'], dfo['month_num'].map(days_in_month))

# Assemble a datetime column from its components. The data carries no year, so a
# fixed reference year is used; 2023 is not a leap year, matching the 28-day
# February cap above.
dfo['date'] = pd.to_datetime(pd.DataFrame({
    'year': 2023,
    'month': dfo['month_num'],
    'day': dfo['day_corrected'],
}))

# Categorical feature 'month_day_bin': month label plus a bucket of the ORIGINAL
# (uncapped) day. Bins are right-inclusive: (0, 5], (5, 10], ..., (25, 31].
bins = [0, 5, 10, 15, 20, 25, 31]
labels = ['1-5', '6-10', '11-15', '16-20', '21-25', '26-31']
dfo['day_bin'] = pd.cut(dfo['day'], bins=bins, labels=labels, right=True).astype(str)
dfo['month_day_bin'] = dfo['month'].astype(str) + '_' + dfo['day_bin']
dfo['month_day_bin'] = dfo['month_day_bin'].astype('category')

# ISO week number of the reference-year date.
# NOTE: ISO numbering assigns 1 Jan 2023 (a Sunday) to week 52, not week 1.
dfo['week_of_year'] = dfo['date'].dt.isocalendar().week

# Cyclical encoding of the week so the end of the year sits next to its start.
dfo['week_sin'] = np.sin(2 * np.pi * dfo['week_of_year'] / 52)
dfo['week_cos'] = np.cos(2 * np.pi * dfo['week_of_year'] / 52)

# Feature matrix X: the date-related features first, then every other column.
# The target and the intermediate helper columns are left out.
feature_cols = [
    'month', 'day', 'month_day_bin', 'week_of_year', 'week_sin', 'week_cos'
]
excluded_cols = {'y', 'month_num', 'date', 'day_bin', 'day_corrected', *feature_cols}
other_cols = [col for col in dfo.columns if col not in excluded_cols]
X = dfo[feature_cols + other_cols]
y = dfo['y']

In [6]:
# Check the dtypes of the feature matrix X before it is passed to the modelling cells.
X.dtypes

month            category
day                 int64
month_day_bin    category
week_of_year       UInt32
week_sin          Float64
week_cos          Float64
age                 int64
job              category
marital          category
education        category
default             int64
balance             int64
housing             int64
loan                int64
contact             int64
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome         category
dtype: object

In [7]:
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, TargetEncoder
from sklearn.model_selection import KFold

# a. Specify the model to evaluate
models = {
    "LightGBM": LGBMClassifier(random_state=42)
}


# b. Define the transformation pipelines to test
def _make_preprocessor(step_name, transformer, columns, dropped):
    """Build one preprocessing variant as a ColumnTransformer.

    Parameters
    ----------
    step_name : str
        Name of the encoding step inside the ColumnTransformer.
    transformer : estimator
        Unfitted transformer applied to ``columns``.
    columns : list of str
        Columns handed to ``transformer``.
    dropped : list of str
        Columns removed from the output entirely.

    Returns
    -------
    ColumnTransformer
        Every column not listed is passed through unchanged.
        verbose_feature_names_out=False keeps the original column names where
        possible, and pandas output is enabled so passed-through columns keep
        their pandas dtypes (e.g. 'category') for LightGBM.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            (step_name, transformer, columns),
            ('drop_features', 'drop', dropped),
        ],
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    preprocessor.set_output(transform="pandas")
    return preprocessor


# Raw date columns that the "*_without_originals" variants additionally drop.
_original_date_cols = ['month', 'day']

# label -> (step name, transformer factory, encoded columns, alternative encodings to drop).
# A factory (rather than an instance) gives every variant its own transformer object.
_encoding_specs = {
    'Ordinal': (
        'ordinal',
        lambda: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ['month_day_bin'],
        ['week_of_year', 'week_sin', 'week_cos'],
    ),
    'TargetEnc': (
        'target',
        lambda: TargetEncoder(target_type='binary'),
        ['week_of_year'],
        ['month_day_bin', 'week_sin', 'week_cos'],
    ),
    'Cyclical': (
        'scaler',
        lambda: StandardScaler(),
        ['week_sin', 'week_cos'],
        ['month_day_bin', 'week_of_year'],
    ),
}

# Each encoding is evaluated twice: with and without the raw 'month'/'day' columns.
transformation_pipelines = {}
for _label, (_step, _factory, _encoded, _unused) in _encoding_specs.items():
    transformation_pipelines[f'{_label}_with_originals'] = _make_preprocessor(
        _step, _factory(), _encoded, _unused
    )
    transformation_pipelines[f'{_label}_without_originals'] = _make_preprocessor(
        _step, _factory(), _encoded, _original_date_cols + _unused
    )


# c. Define the classification scoring metrics
scoring_metrics = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_weighted',
}

# d. Define the cross-validation strategy
# NOTE(review): plain KFold does not stratify on the target, and the training log
# reports roughly 11.6% positives. StratifiedKFold may give more even folds; it is
# left unchanged here so previously recorded scores stay comparable.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, TargetEncoder
from sklearn.model_selection import KFold

# Sanity-check the experiment configuration before the slow cross-validation run.
# `models`, `transformation_pipelines`, `scoring_metrics` and `cv_strategy` are
# defined once, in the preceding configuration cell; they are deliberately not
# redefined here. This cell only verifies them against X, so a typo in a column
# list fails immediately instead of part-way through cross-validation.
available_cols = set(X.columns)
for tech_name, preprocessor in transformation_pipelines.items():
    # Each entry of .transformers is a (name, transformer, columns) triple.
    referenced_cols = {
        col
        for _, _, cols in preprocessor.transformers
        for col in cols
    }
    missing_cols = sorted(referenced_cols - available_cols)
    assert not missing_cols, f"{tech_name}: columns not found in X: {missing_cols}"

assert models, "no models configured"
assert scoring_metrics, "no scoring metrics configured"
assert cv_strategy.get_n_splits() <= len(X), "more CV folds than rows in X"

In [9]:
# Cross-validate every (model, preprocessing technique) pair and collect the mean
# scores into one comparison table, `all_results`.

# One dict per evaluated pair; the DataFrame is built once after the loop rather
# than being grown with pd.concat on every iteration.
result_columns = [
    'Preprocessing Technique', 'Train F1-Score', 'CV F1-Score', 'CV Accuracy', 'Model'
]
result_rows = []

# --- Main Loop ---
for model_name, model in models.items():
    print(f"--- Evaluating Model: {model_name} ---")

    for tech_name, preprocessor in transformation_pipelines.items():
        # Preprocessing lives inside the pipeline so it is re-fitted on each
        # training fold by cross_validate.
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        scores = cross_validate(
            full_pipeline, X, y, cv=cv_strategy,
            scoring=scoring_metrics, return_train_score=True
        )

        # Keep only the fold-averaged scores for this technique.
        result_rows.append({
            'Preprocessing Technique': tech_name,
            'Train F1-Score': scores['train_f1_score'].mean(),
            'CV F1-Score': scores['test_f1_score'].mean(),
            'CV Accuracy': scores['test_accuracy'].mean(),
            'Model': model_name,
        })
        print(f"--- Evaluated transformation: {tech_name} ---")

# Passing `columns` keeps the table well-formed even if nothing was evaluated.
all_results = pd.DataFrame(result_rows, columns=result_columns)

# Ratio of CV to train F1: values well below 1 indicate overfitting.
all_results['Generalization'] = all_results['CV F1-Score'] / all_results['Train F1-Score']
# Best cross-validated F1 first; the index keeps each row's evaluation order.
all_results = all_results.sort_values(by='CV F1-Score', ascending=False)

# Display the final comparative results
print("\n--- Final Results ---")
display(all_results)

--- Evaluating Model: LightGBM ---
[LightGBM] [Info] Number of positive: 4198, number of negative: 31970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 36168, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116069 -> initscore=-2.030190
[LightGBM] [Info] Start training from score -2.030190
[LightGBM] [Info] Number of positive: 4279, number of negative: 31890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 36169, number of use

Unnamed: 0,Preprocessing Technique,Train F1-Score,CV F1-Score,CV Accuracy,Model,Generalization
2,TargetEnc_with_originals,0.929322,0.905231,0.909668,LightGBM,0.974076
4,Cyclical_with_originals,0.93155,0.904998,0.909248,LightGBM,0.971497
0,Ordinal_with_originals,0.930589,0.904383,0.908739,LightGBM,0.971839
5,Cyclical_without_originals,0.928966,0.903745,0.908673,LightGBM,0.972851
1,Ordinal_without_originals,0.927519,0.902551,0.90812,LightGBM,0.973081
3,TargetEnc_without_originals,0.922782,0.902387,0.907987,LightGBM,0.977898


In [10]:
# Presentation order: identifying columns first, then the cross-validated
# metrics, then the train-side diagnostics.
final_columns_order = [
    'Model', 'Preprocessing Technique',
    'CV F1-Score', 'CV Accuracy',
    'Train F1-Score', 'Generalization',
]
# Label-based column selection; raises KeyError if an expected column is missing.
all_results = all_results.loc[:, final_columns_order]

all_results

Unnamed: 0,Model,Preprocessing Technique,CV F1-Score,CV Accuracy,Train F1-Score,Generalization
2,LightGBM,TargetEnc_with_originals,0.905231,0.909668,0.929322,0.974076
4,LightGBM,Cyclical_with_originals,0.904998,0.909248,0.93155,0.971497
0,LightGBM,Ordinal_with_originals,0.904383,0.908739,0.930589,0.971839
5,LightGBM,Cyclical_without_originals,0.903745,0.908673,0.928966,0.972851
1,LightGBM,Ordinal_without_originals,0.902551,0.90812,0.927519,0.973081
3,LightGBM,TargetEnc_without_originals,0.902387,0.907987,0.922782,0.977898
