In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate


In [2]:
dft = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\train.csv")
dfo = pd.read_csv(r"C:\Users\basde\Documents\GitHub\Code-and-examples\Projects\Binary Classification with a Bank Dataset\bank-full.csv", delimiter=';')
dfo['y'] = dfo['y'].map({'no' : 0, 'yes': 1})
df = pd.concat([dft, dfo])
y = df['y']
df = df.drop('y', axis=1)

In [3]:
df['default'] = pd.get_dummies(df['default'], drop_first=True, dtype=int)
df['housing'] = pd.get_dummies(df['housing'], drop_first=True, dtype=int)
df['loan'] = pd.get_dummies(df['loan'], drop_first=True, dtype=int)
df['contact'] = df['contact'].map({'telephone': 'cellular', 'unknown' : 'unknown', 'telephone': 'telephone'})
df['contact'] = pd.get_dummies(df['contact'], drop_first=True, dtype=int)

In [4]:
df = df.select_dtypes(include=['number'])

In [None]:
# 2. **Setup**

# a. Define columns to process
columns_to_process = ['duration']

# b. Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# c. Define the evaluation model
model = LogisticRegression(random_state=42, max_iter=1000)

# d. Define a dictionary of techniques
# First, create a custom transformer for Winsorization
class Winsorizer(BaseEstimator, TransformerMixin):
    """
    Custom transformer to apply Winsorization to a column.
    It caps and floors the data at specified quantiles.
    """
    def __init__(self, limits=(0, 0.05)):
        self.limits = limits

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # The input X from a ColumnTransformer can be a DataFrame.
        # We convert it to a 1D array for winsorize, then reshape back.
        x_transformed = winsorize(X[:, 0], limits=self.limits)
        return x_transformed.reshape(-1, 1)

# Define the dictionary of techniques to test
techniques = {
    'Log(1+x) Transformation': FunctionTransformer(np.log1p),
    'Square Root Transformation': FunctionTransformer(np.sqrt),
    'Winsorization (5%-95%)': Winsorizer(limits=(0.05, 0.05))
}

# 3. **Execution and Evaluation**

# a. Create an empty dictionary to store the results
results = {}

# Identify numeric columns for imputation purposes
numeric_features = df.select_dtypes(include=np.number).columns.tolist()


# b. Baseline Evaluation
print("Running Baseline Evaluation...")
# Create a pipeline that first imputes missing values (with the median)
# and then fits the logistic regression model.
baseline_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', model)
])

# Perform cross-validation, returning train scores
baseline_scores_dict = cross_validate(
    baseline_pipeline, df, y, cv=cv, scoring='accuracy', return_train_score=True
)

# Store the mean and standard deviation of the scores
results['Baseline'] = {
    'Mean Train Score': baseline_scores_dict['train_score'].mean(),
    'Mean CV Score': baseline_scores_dict['test_score'].mean(),
    'Std CV Score': baseline_scores_dict['test_score'].std()
}



    
# c. Technique Evaluation Loop
print("Running Technique Evaluations...")
# Find the numerical index of the column to be transformed
fare_index = [i for i, col in enumerate(numeric_features) if col in columns_to_process][0]

for name, transformer in techniques.items():
    print(f"  - Evaluating: {name}")
    
    # Define a preprocessor pipeline with sequential, non-nested steps
    preprocessor = Pipeline(steps=[
        ('imputer', ColumnTransformer(
            transformers=[('numeric_imputer', SimpleImputer(strategy='median'), numeric_features)],
            remainder='passthrough'
        )),
        ('transform', ColumnTransformer(
            transformers=[('apply_technique', transformer, [fare_index])],
            remainder='passthrough'
        ))
    ])
    
    # Create the main pipeline
    main_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Perform cross-validation, returning train scores
    scores_dict = cross_validate(
        main_pipeline, df, y, cv=cv, scoring='accuracy', return_train_score=True
    )
    
    # Store the results
    results[name] = {
        'Mean Train Score': scores_dict['train_score'].mean(),
        'Mean CV Score': scores_dict['test_score'].mean(),
        'Std CV Score': scores_dict['test_score'].std()
    }


# d. Feature Dropped Evaluation
print("Running Feature Dropped Evaluation...")
# Create a new DataFrame without the 'Fare' column
df_dropped = df.drop(columns=columns_to_process)

dropped_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', model)
])

# Perform cross-validation, returning train scores
dropped_scores_dict = cross_validate(
    dropped_pipeline, df_dropped, y, cv=cv, scoring='accuracy', return_train_score=True
)

# Store the results
results['Feature Dropped'] = {
    'Mean Train Score': dropped_scores_dict['train_score'].mean(),
    'Mean CV Score': dropped_scores_dict['test_score'].mean(),
    'Std CV Score': dropped_scores_dict['test_score'].std()
}
# 4. Conclusion
print("\n--- Evaluation Results ---")
# Convert the results dictionary to a pandas DataFrame
results_df = pd.DataFrame.from_dict(results, orient='index')

# Define column order for clarity and select them
column_order = ['Mean Train Score', 'Mean CV Score', 'Std CV Score']
results_df = results_df[column_order]

# Sort the results by the mean cross-validation score in descending order
results_df = results_df.sort_values(by='Mean CV Score', ascending=False)

# Print the final comparison table
print(results_df.to_string())


Running Baseline Evaluation...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
