In [None]:
  !pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, f1_score
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

In [None]:
# Load the two CSV inputs used throughout the notebook. Both paths are absolute and point
# at /content, so the files must already be present there before this cell runs.
# NOTE(review): judging by the names, ml_final.csv is presumably a feature-engineered
# variant of the same bank-marketing data — confirm against the notebook that produced it.
df_original = pd.read_csv('/content/European_bank_marketing.csv')
df_featured = pd.read_csv('/content/ml_final.csv')

In [None]:
# Show the rows whose Ethnicity_African flag equals 0
df_original.loc[df_original['Ethnicity_African'] == 0]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,term_deposit,Ethnicity_African
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1,0
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0,0
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0,0
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1,0


Let's create a function that, for any dataset, tells categorical columns apart from numerical ones, measures the cardinality of each categorical feature, and applies an encoding accordingly: one-hot encoding for low-cardinality features and target encoding for the rest.

In [None]:
def apply_categorical_encodings(df, threshold, target_col='term_deposit'):
  """Return a copy of ``df`` with every categorical/object column encoded.

  Columns with at most ``threshold`` distinct values are one-hot encoded: the
  original column is replaced by one ``{col}_{value}`` column per category,
  appended at the end of the frame. Columns with more distinct values are
  target encoded in place against ``target_col``.

  Args:
    df: Input frame. It is not modified.
    threshold: Largest number of distinct values for which a column is
      one-hot encoded rather than target encoded.
    target_col: Name of the target column used to fit the target encoder.
      The target column itself is never encoded.

  Returns:
    A new DataFrame holding the untouched non-categorical columns, the
    target-encoded columns and the one-hot columns.

  Raises:
    KeyError: If a column needs target encoding and ``target_col`` is missing.
  """
  # Work on a copy so the caller's frame is never altered.
  encoded = df.copy()
  for col in df.columns:
    if col == target_col:
      # Encoding the target would destroy the labels the target encoder needs.
      continue
    dtype = df[col].dtype
    # isinstance(..., CategoricalDtype) replaces the deprecated is_categorical_dtype.
    if not (isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype)):
      continue
    if df[col].nunique() <= threshold:
      one_hot_encoder = OneHotEncoder(sparse_output=False)
      dummies = pd.DataFrame(
          one_hot_encoder.fit_transform(df[[col]]),
          columns=[f'{col}_{val}' for val in one_hot_encoder.categories_[0]],
          # Reuse the source index; a fresh RangeIndex would misalign in concat
          # whenever df does not carry the default 0..n-1 index.
          index=df.index,
      )
      # Only the one-hot branch removes the raw column.
      encoded = pd.concat([encoded.drop(columns=col), dummies], axis=1)
    else:
      # NOTE: the encoder is fitted on every row passed in; fitting it before a
      # train/validation split lets validation labels leak into this feature.
      target_encoder = TargetEncoder()
      # The encoded values overwrite the raw column and must be kept.
      encoded[col] = target_encoder.fit_transform(df[[col]], df[target_col])[col]
  return encoded


In [None]:
# Encode copies so df_original / df_featured stay exactly as loaded and this cell
# can be re-run without depending on what an earlier run left behind.
X_encoded = apply_categorical_encodings(df_original.copy(), threshold=3)
X_encoded_featured = apply_categorical_encodings(df_featured.copy(), threshold=3)

Okay now let's move to the modelling step!!!

In [None]:
def train_lgbm_model(df, target_var, test_size=0.3, random_state=42, num_rounds=1000, early_stopping_rounds=50, params=None):
    """Split ``df`` into train/validation parts and fit a LightGBM booster.

    Args:
        df: Fully numeric frame containing the features and the target column.
        target_var: Name of the target column in ``df``.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for the stratified train/validation split.
        num_rounds: Maximum number of boosting rounds.
        early_stopping_rounds: Stop when the validation metric has not improved
            for this many rounds; a falsy value disables early stopping.
        params: LightGBM parameter dict. When ``None``, a binary-classification
            configuration evaluated with ``binary_logloss`` is used.

    Returns:
        Tuple ``(model, X_train, X_val, y_train, y_val)``.
    """
    # Split data into X and y
    X = df.drop(columns=target_var)
    y = df[target_var]

    # Stratified split keeps the class ratio identical in both parts
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Create the LightGBM datasets; `reference` makes the validation set reuse
    # the feature binning computed on the training set.
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Set default hyperparameters for the LightGBM model
    if params is None:
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.9
        }

    # Logging and early stopping are configured through callbacks: the
    # `early_stopping_rounds=` keyword of lgb.train no longer exists in LightGBM 4.x.
    callbacks = [lgb.log_evaluation(period=1)]
    if early_stopping_rounds:
        callbacks.append(lgb.early_stopping(stopping_rounds=early_stopping_rounds))

    # Train the LightGBM model
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_rounds,
        valid_sets=[train_data, val_data],
        callbacks=callbacks,
    )

    return model, X_train, X_val, y_train, y_val


In [None]:
model,X_train, X_val, y_train, y_val = train_lgbm_model(X_encoded, 'term_deposit')
model_f,X_train_f, X_val_f, y_train_f, y_val_f = train_lgbm_model(X_encoded_featured, 'term_deposit')




[LightGBM] [Info] Number of positive: 3248, number of negative: 25583
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 699
[LightGBM] [Info] Number of data points in the train set: 28831, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112657 -> initscore=-2.063889
[LightGBM] [Info] Start training from score -2.063889
[1]	training's binary_logloss: 0.331082	valid_1's binary_logloss: 0.331192
Training until validation scores don't improve for 50 rounds
[2]	training's binary_logloss: 0.314802	valid_1's binary_logloss: 0.315009
[3]	training's binary_logloss: 0.301332	valid_1's binary_logloss: 0.301703
[4]	training's binary_logloss: 0.289955	valid_1's binary_logloss: 0.290442
[5]	training's binary_logloss: 0.279959	valid_1's binary_logloss: 0.280643
[6]	training's binary_logloss: 0.27067	valid_1's binary_logloss: 0.271713
[7]	training's binary_logloss: 0.26249



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3924
[LightGBM] [Info] Number of data points in the train set: 28831, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112657 -> initscore=-2.063889
[LightGBM] [Info] Start training from score -2.063889
[1]	training's binary_logloss: 0.330641	valid_1's binary_logloss: 0.331109
Training until validation scores don't improve for 50 rounds
[2]	training's binary_logloss: 0.313814	valid_1's binary_logloss: 0.314834
[3]	training's binary_logloss: 0.300265	valid_1's binary_logloss: 0.301514
[4]	training's binary_logloss: 0.288474	valid_1's binary_logloss: 0.290022
[5]	training's binary_logloss: 0.278174	valid_1's binary_logloss: 0.280061
[6]	training's binary_logloss: 0.269175	valid_1's binary_logloss: 0.271341
[7]	training's binary_logloss: 0.261158	valid_1's binary_logloss: 0.263563
[8]	training's binary_logloss: 

Choose the probability thresholds of 10%, 20%, 35%, and 50% and compute the confusion matrix for each of them

In [None]:
def evaluate_threshold(model, X, y_true, threshold):
    """Binarise the model's scores at ``threshold`` and summarise the errors.

    Args:
        model: Object whose ``predict(X)`` returns one score per row.
        X: Feature matrix passed to ``model.predict``.
        y_true: True binary labels (0/1) for the rows of ``X``.
        threshold: Scores greater than or equal to this value are predicted as 1.

    Returns:
        Tuple ``(tn, fp, fn, tp, tpr, fpr)``. ``tpr`` (resp. ``fpr``) is NaN
        when ``y_true`` contains no positive (resp. negative) sample.
    """
    y_pred_proba = model.predict(X)
    y_pred = (y_pred_proba >= threshold).astype(int)
    # labels=[0, 1] forces a 2x2 matrix; without it the matrix shrinks to 1x1 when
    # only one class appears in y_true and y_pred, and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    positives = tp + fn
    negatives = fp + tn
    # A rate over an empty class is undefined; report NaN instead of dividing by zero.
    tpr = tp / positives if positives else float('nan')
    fpr = fp / negatives if negatives else float('nan')
    return tn, fp, fn, tp, tpr, fpr

thresholds = [0.1, 0.2, 0.35, 0.5]

# The validation scores do not depend on the threshold, so compute them once per model
# instead of re-predicting inside every loop iteration.
val_proba = model.predict(X_val)
val_proba_f = model_f.predict(X_val_f)

for threshold in thresholds:
    tn, fp, fn, tp, tpr, fpr = evaluate_threshold(model, X_val, y_val, threshold)
    f1 = f1_score(y_val, (val_proba >= threshold).astype(int))
    print(f"Threshold Model 1: {threshold:.2f} | Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp} | TPR={tpr:.2f} | FPR={fpr:.2f} | F1 Score={f1:.2f}")

    tn_f, fp_f, fn_f, tp_f, tpr_f, fpr_f = evaluate_threshold(model_f, X_val_f, y_val_f, threshold)
    f1_f = f1_score(y_val_f, (val_proba_f >= threshold).astype(int))
    print(f"Threshold Model 2: {threshold:.2f} | Confusion Matrix: TN={tn_f}, FP={fp_f}, FN={fn_f}, TP={tp_f} | TPR={tpr_f:.2f} | FPR={fpr_f:.2f} | F1 Score={f1_f:.2f}")
    print("------------------------------------------------------------------------------------------")


Threshold Model 1: 0.10 | Confusion Matrix: TN=9217, FP=1748, FN=81, TP=1311 | TPR=0.94 | FPR=0.16 | F1 Score=0.59
Threshold Model 2: 0.10 | Confusion Matrix: TN=9231, FP=1734, FN=76, TP=1316 | TPR=0.95 | FPR=0.16 | F1 Score=0.59
------------------------------------------------------------------------------------------
Threshold Model 1: 0.20 | Confusion Matrix: TN=9725, FP=1240, FN=151, TP=1241 | TPR=0.89 | FPR=0.11 | F1 Score=0.64
Threshold Model 2: 0.20 | Confusion Matrix: TN=9720, FP=1245, FN=166, TP=1226 | TPR=0.88 | FPR=0.11 | F1 Score=0.63
------------------------------------------------------------------------------------------
Threshold Model 1: 0.35 | Confusion Matrix: TN=10251, FP=714, FN=349, TP=1043 | TPR=0.75 | FPR=0.07 | F1 Score=0.66
Threshold Model 2: 0.35 | Confusion Matrix: TN=10219, FP=746, FN=333, TP=1059 | TPR=0.76 | FPR=0.07 | F1 Score=0.66
------------------------------------------------------------------------------------------
Threshold Model 1: 0.50 | Confusi

Plot the ROC curve and report the AUC for both models

In [None]:
# Get the predicted probabilities of the positive class
y_pred_proba = model.predict(X_val)
y_pred_proba_f = model_f.predict(X_val_f)

# Compute the false positive rate (FPR) and true positive rate (TPR) along the ROC curve.
# The cut-offs returned by roc_curve get their own names so they neither overwrite each
# other nor replace the `thresholds` list used by the threshold-evaluation cell.
fpr, tpr, roc_thresholds = roc_curve(y_val, y_pred_proba, pos_label=1)
fpr_f, tpr_f, roc_thresholds_f = roc_curve(y_val_f, y_pred_proba_f, pos_label=1)

# Compute the area under the curve (AUC) of the ROC curve
roc_auc = auc(fpr, tpr)
roc_auc_f = auc(fpr_f, tpr_f)

# Plot both ROC curves against the diagonal of a random classifier
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve Model 1(AUC = %f)' % roc_auc))
fig.add_trace(go.Scatter(x=fpr_f, y=tpr_f, mode='lines', name='ROC curve Model 2(AUC = %f)' % roc_auc_f))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier'))
fig.update_layout(title='Receiver Operating Characteristic (ROC) Curve',
                  xaxis_title='False Positive Rate (FPR)',
                  yaxis_title='True Positive Rate (TPR)')
fig.show()


The ROC curve shows the trade-off between the true positive rate (TPR) and the false positive rate (FPR) for different probability thresholds. The TPR is the proportion of actual positive samples that are correctly identified as positive, while the FPR is the proportion of actual negative samples that are incorrectly identified as positive.

The AUC (Area Under the Curve) is a summary statistic that represents the overall performance of the model across all possible probability thresholds. The AUC ranges from 0 to 1, with higher values indicating better performance. An AUC of 0.5 indicates a model that performs no better than a random classifier, while an AUC of 1.0 indicates a model that perfectly separates the positive and negative samples.

We can use the ROC curve and AUC to evaluate the performance of the model and choose an appropriate probability threshold based on the trade-off between TPR and FPR for our particular application.

In [None]:
# Importance scores reported by each booster
importances = model.feature_importance()
importances_f = model_f.feature_importance()

# Feature labels: every column of the encoded frames except the target
feature_names = X_encoded.columns.drop('term_deposit')
feature_names_f = X_encoded_featured.columns.drop('term_deposit')

# Map each feature label to its importance score
feature_importances = {label: score for label, score in zip(feature_names, importances)}
feature_importances_f = {label: score for label, score in zip(feature_names_f, importances_f)}

# Rank the (label, score) pairs from most to least important
sorted_features = sorted(feature_importances.items(), key=lambda pair: pair[1], reverse=True)
sorted_features_f = sorted(feature_importances_f.items(), key=lambda pair: pair[1], reverse=True)

# Draw one horizontal bar trace per model
fig = go.Figure()
for trace_name, ranked in (('Model 1', sorted_features), ('Model 2', sorted_features_f)):
    fig.add_trace(go.Bar(
        x=[score for label, score in ranked],
        y=[label for label, score in ranked],
        orientation='h',
        name=trace_name
    ))
fig.update_layout(
    title='Feature Importances',
    xaxis_title='Importance',
    yaxis_title='Feature',
    height=500,
)
fig.show()
