In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc

In [None]:
!pip install category_encoders scikit-learn plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [None]:
# Read the two input tables; both paths are relative to the working directory.
# NOTE(review): 'ml_final.csv' is presumably the feature-engineered variant of
# the raw marketing data - confirm against whatever produced that file.
df_original = pd.read_csv(filepath_or_buffer='European_bank_marketing.csv')
df_featured = pd.read_csv(filepath_or_buffer='ml_final.csv')

In [None]:
def apply_categorical_encodings(df, threshold, target_col='term_deposit'):
  """Return a copy of ``df`` with its categorical feature columns encoded.

  A column is treated as categorical when its dtype is object, pandas
  ``CategoricalDtype`` or pandas ``StringDtype``. Categorical columns with at
  most ``threshold`` distinct values are target-encoded against
  ``target_col``; those with more distinct values are replaced by dummy
  columns (first level dropped).

  Args:
    df: Input DataFrame. It is not modified; the encoding is applied to a copy.
    threshold: Largest number of distinct values for which a column is
      target-encoded rather than expanded into dummy columns.
    target_col: Name of the label column. It is used as the target for the
      target encoder and is never encoded itself.

  Returns:
    A new DataFrame holding the encoded features plus the untouched
    ``target_col`` column (when it was present in ``df``).

  Raises:
    KeyError: If a column needs target encoding and ``target_col`` is not a
      column of ``df``.

  Note:
    The target encoder is fitted on every row passed in. If the result is
    split into train/validation sets afterwards, the validation labels have
    already influenced the encoded feature values.
  """
  # Work on a copy so the caller's frame keeps its raw categorical values.
  encoded = df.copy()
  for col in df.columns:
    # The label must stay as-is: it is the encoder target, not a feature.
    if col == target_col:
      continue
    dtype = encoded[col].dtype
    is_categorical = (
        isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
        or pd.api.types.is_object_dtype(dtype)
    )
    if not is_categorical:
      continue
    if encoded[col].nunique() <= threshold:
      target_encoder = TargetEncoder()
      encoded[col] = target_encoder.fit_transform(encoded[col], encoded[target_col])
    else:
      # get_dummies returns a new frame in which `col` is replaced by its
      # indicator columns, so rebind the working frame.
      encoded = pd.get_dummies(encoded, columns=[col], prefix=[col], drop_first=True)
  return encoded

In [None]:
# Encode the categorical columns of both tables with the same cardinality
# threshold. The results still contain the 'term_deposit' column.
X_encoded = apply_categorical_encodings(df_original, threshold=3)
X_encoded_featured = apply_categorical_encodings(df_featured, threshold=3)

In [None]:
def train_rf_model(df, target_var, test_size=0.3, random_state=42, params=None):
    """Fit a random forest classifier on a stratified training split of ``df``.

    Args:
        df: DataFrame containing the feature columns and the ``target_var`` column.
        target_var: Name of the label column; every other column is a feature.
        test_size: Share of rows held out for validation (passed to
            ``train_test_split``).
        random_state: Seed for the train/validation split and, when ``params``
            is None, for the forest as well.
        params: Optional dict of ``RandomForestClassifier`` keyword arguments.
            When given it replaces the defaults below entirely, so include
            ``'random_state'`` in it if a reproducible forest is needed.

    Returns:
        Tuple ``(model, X_val, y_val)``: the fitted classifier and the
        held-out validation features and labels.
    """
    # Split data into X and y
    X = df.drop(target_var, axis=1)
    y = df[target_var]

    # Train/test split (stratified so both parts keep the class ratio of y)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Set default hyperparameters for the random forest model
    if params is None:
        params = {
            'n_estimators': 100,
            'max_depth': None,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            # 'auto' meant sqrt(n_features) for classifiers; it was deprecated
            # in scikit-learn 1.1 and removed in 1.3, so spell it out.
            'max_features': 'sqrt',
            'bootstrap': True,
            'random_state': random_state
        }

    # Train the random forest model
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    return model, X_val, y_val


In [None]:
# Fit one forest per encoded table. Each call also returns the held-out
# validation features and labels that belong to that model.
model, X_val, y_val = train_rf_model(df=X_encoded, target_var='term_deposit')
model_f, X_val_f, y_val_f = train_rf_model(df=X_encoded_featured, target_var='term_deposit')

  warn(
  warn(


In [None]:
def evaluate_threshold(model, X, y_true, threshold):
    """Score a binary classifier at a given probability cut-off.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the positive-class probability.
        X: Feature matrix to score.
        y_true: True labels for ``X``, encoded as 0 / 1.
        threshold: A sample is predicted positive when its positive-class
            probability is ``>= threshold``.

    Returns:
        Tuple ``(tn, fp, fn, tp, tpr, fpr, f1)``: the four confusion-matrix
        counts, the true-positive rate, the false-positive rate and the F1
        score. A rate is NaN when its denominator is zero (no positives, or
        no negatives, in ``y_true``).
    """
    y_pred_proba = model.predict_proba(X)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)
    # Fixing the label set guarantees a 2x2 matrix even when one class is
    # missing from both y_true and y_pred; otherwise the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    positives = tp + fn
    negatives = fp + tn
    # Undefined rates are reported as NaN instead of triggering a 0/0 warning.
    tpr = tp / positives if positives else float('nan')
    fpr = fp / negatives if negatives else float('nan')
    f1 = f1_score(y_true, y_pred)
    return tn, fp, fn, tp, tpr, fpr, f1

# Probability cut-offs at which the two models are compared.
thresholds = [0.1, 0.2, 0.35, 0.5]

# For every cut-off, print the validation confusion matrix, TPR, FPR and F1 of
# both models, followed by a separator line.
for threshold in thresholds:
    # Model 1: the forest trained on X_encoded.
    tn, fp, fn, tp, tpr, fpr, f1 = evaluate_threshold(model, X_val, y_val, threshold)
    print(f"Threshold Model 1: {threshold:.2f} | Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp} | TPR={tpr:.2f} | FPR={fpr:.2f} | F1 Score={f1:.2f}")
    
    # Model 2: the forest trained on X_encoded_featured.
    tn_f, fp_f, fn_f, tp_f, tpr_f, fpr_f, f1_f = evaluate_threshold(model_f, X_val_f, y_val_f, threshold)
    print(f"Threshold Model 2: {threshold:.2f} | Confusion Matrix: TN={tn_f}, FP={fp_f}, FN={fn_f}, TP={tp_f} | TPR={tpr_f:.2f} | FPR={fpr_f:.2f} | F1 Score={f1_f:.2f}")
    print("------------------------------------------------------------------------------------------")


Threshold Model 1: 0.10 | Confusion Matrix: TN=9078, FP=1887, FN=79, TP=1313 | TPR=0.94 | FPR=0.17 | F1 Score=0.57
Threshold Model 2: 0.10 | Confusion Matrix: TN=9063, FP=1902, FN=93, TP=1299 | TPR=0.93 | FPR=0.17 | F1 Score=0.57
------------------------------------------------------------------------------------------
Threshold Model 1: 0.20 | Confusion Matrix: TN=9686, FP=1279, FN=164, TP=1228 | TPR=0.88 | FPR=0.12 | F1 Score=0.63
Threshold Model 2: 0.20 | Confusion Matrix: TN=9655, FP=1310, FN=182, TP=1210 | TPR=0.87 | FPR=0.12 | F1 Score=0.62
------------------------------------------------------------------------------------------
Threshold Model 1: 0.35 | Confusion Matrix: TN=10221, FP=744, FN=363, TP=1029 | TPR=0.74 | FPR=0.07 | F1 Score=0.65
Threshold Model 2: 0.35 | Confusion Matrix: TN=10201, FP=764, FN=374, TP=1018 | TPR=0.73 | FPR=0.07 | F1 Score=0.64
------------------------------------------------------------------------------------------
Threshold Model 1: 0.50 | Confusi

In [None]:
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import auc, roc_curve, f1_score

# Get the predicted probabilities of the positive class
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred_proba_f = model_f.predict_proba(X_val_f)[:, 1]

# Compute the false positive rate (FPR), true positive rate (TPR), and thresholds for the ROC curve.
# The ROC thresholds get their own names so that the cut-off list called
# `thresholds` is not overwritten and neither model's thresholds are lost.
fpr, tpr, roc_thresholds = roc_curve(y_val, y_pred_proba, pos_label=1)
fpr_f, tpr_f, roc_thresholds_f = roc_curve(y_val_f, y_pred_proba_f, pos_label=1)

# Compute the area under the curve (AUC) of the ROC curve
roc_auc = auc(fpr, tpr)
roc_auc_f = auc(fpr_f, tpr_f)

# Compute the F1 score at the 0.5 cut-off.
# An explicit `>= 0.5` is used instead of np.round: np.round rounds halves to
# the nearest even value, so a probability of exactly 0.5 would become class 0
# and the score would disagree with evaluate_threshold(..., 0.5).
y_pred = (y_pred_proba >= 0.5).astype(int)
y_pred_f = (y_pred_proba_f >= 0.5).astype(int)
f1 = f1_score(y_val, y_pred)
f1_f = f1_score(y_val_f, y_pred_f)

# Plot the ROC curve of both models against the diagonal of a random classifier
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve Model 1 (AUC = %f)' % roc_auc))
fig.add_trace(go.Scatter(x=fpr_f, y=tpr_f, mode='lines', name='ROC curve Model 2 (AUC = %f)' % roc_auc_f))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier'))
fig.update_layout(title='Receiver Operating Characteristic (ROC) Curve',
                  xaxis_title='False Positive Rate (FPR)',
                  yaxis_title='True Positive Rate (TPR)',
                  legend=dict(x=0.1, y=0.9))
# Show both F1 scores in a boxed annotation inside the plot area
fig.add_annotation(
    x=0.5, y=0.1,
    text='F1 Score Model 1: {:.4f}<br>F1 Score Model 2: {:.4f}'.format(f1, f1_f),
    showarrow=False,
    font=dict(size=12),
    bgcolor='white',
    bordercolor='black',
    borderwidth=1,
    opacity=0.8
)
fig.show()


In [None]:
# Importance score of every input feature, as reported by each fitted forest
importances = model.feature_importances_
importances_f = model_f.feature_importances_

# Feature labels: all columns of each encoded table except the target
feature_names = X_encoded.drop(columns='term_deposit').columns
feature_names_f = X_encoded_featured.drop(columns='term_deposit').columns

# Pair every feature label with its score
feature_importances = {name: score for name, score in zip(feature_names, importances)}
feature_importances_f = {name: score for name, score in zip(feature_names_f, importances_f)}

# Rank the (label, score) pairs from most to least important
sorted_features = sorted(feature_importances.items(), key=lambda pair: pair[1], reverse=True)
sorted_features_f = sorted(feature_importances_f.items(), key=lambda pair: pair[1], reverse=True)

# One horizontal bar trace per model, both drawn in the same figure
fig = go.Figure(data=[
    go.Bar(
        x=[score for _, score in sorted_features],
        y=[name for name, _ in sorted_features],
        orientation='h',
        name='Model 1',
    ),
    go.Bar(
        x=[score for _, score in sorted_features_f],
        y=[name for name, _ in sorted_features_f],
        orientation='h',
        name='Model 2',
    ),
])
fig.update_layout(
    title='Feature Importances',
    xaxis_title='Importance',
    yaxis_title='Feature',
    height=500,
)
fig.show()
