# Bank Churn Prediction

Matthew Lucas  
Unit 4 Incremental Capstone  
Class 2509 TA


## Imports


In [None]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    accuracy_score,
    roc_curve,
    roc_auc_score
)
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from collections import Counter


## Data Preprocessing


In [None]:
# Load the raw customer table and prepare it for modelling.
bankData = pd.read_csv("Churn_Modeling.csv")

# Remove unnecessary features: these three columns only identify a customer
# (row index, account id, family name), so they are dropped before modelling.
columns_to_drop = ["RowNumber", "CustomerId", "Surname"]
bankData_cleaned = bankData.drop(columns=columns_to_drop)

# Encode categorical variables as integers.
# NOTE: Series.map() converts any value that is not a key of the mapping into
# NaN, so an unexpected category shows up in the "Missing values" count below.
bankData_cleaned["Gender"] = bankData_cleaned["Gender"].map({"Female": 0, "Male": 1})

bankData_cleaned["Geography"] = bankData_cleaned["Geography"].map({
    "France": 0,
    "Germany": 1,
    "Spain": 2,
})

# Define column types.
# NOTE(review): these names assume the usual bank-churn schema (10 feature
# columns plus "Exited") -- confirm against bankData_cleaned.columns.
cat_cols = ["Geography", "Gender", "NumOfProducts", "HasCrCard", "IsActiveMember"]
num_cols = ["CreditScore", "Age", "Tenure", "Balance", "EstimatedSalary"]

print(f"Dataset shape: {bankData_cleaned.shape}")
print(f"Missing values: {bankData_cleaned.isna().sum().sum()}")


## Train/Test Split and Scaling


In [None]:
# Split data (must be done before scaling so that no test-set statistics leak
# into the scaler).
# Separate features (X) and target (y); the target column is "Exited".
X = bankData_cleaned.drop(columns=["Exited"])
y = bankData_cleaned["Exited"]

# Hold out 20% for testing; stratify so both splits keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Work on explicit copies: the split frames are derived from X, and assigning
# columns on such derived frames can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: fit on the training data only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

train_class_counts = Counter(y_train)
print("Class distribution before SMOTE:")
print(f"Class 0 (No Churn): {train_class_counts[0]}")
print(f"Class 1 (Churn): {train_class_counts[1]}")


## Apply SMOTE and Calculate Class Weights


In [None]:
# Apply SMOTE to balance the training data (the test set is left untouched).
# NOTE(review): SMOTE interpolates between neighbours, so integer-encoded
# categorical columns can receive fractional values in the synthetic rows.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Calculate 'balanced' class weights from the ORIGINAL (pre-SMOTE) labels.
# NOTE(review): the model is later trained on SMOTE-balanced data AND with
# these weights, which compensates for the class imbalance twice and pushes
# predicted churn probabilities upward -- consider using only one of the two.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
# Key the dict by the actual class labels and store plain Python numbers.
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

balanced_class_counts = Counter(y_train_balanced)
print("Class distribution after SMOTE:")
print(f"Class 0 (No Churn): {balanced_class_counts[0]}")
print(f"Class 1 (Churn): {balanced_class_counts[1]}")
print(f"\nTraining samples: {len(X_train)} -> {len(X_train_balanced)}")
print(f"Class weights: {class_weight_dict}")


## Build Neural Network Model


In [None]:
# Build a small feed-forward binary classifier:
# - Input layer sized to the number of feature columns
# - Dense hidden layer with 32 units and 'relu' activation
# - Dense hidden layer with 16 units and 'relu' activation
# - Dense output layer with 1 unit and 'sigmoid' activation, so the output
#   can be read as a churn probability and thresholded later
n_features = X_train_balanced.shape[1]
model = Sequential([
    Input(shape=(n_features,)),
    Dense(32, activation="relu"),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# Compile with Adam (learning_rate=0.001), binary cross-entropy loss to match
# the single sigmoid output, and accuracy as the reported metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

model.summary()


## Train and Evaluate Model


In [None]:
# Train model
#
# Keras takes the `validation_split` fraction from the LAST rows of the arrays
# it is given, before any shuffling. SMOTE returns the original rows followed
# by the synthetic minority rows, so without shuffling the validation set
# would consist almost entirely of synthetic churn samples and `val_loss`
# (which drives early stopping) would ignore the majority class.
# Shuffle once, with a fixed seed, so both classes land in both partitions.
shuffle_order = np.random.default_rng(42).permutation(len(y_train_balanced))
X_fit = np.asarray(X_train_balanced, dtype="float32")[shuffle_order]
y_fit = np.asarray(y_train_balanced)[shuffle_order]

# Stop when validation loss has not improved for 5 epochs and roll the model
# back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

# NOTE(review): the validation rows still include SMOTE-generated samples, so
# validation metrics are likely optimistic compared with the real test set.
history = model.fit(
    X_fit, y_fit,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
    verbose=1,
)


## Make predictions on the model


In [None]:
# Decision threshold applied to the predicted churn probability.
# NOTE(review): 0.7017 was copied from an earlier run of the ROC analysis
# cell. That analysis is computed on the test set and changes whenever the
# model is retrained, so treat this value as illustrative; for an unbiased
# evaluation the threshold should be tuned on validation data instead.
DECISION_THRESHOLD = 0.7017

# Make predictions (the model is only run once; everything below reuses them)
y_pred_proba = model.predict(X_test, verbose=0)
y_pred_proba = y_pred_proba.ravel()  # Flatten (n, 1) -> (n,) for easier use
y_pred = (y_pred_proba > DECISION_THRESHOLD).astype(int)

# Calculate all metrics. zero_division=0 keeps the score at 0 without a
# warning when a threshold yields no positive predictions.
test_acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Calculate confusion matrix and derived metrics.
# labels=[0, 1] forces a 2x2 matrix even if only one class appears, so the
# four-way unpack below cannot fail.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

# Row-wise percentages (share of each ACTUAL class); the small epsilon avoids
# division by zero for an empty row.
cm_percent = cm.astype('float') / (cm.sum(axis=1, keepdims=True) + 1e-10) * 100

# Print metrics
print("\n" + "="*50)
print("CLASSIFICATION METRICS")
print("="*50)
print(f"Accuracy:   {test_acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"Specificity: {specificity:.4f}")
print("="*50)

# Confusion Matrix with labels and percentages
print("\nCONFUSION MATRIX")
print("="*50)
print(f"{'':20s} {'Predicted: No':20s} {'Predicted: Yes':20s}")
print(f"{'Actual: No':20s} {f'TN={cm[0,0]} ({cm_percent[0,0]:.1f}%)':20s} {f'FP={cm[0,1]} ({cm_percent[0,1]:.1f}%)':20s}")
print(f"{'Actual: Yes':20s} {f'FN={cm[1,0]} ({cm_percent[1,0]:.1f}%)':20s} {f'TP={cm[1,1]} ({cm_percent[1,1]:.1f}%)':20s}")
print("="*50)
print(f"\nTotal Test Samples: {len(y_test)}")
print(f"True Negatives (TN):  {cm[0,0]} ({cm_percent[0,0]:.1f}%)")
print(f"False Positives (FP): {cm[0,1]} ({cm_percent[0,1]:.1f}%)")
print(f"False Negatives (FN): {cm[1,0]} ({cm_percent[1,0]:.1f}%)")
print(f"True Positives (TP):  {cm[1,1]} ({cm_percent[1,1]:.1f}%)")


## ROC Curve Analysis


In [None]:
# ROC curve points and the area under the curve, both computed from the
# predicted churn probabilities on the test set.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Youden's J statistic (TPR - FPR): the threshold that maximises it is the
# ROC point farthest above the chance diagonal.
youden_j = np.subtract(tpr, fpr)
optimal_idx = youden_j.argmax()
optimal_threshold = thresholds[optimal_idx]

# Draw the ROC curve, the chance diagonal, and the selected operating point.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    fpr, tpr,
    color="darkorange", lw=2,
    label=f"ROC curve (AUC = {roc_auc:.4f})",
)
ax.plot(
    [0, 1], [0, 1],
    color="navy", lw=2, linestyle="--",
    label="Random Classifier",
)
ax.plot(
    fpr[optimal_idx], tpr[optimal_idx], "ro",
    markersize=10,
    label=f"Optimal Threshold = {optimal_threshold:.4f}",
)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate (1 - Specificity)", fontsize=12)
ax.set_ylabel("True Positive Rate (Sensitivity/Recall)", fontsize=12)
ax.set_title(
    "Receiver Operating Characteristic (ROC) Curve",
    fontsize=14, fontweight="bold",
)
ax.legend(loc="lower right", fontsize=10)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# Text summary of the ROC analysis
separator = "=" * 50
print("\n" + separator)
print("ROC CURVE ANALYSIS")
print(separator)
print(f"AUC Score: {roc_auc:.4f}")
print(f"Optimal Threshold: {optimal_threshold:.4f}")
print(separator)


## Questions to Answer / Explore for the Summary Section

- Evaluate the current performance of the model. What are some things you could try to improve the performance of the model? If you can, implement them and discuss the changes from the current implementation, and whether or not performance improved.

- Is a model that scores at or near 100% accuracy feasible?

- Which of the classification metrics (Accuracy, Precision, Recall, F1, Specificity) are most important when it comes to identifying customers likely to churn?

- Change the classification threshold used to compute `y_pred` from `y_pred_proba` in the prediction cell to either 0.3 or the ROC optimal threshold value. What changes do you see in the confusion matrix? If the bank were to offer a promotion to the clients identified as a churn risk, which value (0.3 or optimal) would be best to use if the promotion was very cheap? What if it was expensive?

- Any other thoughts on this project can be placed here.