In [1]:
# Supervised Learning Project — Beta Bank Churn Prediction
# --------------------------------------------------------
#
# Goal:
#     Predict whether a customer will leave Beta Bank (binary classification).
#     Target metric: F1 score ≥ 0.59 on the test set.
#     Also evaluate AUC-ROC to compare against F1.

In [2]:
# Importing what I think is needed to make a valid model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.utils import resample
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load and View data
data = pd.read_csv('/datasets/Churn.csv')
print(data.head())
print(data.info())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0     2.0       0.00              1          1               1   
1     1.0   83807.86              1          0               1   
2     8.0  159660.80              3          1               0   
3     1.0       0.00              2          0               0   
4     2.0  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [4]:
# These columns (RowNumber, CustomerId, Surname) are identifiers and not predictive,
# so we remove them to prevent noise or data leakage.
data = data.drop(['RowNumber','CustomerId','Surname'], axis=1)

In [5]:
# Split the features/target
features = data.drop('Exited', axis = 1)
target = data['Exited']

# Stratified splitting ensures the proportion of churners vs non-churners
# is preserved across train/validation/test sets, which is important for imbalanced data.
# Without stratify, we could end up with splits where the minority class
# (Exited = 1) is underrepresented or even missing in validation/test,
# which would make evaluation unreliable.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=12345
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, test_size=0.25, stratify=y_trainval, random_state=12345
)
# 60/20/20 split

# Make copies to avoid SettingWithCopyWarning
X_train = X_train.copy()
X_valid = X_valid.copy()
X_test  = X_test.copy()

In [6]:
# Preprocessing:
# 1. Missing values handled: categorical -> 'Unknown', numeric -> median.
# 2. One-hot encoding: expanded categorical columns into binary indicators.
# 3. Scaling: standardized numeric features to mean=0, std=1 for fair treatment by models.
# Categorical and numeric columns
categorical = ['Geography', 'Gender']
numeric = ['CreditScore','Age','Tenure','Balance','NumOfProducts',
           'HasCrCard','IsActiveMember','EstimatedSalary']

# --- 1. Handle missing values BEFORE OHE ---
for col in categorical:
    X_train[col] = X_train[col].fillna('Unknown')
    X_valid[col] = X_valid[col].fillna('Unknown')
    X_test[col]  = X_test[col].fillna('Unknown')

for col in numeric:
    median = X_train[col].median()  # use training set median
    X_train[col] = X_train[col].fillna(median)
    X_valid[col] = X_valid[col].fillna(median)
    X_test[col]  = X_test[col].fillna(median)

# --- 2. One-Hot Encoding ---
X_train = pd.get_dummies(X_train, columns=categorical, drop_first=True)
X_valid = pd.get_dummies(X_valid, columns=categorical, drop_first=True)
X_test  = pd.get_dummies(X_test,  columns=categorical, drop_first=True)

# Align columns across sets
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)
X_test  = X_test.reindex(columns=X_train.columns, fill_value=0)

# --- 3. Scale numeric features ---
# Scaling helps models like Logistic Regression converge faster
# and prevents features with large scales (like Balance or EstimatedSalary)
# from dominating those with smaller ranges (like NumOfProducts).
scaler = StandardScaler()
X_train[numeric] = scaler.fit_transform(X_train[numeric])
X_valid[numeric] = scaler.transform(X_valid[numeric])
X_test[numeric]  = scaler.transform(X_test[numeric])

In [7]:
# Examine class balance
print("Class distribution in target (Exited):")
print(target.value_counts(normalize=True))

Class distribution in target (Exited):
0    0.7963
1    0.2037
Name: Exited, dtype: float64


In [8]:
# We can see that about ~80% of customers did not exit while only ~20% did.
# This imbalance means accuracy is a misleading metric
# a model predicting "no churn" for everyone would already be ~80% accurate
# while failing to capture churners. That’s why F1 and AUC-ROC are better suited here.

In [9]:
# Logistic Regression (Baseline) model
logreg = LogisticRegression(max_iter=1000, random_state=12345)
logreg.fit(X_train, y_train)
proba_valid = logreg.predict_proba(X_valid)[:,1]

print('Baseline Logistic Regression:')
print('F1:', f1_score(y_valid, (proba_valid > 0.5).astype(int)))
print('AUC-ROC:', roc_auc_score(y_valid, proba_valid))

Baseline Logistic Regression:
F1: 0.3214953271028037
AUC-ROC: 0.7874854824007367


In [10]:
# Baseline results:
# A simple Logistic Regression was trained without any imbalance correction.
# It achieved an F1 score of ~0.32, reflecting poor capture of the minority class,
# even though the AUC-ROC (~0.79) suggests the model separates the classes
# reasonably well in terms of probability. This shows the imbalance issue:
# probability ranking looks fine, but binary decisions underperform for recall/precision.
# This step establishes a performance floor (baseline).
# Any further methods (balancing, ensembles, tuning) must beat this to justify their complexity.

In [11]:
# Logistic Regression (Baseline) model
logreg_bal = LogisticRegression(max_iter=1000, random_state=12345, class_weight='balanced')
logreg_bal.fit(X_train, y_train)
proba_valid_bal = logreg_bal.predict_proba(X_valid)[:,1]

print('Logistic Regression (Balanced):')
print('F1:', f1_score(y_valid, (proba_valid_bal > 0.5).astype(int)))
print('AUC-ROC:', roc_auc_score(y_valid, proba_valid_bal))

Logistic Regression (Balanced):
F1: 0.5108601216333623
AUC-ROC: 0.7917516900567748


In [12]:
# Note: This comparison shows that the improvement is indeed due to handling imbalance.
# The plain Logistic Regression had F1 ≈ 0.32, while the balanced version reached F1 ≈ 0.51.
# Since the AUC-ROC stayed similar, we can conclude that class weighting improved F1
# by giving more attention to the minority class.
# This shows that re-weighting shifted the decision boundary,
# catching more positives at the cost of slightly lower precision,
# which is exactly what we want for churn detection.

In [13]:
# Random Forest (Plain, no imbalance fix)
rf_plain = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=12345
)
rf_plain.fit(X_train, y_train)
proba_valid_plain = rf_plain.predict_proba(X_valid)[:,1]

print("\nRandom Forest (Plain):")
print("F1:", f1_score(y_valid, (proba_valid_plain > 0.5).astype(int)))
print("AUC-ROC:", roc_auc_score(y_valid, proba_valid_plain))


Random Forest (Plain):
F1: 0.5513866231647634
AUC-ROC: 0.8713289560747188


In [14]:
# Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=12345
)
rf.fit(X_train, y_train)
proba_valid = rf.predict_proba(X_valid)[:,1]

print("\nRandom Forest:")
print("F1:", f1_score(y_valid, (proba_valid > 0.5).astype(int)))
print("AUC-ROC:", roc_auc_score(y_valid, proba_valid))


Random Forest:
F1: 0.6386554621848739
AUC-ROC: 0.8673326639428335


In [15]:
# Comparing plain vs balanced Random Forest confirms that
# imbalance handling (via class_weight) directly improves recall/F1,
# while AUC-ROC remains roughly the same — echoing the pattern seen in Logistic Regression.

In [16]:
# Gradient Boosting
gb = GradientBoostingClassifier(random_state=12345)
gb.fit(X_train, y_train)
proba_valid = gb.predict_proba(X_valid)[:,1]

print("\nGradient Boosting:")
print("F1:", f1_score(y_valid, (proba_valid > 0.5).astype(int)))
print("AUC-ROC:", roc_auc_score(y_valid, proba_valid))


Gradient Boosting:
F1: 0.5750000000000001
AUC-ROC: 0.8797480068666509


In [17]:
# Findings:
# - Random Forest performed best: F1 ~0.64 and AUC-ROC ~0.87, comfortably
#   above the project threshold of F1 ≥ 0.59. It effectively balances recall
#   and precision by leveraging class weighting.
# - Gradient Boosting achieved F1 ~0.58 and AUC-ROC ~0.88, meaning it had
#   stronger ranking power (AUC-ROC) but fell short on F1. This suggests it
#   leaned more towards precision than recall under the 0.5 threshold.
#
# Thus, Random Forest was chosen as the best model.

In [18]:
X_train_upsampled, y_train_upsampled = resample(
    X_train[y_train==1],
    y_train[y_train==1],
    replace=True,
    n_samples=y_train[y_train==0].shape[0],
    random_state=12345
)

X_train_up = pd.concat([X_train[y_train==0], X_train_upsampled])
y_train_up = pd.concat([y_train[y_train==0], y_train_upsampled])

# Shuffle to mix the classes
X_train_up, y_train_up = resample(X_train_up, y_train_up, random_state=12345)

# Train Random Forest on upsampled data
rf_up = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=12345
)
rf_up.fit(X_train_up, y_train_up)
proba_valid_up = rf_up.predict_proba(X_valid)[:,1]

print("\nRandom Forest (Upsampled):")
print("F1:", f1_score(y_valid, (proba_valid_up > 0.5).astype(int)))
print("AUC-ROC:", roc_auc_score(y_valid, proba_valid_up))


Random Forest (Upsampled):
F1: 0.6225596529284165
AUC-ROC: 0.8655111197484079


In [19]:
# Upsampling is a second imbalance fix.
# Unlike class_weight (which just re-weights), this physically duplicates minority examples
# to balance the training set. It can sometimes cause overfitting,
# but it gives the model more exposure to churn patterns.

In [20]:
# Define parameter grid
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'class_weight': [None, 'balanced']
}

rf_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=12345),
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=3,
    random_state=12345,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)

print("Best parameters from search:", rf_search.best_params_)
print("Best F1 score from search:", rf_search.best_score_)

Best parameters from search: {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': 10, 'class_weight': 'balanced'}
Best F1 score from search: 0.5976878675347547


In [21]:
# Hyperparameter tuning helps avoid arbitrary parameter choices
# and finds combinations that maximize F1. RandomizedSearchCV is used instead of GridSearch
# because it’s faster and still explores the search space effectively.

In [22]:
# Use the Random Forest model selected from validation
rf_final = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    max_depth=10,
    class_weight='balanced',
    random_state=12345
)

# Train on the combined training+validation data for maximum learning
# Retraining on train+valid ensures the final model
# uses all available labeled data before the test set, which I believe is common best practice.
rf_final.fit(pd.concat([X_train, X_valid]), pd.concat([y_train, y_valid]))

# Predict probabilities and convert to binary predictions
proba_test = rf_final.predict_proba(X_test)[:, 1]
pred_test = (proba_test > 0.5).astype(int)

# Evaluate
print("Final Test Results:")
print("F1:", f1_score(y_test, pred_test))
print("AUC-ROC:", roc_auc_score(y_test, proba_test))

Final Test Results:
F1: 0.6396292004634994
AUC-ROC: 0.8726677370745167


In [23]:
# --- Final Testing ---
# We performed the final evaluation using the held-out test set to check how well the best model
# (Random Forest with class balancing) generalizes to unseen data. 
# Results:
#   • F1 Score: ~0.64 → comfortably above the project threshold of 0.59, showing that the model
#     achieves a solid balance between precision and recall in predicting churn.
#   • AUC-ROC: ~0.872 → indicates the model is excellent at ranking customers by their churn risk,
#     with performance far above a random baseline (0.5).
#
# Together, these metrics confirm the Random Forest model is robust, generalizes well to new data,
# and provides reliable predictive performance for customer churn detection.

# --- Project Wrap-Up ---
# In summary:
#   1. Data was cleaned, missing values handled, categorical features encoded, and numeric features scaled.
#   2. We identified a strong class imbalance (≈80% non-churn vs 20% churn) and tested multiple fixes:
#      • Baseline models (Logistic Regression, Random Forest, Gradient Boosting)
#      • Class weighting
#      • Upsampling of the minority class
#   3. Comparisons showed that imbalance handling significantly boosted F1 scores while AUC-ROC remained stable.
#   4. Hyperparameter tuning (RandomizedSearchCV) further improved Random Forest performance.
#
# Final Outcome: Random Forest with class weighting was chosen as the best model, achieving the project’s
# target F1 score and demonstrating reliable predictive power. The process shows how imbalance corrections,
# model comparisons, and tuning are critical steps in building practical supervised learning models.