# Gap Analysis & Replication

**Goal**: Replicate the high performance (F1 ~0.76+, Recall ~0.86) observed in the reference code.
**Hypothesis**: The performance gap comes from differences in preprocessing, so this notebook changes three things:
1.  **Imputation**: We will **NOT** use KNN. We will let models handle missing values (or fill with a placeholder).
2.  **Encoding**: We will use **Label Encoding** instead of One-Hot Encoding.
3.  **Data**: We will use the full `customer_churn_dataset.csv` directly.

In [1]:
import matplotlib.pyplot as plt
import mlflow
import mlflow.catboost
import mlflow.lightgbm
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Churn_Prediction_Gap_Analysis")

  return FileStore(store_uri, store_uri)
2026/01/06 16:54:40 INFO mlflow.tracking.fluent: Experiment with name 'Churn_Prediction_Gap_Analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:d:/MLOPS PROJECT CHURN PRED/experiment/../mlruns/873684746974737577', creation_time=1767698680713, experiment_id='873684746974737577', last_update_time=1767698680713, lifecycle_stage='active', name='Churn_Prediction_Gap_Analysis', tags={}>

## 1. Load Data (Mimic Reference)

In [2]:
# Read the FULL dataset (the file referenced in the user's snippet).
df = pd.read_csv('../customer_churn_dataset/customer_churn_dataset.csv')

# The identifier column, when present, is removed.
id_col = "customer_id"
if id_col in df.columns:
    df = df.drop(columns=[id_col])

# Missing-value strategy:
# The reference code has no explicit imputation step, so it presumably either
# lets LightGBM/CatBoost deal with NaN or fills with some implicit value.
# Here every object-typed (categorical) column gets the placeholder "Unknown";
# the missing 'internet_service' values were the main issue and are categorical.
object_cols = [col for col in df.columns if df[col].dtype == 'object']
for col in object_cols:
    df[col] = df[col].fillna("Unknown")
# Numeric columns are deliberately left untouched: no median fill is applied,
# so any numeric NaN is passed on to the models as-is.

print("Missing after basic fill:")
print(df.isnull().sum())

Missing after basic fill:
tenure              0
monthly_charges     0
total_charges       0
contract            0
payment_method      0
internet_service    0
tech_support        0
online_security     0
support_calls       0
churn               0
dtype: int64


## 2. Label Encoding (Mimic Reference)
Instead of one-hot encoding (OHE), we map each string category to an integer code.

In [3]:
# Label-encode every object-typed column, using a *separate* fitted encoder per
# column. Refitting one shared encoder would discard each column's mapping as
# soon as the next column is processed; keeping them in `encoders` lets the
# integer codes be decoded later via encoders[col].classes_ or
# encoders[col].inverse_transform(...).
# NOTE: this cell rebuilds `encoders` from scratch. Once the columns are encoded
# they are no longer object-typed, so re-running this cell alone leaves
# `encoders` empty; re-run from the data-loading cell instead.
encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le
        print(f"Encoded {col}")

# The target is encoded by the loop above whenever it is stored as strings,
# so no separate conversion is done here.
# Check target column name (usually 'churn' or similar)
target_col = 'churn' # Verify this name
print(f"Target distribution:\n{df[target_col].value_counts()}")

Encoded contract
Encoded payment_method
Encoded internet_service
Encoded tech_support
Encoded online_security
Encoded churn
Target distribution:
churn
0    13157
1     6843
Name: count, dtype: int64


## 3. Split & Train

In [4]:
# Separate the target column from the feature columns.
y = df[target_col]
X = df.drop(columns=[target_col])

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for split_label, split_features in (("Train Shape:", X_train), ("Test Shape:", X_test)):
    print(split_label, split_features.shape)

def train_simple(model, name):
    """Fit ``model`` on the training split and log test-set metrics to MLflow.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Everything is recorded inside a single MLflow run named ``name``.

    Logged metrics: ``accuracy``, ``f1_score``, ``precision``, ``recall`` and,
    when the model exposes ``predict_proba``, ``roc_auc``. Recall is logged
    because it is one of the targets this notebook tries to replicate.

    Args:
        model: Unfitted estimator providing ``fit`` and ``predict``.
        name: MLflow run name; also used in the printed progress messages.
    """
    with mlflow.start_run(run_name=name):
        print(f"Training {name}...")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)

        print(f"Finished {name}: Acc={acc:.4f}, F1={f1:.4f}")
        print(f"Precision={precision:.4f}, Recall={recall:.4f}")

        # ROC-AUC needs scores rather than hard labels; column 1 of
        # predict_proba is the probability of the class encoded as 1.
        if hasattr(model, "predict_proba"):
            auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
            mlflow.log_metric("roc_auc", auc)
            print(f"ROC-AUC={auc:.4f}")

        print(classification_report(y_test, y_pred))

Train Shape: (16000, 9)
Test Shape: (4000, 9)


In [5]:
# Three tree-based classifiers, each configured with balanced class weighting
# and a fixed seed. The names rf / lgbm / cat stay available to later cells.
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
lgbm = LGBMClassifier(n_estimators=200, class_weight='balanced', random_state=42, verbosity=-1)
cat = CatBoostClassifier(iterations=200, auto_class_weights='Balanced', random_seed=42, verbose=0)

# Train and log in the same order as before: Random Forest, LightGBM, CatBoost.
for run_name, estimator in (
    ("RF_LabelEncoded_Balanced", rf),
    ("LGBM_LabelEncoded_Balanced", lgbm),
    ("Cat_LabelEncoded_Balanced", cat),
):
    train_simple(estimator, run_name)

Training RF_LabelEncoded_Balanced...
Finished RF_LabelEncoded_Balanced: Acc=0.8393, F1=0.7348
              precision    recall  f1-score   support

           0       0.84      0.94      0.88      2631
           1       0.84      0.65      0.73      1369

    accuracy                           0.84      4000
   macro avg       0.84      0.79      0.81      4000
weighted avg       0.84      0.84      0.83      4000

Training LGBM_LabelEncoded_Balanced...
Finished LGBM_LabelEncoded_Balanced: Acc=0.8383, F1=0.7358
              precision    recall  f1-score   support

           0       0.84      0.93      0.88      2631
           1       0.83      0.66      0.74      1369

    accuracy                           0.84      4000
   macro avg       0.84      0.80      0.81      4000
weighted avg       0.84      0.84      0.83      4000

Training Cat_LabelEncoded_Balanced...
Finished Cat_LabelEncoded_Balanced: Acc=0.8420, F1=0.7399
              precision    recall  f1-score   support

   