# Advanced Model Comparison

In this notebook, we experiment with models known for handling imbalance and categorical data effectively:
1.  **Random Forest (Balanced)**: Using native `class_weight='balanced'` parameter.
2.  **LightGBM**: Highly efficient gradient boosting, using `class_weight='balanced'`.
3.  **CatBoost**: Handles categorical features natively and has robust `auto_class_weights`.

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import mlflow.lightgbm
import mlflow.catboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

%matplotlib inline

# Set MLflow Tracking URI
mlflow.set_tracking_uri("file:../mlruns")
mlflow.set_experiment("Churn_Prediction_Advanced_Models")

<Experiment: artifact_location='file:d:/MLOPS PROJECT CHURN PRED/experiment/../mlruns/982143950750977900', creation_time=1767698036854, experiment_id='982143950750977900', last_update_time=1767698036854, lifecycle_stage='active', name='Churn_Prediction_Advanced_Models', tags={'mlflow.experimentKind': 'custom_model_development'}>

## 1. Load Data & Preprocessing (Same as Baseline)

In [9]:
df = pd.read_csv('../customer_churn_dataset/customer_churn_dataset.csv')
X = df.drop('churn', axis=1)
y = df['churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Reuse Preprocessing Logic ---
def impute_internet_service(X_data, knn_model=None, scaler=None, is_train=True):
    X = X_data.copy()
    impute_features = ['monthly_charges', 'total_charges', 'tenure']
    if is_train:
        scaler = StandardScaler()
        scaler.fit(X[impute_features])
    X_scaled = scaler.transform(X[impute_features])
    mask_missing = X['internet_service'].isnull()
    if is_train:
        X_train_knn = X_scaled[~mask_missing]
        y_train_knn = X.loc[~mask_missing, 'internet_service']
        knn_model = KNeighborsClassifier(n_neighbors=5)
        knn_model.fit(X_train_knn, y_train_knn)
    if mask_missing.sum() > 0:
        X_missing_knn = X_scaled[mask_missing]
        imputed_values = knn_model.predict(X_missing_knn)
        X.loc[mask_missing, 'internet_service'] = imputed_values
    return X, knn_model, scaler

# 1. Impute
X_train_imp, knn_imputer, knn_scaler = impute_internet_service(X_train, is_train=True)
X_test_imp, _, _ = impute_internet_service(X_test, knn_model=knn_imputer, scaler=knn_scaler, is_train=False)

# 2. Encode & Scale
numerical_cols = ['tenure', 'monthly_charges', 'total_charges']
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
if 'customer_id' in categorical_cols: categorical_cols.remove('customer_id')

# For CatBoost, we usually keep strings, but for fair comparison we'll use OHE first (or we can use CatBoost native... let's stick to consistent OHE for now to compare algorithm power on SAME data)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler_final = StandardScaler()

X_train_enc = ohe.fit_transform(X_train_imp[categorical_cols])
X_test_enc = ohe.transform(X_test_imp[categorical_cols])
X_train_sc = scaler_final.fit_transform(X_train_imp[numerical_cols])
X_test_sc = scaler_final.transform(X_test_imp[numerical_cols])

X_train_final = np.hstack([X_train_sc, X_train_enc])
X_test_final = np.hstack([X_test_sc, X_test_enc])

print("Data Processed. Train Shape:", X_train_final.shape)

Data Processed. Train Shape: (16000, 16)


## 2. Define Training & Tracking Function

In [10]:
def train_and_log(model, name):
    with mlflow.start_run(run_name=name):
        # Log Params
        mlflow.log_params(model.get_params())
        
        # Train
        print(f"Training {name}...")
        model.fit(X_train_final, y_train)
        
        # Predict
        y_pred = model.predict(X_test_final)
        y_prob = model.predict_proba(X_test_final)[:, 1] if hasattr(model, 'predict_proba') else None
        
        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc = roc_auc_score(y_test, y_prob) if y_prob is not None else 0
        
        # Log Metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("auc_roc", roc)
        
        # Log Model
        if "LightGBM" in name:
            mlflow.lightgbm.log_model(model, name=name)
        elif "CatBoost" in name:
            mlflow.catboost.log_model(model, name=name)
        else:
            mlflow.sklearn.log_model(model, name=name)
        
        print(f"Finished {name}: Acc={acc:.4f}, F1={f1:.4f}")
        print(classification_report(y_test, y_pred))

## 3. Run Experiments

In [14]:
# 1. Random Forest (Balanced)
rf_balanced = RandomForestClassifier()
train_and_log(rf_balanced, "Random_Forest_Balanced")

Training Random_Forest_Balanced...
Finished Random_Forest_Balanced: Acc=0.7408, F1=0.5630
              precision    recall  f1-score   support

           0       0.77      0.87      0.82      2631
           1       0.67      0.49      0.56      1369

    accuracy                           0.74      4000
   macro avg       0.72      0.68      0.69      4000
weighted avg       0.73      0.74      0.73      4000



In [12]:
# 2. LightGBM (Balanced)
lgbm_balanced = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.1,
    class_weight='balanced',  # <--- Key Change
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)
train_and_log(lgbm_balanced, "LightGBM_Balanced")

Training LightGBM_Balanced...




Finished LightGBM_Balanced: Acc=0.7485, F1=0.5970
              precision    recall  f1-score   support

           0       0.78      0.85      0.82      2631
           1       0.66      0.54      0.60      1369

    accuracy                           0.75      4000
   macro avg       0.72      0.70      0.71      4000
weighted avg       0.74      0.75      0.74      4000



In [13]:
# 3. CatBoost (Balanced)
cat_balanced = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    auto_class_weights='Balanced', # <--- Key Change
    random_seed=42,
    verbose=0  # Silent training
)
train_and_log(cat_balanced, "CatBoost_Balanced")

Training CatBoost_Balanced...
Finished CatBoost_Balanced: Acc=0.7492, F1=0.5957
              precision    recall  f1-score   support

           0       0.78      0.86      0.82      2631
           1       0.66      0.54      0.60      1369

    accuracy                           0.75      4000
   macro avg       0.72      0.70      0.71      4000
weighted avg       0.74      0.75      0.74      4000

