In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read and written through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import joblib
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

import xgboost as xgb


In [3]:
# CSV with the processed churn data, stored on the mounted Drive.
DATA_PATH = "//content//drive//MyDrive//churn//data//processed//churn_cleaned_integer_ready.csv"

df = pd.read_csv(DATA_PATH)

# Separate the "Churn" target from the remaining columns, which are all
# used as features.
y = df["Churn"]
X = df.drop(columns=["Churn"])

print("Dataset Loaded:", X.shape)


Dataset Loaded: (7032, 23)


In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class proportions, and it is seeded so the
# same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [5]:
# Learn the standardization statistics from the training rows only, then apply
# that same fitted transform to both partitions so no test-set information
# leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Candidate classifiers, keyed by the short name used in the results table and
# in the saved model's file name. Every estimator that has a stochastic
# component is seeded with 42 so that a fresh Restart & Run All reproduces the
# same scores.
models = {
    "logistic_regression": LogisticRegression(max_iter=2000),
    "random_forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        class_weight="balanced",
        random_state=42
    ),
    "gradient_boosting": GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    ),
    "svm": SVC(
        kernel="rbf",
        # probability=True is required so the model exposes predict_proba.
        probability=True,
        class_weight="balanced",
        # The probability estimates are produced by an internal shuffled
        # cross-validation; without a seed they (and any metric computed from
        # them) differ from run to run.
        random_state=42
    ),
    "xgboost": xgb.XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42
    )
}


In [7]:
def evaluate_model(name, model, Xtr, Xte, ytr=None, yte=None):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    name : str
        Label stored under the ``"model"`` key of the returned dict.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place.
    Xtr, Xte : array-like
        Training and test feature matrices.
    ytr, yte : array-like, optional
        Training and test labels. When omitted, the notebook-level
        ``y_train`` / ``y_test`` are used, which keeps existing four-argument
        calls working unchanged.

    Returns
    -------
    dict
        Keys ``"model"``, ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"f1"`` and ``"roc_auc"``.
    """
    # Fall back to the notebook globals only when no labels were supplied, so
    # the function can also score other splits without relying on hidden state.
    if ytr is None:
        ytr = y_train
    if yte is None:
        yte = y_test

    model.fit(Xtr, ytr)
    y_pred = model.predict(Xte)
    # Second probability column is used as the score for ROC AUC; this assumes
    # a binary target whose positive class is listed second -- TODO confirm.
    y_prob = model.predict_proba(Xte)[:, 1]

    return {
        "model": name,
        "accuracy": accuracy_score(yte, y_pred),
        "precision": precision_score(yte, y_pred),
        "recall": recall_score(yte, y_pred),
        "f1": f1_score(yte, y_pred),
        "roc_auc": roc_auc_score(yte, y_prob)
    }


In [8]:
results = []

for name, model in models.items():
    # Logistic regression and the SVM are trained on the standardized
    # features; every other model receives the unscaled features.
    uses_scaled = name in ("logistic_regression", "svm")
    Xtr, Xte = (X_train_scaled, X_test_scaled) if uses_scaled else (X_train, X_test)

    scores = evaluate_model(name, model, Xtr, Xte)
    results.append(scores)


In [9]:
# One row per evaluated model, ordered from highest to lowest ROC AUC.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("roc_auc", ascending=False)
results_df


Unnamed: 0,model,accuracy,precision,recall,f1,roc_auc
2,gradient_boosting,0.795309,0.63522,0.540107,0.583815,0.839524
0,logistic_regression,0.803838,0.648485,0.572193,0.607955,0.835809
1,random_forest,0.779673,0.572072,0.679144,0.621027,0.833544
4,xgboost,0.775409,0.590625,0.505348,0.544669,0.81922
3,svm,0.722104,0.485569,0.764706,0.593977,0.804109


In [10]:
# results_df is sorted by ROC AUC in descending order, so its first row names
# the best-scoring model; look that estimator up in the models dict.
best_model_name = results_df["model"].iloc[0]
best_model = models[best_model_name]

print("Best Model:", best_model_name)


Best Model: gradient_boosting


In [11]:
# Refit the selected estimator on every available row (train and test
# together) before it is saved.
if best_model_name not in ("logistic_regression", "svm"):
    best_model.fit(X, y)
else:
    # The scale-sensitive models need standardized input, so the scaler is
    # refit on the full feature matrix as well.
    X_full_scaled = scaler.fit_transform(X)
    best_model.fit(X_full_scaled, y)


In [12]:
# Output directory on the mounted Drive for saved model artifacts.
MODELS_DIR = "//content//drive//MyDrive//churn//model"
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(MODELS_DIR, exist_ok=True)


In [14]:
# Save the refit best model in both joblib and pickle formats.
#
# The target paths are built from MODELS_DIR instead of repeating the
# directory string inside the f-strings. The previous form nested double quotes
# inside a double-quoted f-string, which only parses on Python 3.12+ and is a
# SyntaxError on earlier interpreters.
model_path_stem = os.path.join(MODELS_DIR, best_model_name)

joblib.dump(best_model, f"{model_path_stem}.joblib")

with open(f"{model_path_stem}.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best model saved successfully")


Best model saved successfully


In [15]:
# Save the fitted scaler next to the model in both joblib and pickle formats.
#
# The target paths are built from MODELS_DIR instead of repeating the
# directory string inside the f-strings. The previous form nested double quotes
# inside a double-quoted f-string, which only parses on Python 3.12+ and is a
# SyntaxError on earlier interpreters.
scaler_path_stem = os.path.join(MODELS_DIR, "scaler")

joblib.dump(scaler, f"{scaler_path_stem}.joblib")

with open(f"{scaler_path_stem}.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Scaler saved successfully")


Scaler saved successfully
