In [45]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import joblib

In [46]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/processed_data.csv')

X = df.drop("Churn", axis=1)
y = df["Churn"]

In [47]:
print(y.value_counts())

Churn
0    5174
1    1869
Name: count, dtype: int64


# **Train/Test Split**

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [49]:
print(y_train.value_counts())

Churn
0    4139
1    1495
Name: count, dtype: int64


In [50]:
scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/scaler.pkl')
X_train_scaled = scaler.transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

## **1. Logistic Regression**

In [51]:
lr = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=2000
)
lr.fit(X_train_scaled, y_train)

lr_preds = lr.predict(X_test_scaled)
lr_f1 = f1_score(y_test, lr_preds)
lr_probs = lr.predict_proba(X_test_scaled)[:, 1]
lr_roc_auc = roc_auc_score(y_test, lr_probs)

print("F1 Score:", lr_f1)
print("ROC AUC Score:", lr_roc_auc)

F1 Score: 0.8453105968331304
ROC AUC Score: 0.9746673900126585


# **2. KNN**

In [52]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_test_scaled)
knn_probs = knn.predict_proba(X_test_scaled)[:, 1]

knn_f1 = f1_score(y_test, knn_preds)
knn_auc = roc_auc_score(y_test, knn_probs)
print("F1 Score:", knn_f1)
print("ROC-AUC:", knn_auc)


F1 Score: 0.2847457627118644
ROC-AUC: 0.6214446252809425


In [64]:
# KNN Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()

param_grid = {
    "n_neighbors": [3, 5, 7, 9, 11, 15, 21],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"]
}

grid_knn = GridSearchCV(
    knn,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

grid_knn.fit(X_train_scaled, y_train)

print("Best KNN Params:", grid_knn.best_params_)
best_knn = grid_knn.best_estimator_

knn_preds = best_knn.predict(X_test_scaled)
knn_probs = best_knn.predict_proba(X_test_scaled)[:,1]

knn_f1 = f1_score(y_test, knn_preds)
knn_auc = roc_auc_score(y_test, knn_probs)

print("Improved KNN F1:", knn_f1)
print("Improved KNN ROC-AUC:", knn_auc)

Best KNN Params: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Improved KNN F1: 0.40570522979397783
Improved KNN ROC-AUC: 0.7000465008137642


# **3. SVM**



In [44]:
svm = SVC(
    kernel="rbf",
    probability=True,
    class_weight="balanced",
    C=1.5,          # Try smaller C
    gamma="scale",
    random_state=42
)
svm.fit(X_train_scaled, y_train)
svm_pred = svm.predict(X_test_scaled)
svm_f1 = f1_score(y_test, svm_pred)
svm_auc = roc_auc_score(y_test, svm.predict_proba(X_test_scaled)[:,1])
print("F1 Score:", svm_f1)
print("ROC-AUC:", svm_auc)

print(pd.Series(svm_pred).value_counts())

F1 Score: 0.46628407460545196
ROC-AUC: 0.6682709447415329
1    1020
0     389
Name: count, dtype: int64


# **4. Decision Tree**



In [53]:
dt = DecisionTreeClassifier(
    class_weight ="balanced",
    random_state=42
)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
dt_probs = dt.predict_proba(X_test)[:,1]
dt_f1 = f1_score(y_test, dt_preds)
dt_auc = roc_auc_score(y_test, dt_probs)
print("F1 Score:", dt_f1)
print("ROC-AUC:", dt_auc)

F1 Score: 0.8191489361702128
ROC-AUC: 0.8779482807615799


# **5.   Random Forest**



In [54]:
rf = RandomForestClassifier(
    n_estimators = 300,
    class_weight = "balanced",
    random_state = 42
)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

rf_f1 = f1_score(y_test, rf_preds)
rf_auc = roc_auc_score(y_test, rf_prob)

print("F1:", rf_f1)
print("ROC-AUC:", rf_auc)

F1: 0.8751642575558476
ROC-AUC: 0.976842594745408


# **6. AdaBoost**

In [55]:
ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.5,
    random_state=42
)
ada.fit(X_train, y_train)
ada_preds = ada.predict(X_test)
ada_probs = ada.predict_proba(X_test)[:, 1]
ada_f1 = f1_score(y_test, ada_preds)
ada_auc = roc_auc_score(y_test, ada_probs)
print("F1 Score:", ada_f1)
print("ROC-AUC:", ada_auc)

F1 Score: 0.8642951251646904
ROC-AUC: 0.9828786587098608


# **7. XGBoost**

In [56]:
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
print(scale_pos_weight)

2.768561872909699


In [57]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate = 0.1,
    max_depth = 5,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric="logloss"
)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
xgb_probs = xgb.predict_proba(X_test)[:, 1]
xgb_f1 = f1_score(y_test, xgb_preds)
xgb_auc = roc_auc_score(y_test, xgb_probs)
print("F1 Score:", xgb_f1)
print("ROC-AUC:", xgb_auc)

F1 Score: 0.8662420382165605
ROC-AUC: 0.9810664186623266


# **Model Comparision**

In [63]:
results = pd.DataFrame({
    "Model":[
        "Logistic Regression",
        "Random Forest",
        "KNN",
        "SVM",
        "Decision Tree",
        "AdaBoost",
        "XGBoost"
    ],
    "F1 Score":[
        lr_f1,
        rf_f1,
        knn_f1,
        svm_f1,
        dt_f1,
        ada_f1,
        xgb_f1
    ],
    "ROC-AUC": [
        lr_roc_auc,
        rf_auc,
        knn_auc,
        svm_auc,
        dt_auc,
        ada_auc,
        xgb_auc
    ]
})

results.sort_values(by="ROC-AUC", ascending=False)

Unnamed: 0,Model,F1 Score,ROC-AUC
5,AdaBoost,0.864295,0.982879
6,XGBoost,0.866242,0.981066
1,Random Forest,0.875164,0.976843
0,Logistic Regression,0.845311,0.974667
4,Decision Tree,0.819149,0.877948
2,KNN,0.405705,0.700047
3,SVM,0.466284,0.668271


# **Hyperparameter tuning of Random Forest**

In [69]:
rf = RandomForestClassifier(
    class_weight = "balanced",
    random_state = 42
)
param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}
grid_rf = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)
grid_rf.fit(X_train, y_train)
print("Best Random Forest Params:", grid_rf.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Random Forest Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [70]:
best_rf = grid_rf.best_estimator_
rf_preds = best_rf.predict(X_test)
rf_probs = best_rf.predict_proba(X_test)[:, 1]

rf_f1 = f1_score(y_test, rf_preds)
rf_auc = roc_auc_score(y_test, rf_prob)

print("Tuned RF F1:", rf_f1)
print("Tuned RF ROC-AUC:", rf_auc)

Tuned RF F1: 0.8691232528589581
Tuned RF ROC-AUC: 0.976842594745408


# **Hyperparameter tuning of XGBoost**

In [71]:
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    random_state=42
)
xgb_param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [4, 5, 6],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0]
}
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)

grid_xgb.fit(X_train, y_train)

print("Best XGB Params:", grid_xgb.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best XGB Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}


In [72]:
best_xgb = grid_xgb.best_estimator_

xgb_preds = best_xgb.predict(X_test)
xgb_probs = best_xgb.predict_proba(X_test)[:,1]

xgb_f1 = f1_score(y_test, xgb_preds)
xgb_auc = roc_auc_score(y_test, xgb_probs)

print("Tuned XGBoost")
print("F1:", xgb_f1)
print("ROC-AUC:", xgb_auc)

Tuned XGBoost
F1: 0.8589263420724095
ROC-AUC: 0.9830995375752409


# **Hyperparameter tuning of AdaBoost**

In [73]:
ada = AdaBoostClassifier(random_state=42)

ada_param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.5, 1.0, 1.5]
}

grid_ada = GridSearchCV(
    estimator=ada,
    param_grid=ada_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)

grid_ada.fit(X_train, y_train)

print("Best Ada Params:", grid_ada.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Ada Params: {'learning_rate': 0.5, 'n_estimators': 300}


In [74]:
best_ada = grid_ada.best_estimator_

ada_preds = best_ada.predict(X_test)
ada_probs = best_ada.predict_proba(X_test)[:,1]

ada_f1 = f1_score(y_test, ada_preds)
ada_auc = roc_auc_score(y_test, ada_probs)

print("Tuned AdaBoost")
print("F1:", ada_f1)
print("ROC-AUC:", ada_auc)

Tuned AdaBoost
F1: 0.8624338624338624
ROC-AUC: 0.9826409878839546


In [75]:
results = pd.DataFrame({
    "Model":[
        "Logistic Regression",
        "Random Forest",
        "KNN",
        "SVM",
        "Decision Tree",
        "AdaBoost",
        "XGBoost"
    ],
    "F1 Score":[
        lr_f1,
        rf_f1,
        knn_f1,
        svm_f1,
        dt_f1,
        ada_f1,
        xgb_f1
    ],
    "ROC-AUC": [
        lr_roc_auc,
        rf_auc,
        knn_auc,
        svm_auc,
        dt_auc,
        ada_auc,
        xgb_auc
    ]
})

results.sort_values(by="ROC-AUC", ascending=False)

Unnamed: 0,Model,F1 Score,ROC-AUC
6,XGBoost,0.858926,0.9831
5,AdaBoost,0.862434,0.982641
1,Random Forest,0.869123,0.976843
0,Logistic Regression,0.845311,0.974667
4,Decision Tree,0.819149,0.877948
2,KNN,0.405705,0.700047
3,SVM,0.466284,0.668271


In [77]:
# Model saving
import joblib

joblib.dump(best_rf, "/content/drive/MyDrive/Colab Notebooks/best_model.pkl")

['/content/drive/MyDrive/Colab Notebooks/best_model.pkl']