In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import joblib

In [34]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/processed_data.csv')

X = df.drop([
    "Churn",          # target
    "Churn Score",    # leakage
    "CLTV",           # likely derived
    "Zip Code",       # location noise
    "Latitude",
    "Longitude"
], axis=1)
y = df["Churn"]

In [35]:
print(y.value_counts())

Churn
0    5174
1    1869
Name: count, dtype: int64


# **Train/Test Split**

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [37]:
print(y_train.value_counts())

Churn
0    4139
1    1495
Name: count, dtype: int64


In [38]:
scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/scaler.pkl')
X_train_scaled = scaler.transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)



## **1. Logistic Regression**

In [39]:
lr = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=2000
)
lr.fit(X_train_scaled, y_train)

lr_preds = lr.predict(X_test_scaled)
lr_probs = lr.predict_proba(X_test_scaled)[:, 1]

lr_accuracy = accuracy_score(y_test, lr_preds)
lr_precision = precision_score(y_test, lr_preds)
lr_recall = recall_score(y_test, lr_preds)
lr_f1 = f1_score(y_test, lr_preds)
lr_roc_auc = roc_auc_score(y_test, lr_probs)

print("Logistic Regression")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1 Score:", lr_f1)
print("ROC AUC Score:", lr_roc_auc)

Logistic Regression
Accuracy: 0.7430801987224982
Precision: 0.5104895104895105
Recall: 0.7807486631016043
F1 Score: 0.6173361522198731
ROC AUC Score: 0.8485597664625798


# **2. KNN**

In [40]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_test_scaled)
knn_probs = knn.predict_proba(X_test_scaled)[:, 1]

knn_accuracy = accuracy_score(y_test, knn_preds)
knn_precision = precision_score(y_test, knn_preds)
knn_recall = recall_score(y_test, knn_preds)
knn_f1 = f1_score(y_test, knn_preds)
knn_auc = roc_auc_score(y_test, knn_probs)

print("KNN")
print("Accuracy:", knn_accuracy)
print("Precision:", knn_precision)
print("Recall:", knn_recall)
print("F1 Score:", knn_f1)
print("ROC-AUC:", knn_auc)


KNN
Accuracy: 0.7608232789212207
Precision: 0.5509641873278237
Recall: 0.5347593582887701
F1 Score: 0.5427408412483039
ROC-AUC: 0.7785618331654138


In [41]:
# KNN Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()

param_grid = {
    "n_neighbors": [3, 5, 7, 9, 11, 15, 21],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"]
}

grid_knn = GridSearchCV(
    knn,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

grid_knn.fit(X_train_scaled, y_train)

print("Best KNN Params:", grid_knn.best_params_)
best_knn = grid_knn.best_estimator_

knn_preds = best_knn.predict(X_test_scaled)
knn_probs = best_knn.predict_proba(X_test_scaled)[:,1]

knn_accuracy = accuracy_score(y_test, knn_preds)
knn_precision = precision_score(y_test, knn_preds)
knn_recall = recall_score(y_test, knn_preds)
knn_f1 = f1_score(y_test, knn_preds)
knn_auc = roc_auc_score(y_test, knn_probs)

print("Improved Accuracy:", knn_accuracy)
print("Improved Precision:", knn_precision)
print("Improved Recall:", knn_recall)
print("Improved KNN F1:", knn_f1)
print("Improved KNN ROC-AUC:", knn_auc)

Best KNN Params: {'metric': 'euclidean', 'n_neighbors': 21, 'weights': 'uniform'}
Improved Accuracy: 0.7814052519517388
Improved Precision: 0.5877659574468085
Improved Recall: 0.5909090909090909
Improved KNN F1: 0.5893333333333334
Improved KNN ROC-AUC: 0.8254914877677026


# **3. SVM**



In [42]:
svm = SVC(
    kernel="rbf",
    probability=True,
    class_weight="balanced",
    C=1.5,          # Try smaller C
    gamma="scale",
    random_state=42
)
svm.fit(X_train_scaled, y_train)
svm_preds = svm.predict(X_test_scaled)

svm_accuracy = accuracy_score(y_test, svm_preds)
svm_precision = precision_score(y_test, svm_preds)
svm_recall = recall_score(y_test, svm_preds)
svm_f1 = f1_score(y_test, svm_preds)
svm_auc = roc_auc_score(y_test, svm.predict_proba(X_test_scaled)[:,1])

print("SVM")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)
print("ROC-AUC:", svm_auc)

print(pd.Series(svm_preds).value_counts())

SVM
Accuracy: 0.7615330021291696
Precision: 0.5344202898550725
Recall: 0.7887700534759359
F1 Score: 0.6371490280777538
ROC-AUC: 0.8305613681572761
0    857
1    552
Name: count, dtype: int64


# **4. Decision Tree**



In [43]:
dt = DecisionTreeClassifier(
    class_weight ="balanced",
    random_state=42
)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
dt_probs = dt.predict_proba(X_test)[:,1]

dt_accuracy = accuracy_score(y_test, dt_preds)
dt_precision = precision_score(y_test, dt_preds)
dt_recall = recall_score(y_test, dt_preds)
dt_f1 = f1_score(y_test, dt_preds)
dt_auc = roc_auc_score(y_test, dt_probs)

print("Decision Tree")
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1)
print("ROC-AUC:", dt_auc)

Decision Tree
Accuracy: 0.7359829666430092
Precision: 0.5025906735751295
Recall: 0.5187165775401069
F1 Score: 0.5105263157894737
ROC-AUC: 0.6671058410188844


# **5.   Random Forest**



In [44]:
rf = RandomForestClassifier(
    n_estimators = 300,
    class_weight = "balanced",
    random_state = 42
)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)
rf_auc = roc_auc_score(y_test, rf_prob)

print("Random Forest")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1:", rf_f1)
print("ROC-AUC:", rf_auc)

Random Forest
Accuracy: 0.7913413768630234
Precision: 0.636986301369863
Recall: 0.49732620320855614
F1: 0.5585585585585585
ROC-AUC: 0.836159549458782


# **6. AdaBoost**

In [45]:
ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.5,
    random_state=42
)
ada.fit(X_train, y_train)
ada_preds = ada.predict(X_test)
ada_probs = ada.predict_proba(X_test)[:, 1]

ada_accuracy = accuracy_score(y_test, ada_preds)
ada_precision = precision_score(y_test, ada_preds)
ada_recall = recall_score(y_test, ada_preds)
ada_f1 = f1_score(y_test, ada_preds)
ada_auc = roc_auc_score(y_test, ada_probs)

print("AdaBoost")
print("Accuracy:", ada_accuracy)
print("Precision:", ada_precision)
print("Recall:", ada_recall)
print("F1 Score:", ada_f1)
print("ROC-AUC:", ada_auc)

AdaBoost
Accuracy: 0.8090844570617459
Precision: 0.6732673267326733
Recall: 0.5454545454545454
F1 Score: 0.6026587887740029
ROC-AUC: 0.8501666279159885


# **7. XGBoost**

In [46]:
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
print(scale_pos_weight)

2.768561872909699


In [47]:
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate = 0.1,
    max_depth = 5,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric="logloss"
)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)
xgb_probs = xgb.predict_proba(X_test)[:, 1]

xgb_accuracy = accuracy_score(y_test, xgb_preds)
xgb_precision = precision_score(y_test, xgb_preds)
xgb_recall = recall_score(y_test, xgb_preds)
xgb_f1 = f1_score(y_test, xgb_preds)
xgb_auc = roc_auc_score(y_test, xgb_probs)

print("XGBoost")
print("Accuracy:", xgb_accuracy)
print("Precision:", xgb_precision)
print("Recall:", xgb_recall)
print("F1 Score:", xgb_f1)
print("ROC-AUC:", xgb_auc)

XGBoost
Accuracy: 0.7572746628814763
Precision: 0.5307692307692308
Recall: 0.7379679144385026
F1 Score: 0.6174496644295302
ROC-AUC: 0.8412681805264925


# **Model Comparision**

In [48]:
final_results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "KNN",
        "SVM",
        "Decision Tree",
        "Random Forest",
        "AdaBoost",
        "XGBoost"
    ],
    "Accuracy": [
        lr_accuracy,
        knn_accuracy,
        svm_accuracy,
        dt_accuracy,
        rf_accuracy,
        ada_accuracy,
        xgb_accuracy
    ],
    "Precision": [
        lr_precision,
        knn_precision,
        svm_precision,
        dt_precision,
        rf_precision,
        ada_precision,
        xgb_precision
    ],
    "Recall": [
        lr_recall,
        knn_recall,
        svm_recall,
        dt_recall,
        rf_recall,
        ada_recall,
        xgb_recall
    ],
    "F1 Score": [
        lr_f1,
        knn_f1,
        svm_f1,
        dt_f1,
        rf_f1,
        ada_f1,
        xgb_f1
    ],
    "ROC-AUC": [
        lr_roc_auc,
        knn_auc,
        svm_auc,
        dt_auc,
        rf_auc,
        ada_auc,
        xgb_auc
    ]
})

final_results.sort_values(
    by=["F1 Score", "Recall"],
    ascending=False
)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
2,SVM,0.761533,0.53442,0.78877,0.637149,0.830561
6,XGBoost,0.757275,0.530769,0.737968,0.61745,0.841268
0,Logistic Regression,0.74308,0.51049,0.780749,0.617336,0.84856
5,AdaBoost,0.809084,0.673267,0.545455,0.602659,0.850167
1,KNN,0.781405,0.587766,0.590909,0.589333,0.825491
4,Random Forest,0.791341,0.636986,0.497326,0.558559,0.83616
3,Decision Tree,0.735983,0.502591,0.518717,0.510526,0.667106


# **Hyperparameter tuning of SVM**

In [63]:
svm = SVC(probability=True)

param_grid_svm = {
    "C": [0.5, 1, 5],
    "gamma": ["scale", 0.1],
    "kernel": ["rbf"],
    "class_weight": ["balanced"]
}

grid_svm = GridSearchCV(
    svm,
    param_grid_svm,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

grid_svm.fit(X_train_scaled, y_train)
print("Best SVM Params:", grid_svm.best_params_)

Best SVM Params: {'C': 1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}


In [64]:
best_svm = grid_svm.best_estimator_

svm_preds = best_svm.predict(X_test_scaled)
svm_probs = best_svm.predict_proba(X_test_scaled)[:, 1]

svm_accuracy = accuracy_score(y_test, svm_preds)
svm_precision = precision_score(y_test, svm_preds)
svm_recall = recall_score(y_test, svm_preds)
svm_f1 = f1_score(y_test, svm_preds)
svm_auc = roc_auc_score(y_test, svm_probs)

print("Tuned SVM")
print("Best Parameters:", grid_svm.best_params_)
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)
print("ROC-AUC:", svm_auc)

Tuned SVM
Best Parameters: {'C': 1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.7558552164655784
Precision: 0.5268817204301075
Recall: 0.786096256684492
F1 Score: 0.630901287553648
ROC-AUC: 0.8317755560722311


# **Hyperparameter tuning of Random Forest**

In [49]:
rf = RandomForestClassifier(
    class_weight = "balanced",
    random_state = 42
)
param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}
grid_rf = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)
grid_rf.fit(X_train, y_train)
print("Best Random Forest Params:", grid_rf.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Random Forest Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [50]:
best_rf = grid_rf.best_estimator_
rf_preds = best_rf.predict(X_test)
rf_probs = best_rf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds)
rf_recall = recall_score(y_test, rf_preds)
rf_f1 = f1_score(y_test, rf_preds)
rf_auc = roc_auc_score(y_test, rf_prob)

print("Tuned RF Accuracy:", rf_accuracy)
print("Tuned RF Precision:", rf_precision)
print("Tuned RF Recall:", rf_recall)
print("Tuned RF F1:", rf_f1)
print("Tuned RF ROC-AUC:", rf_auc)

Tuned RF Accuracy: 0.7665010645848119
Tuned RF Precision: 0.5443786982248521
Tuned RF Recall: 0.7379679144385026
Tuned RF F1: 0.626560726447219
Tuned RF ROC-AUC: 0.836159549458782


# **Hyperparameter tuning of XGBoost**

In [51]:
xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    random_state=42
)
xgb_param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [4, 5, 6],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0]
}
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)

grid_xgb.fit(X_train, y_train)

print("Best XGB Params:", grid_xgb.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best XGB Params: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.8}


In [52]:
best_xgb = grid_xgb.best_estimator_

xgb_preds = best_xgb.predict(X_test)
xgb_probs = best_xgb.predict_proba(X_test)[:,1]

xgb_accuracy = accuracy_score(y_test, xgb_preds)
xgb_precision = precision_score(y_test, xgb_preds)
xgb_recall = recall_score(y_test, xgb_preds)
xgb_f1 = f1_score(y_test, xgb_preds)
xgb_auc = roc_auc_score(y_test, xgb_probs)

print("Tuned XGBoost")
print("Accuracy:", xgb_accuracy)
print("Precision:", xgb_precision)
print("Recall:", xgb_recall)
print("F1:", xgb_f1)
print("ROC-AUC:", xgb_auc)

Tuned XGBoost
Accuracy: 0.7565649396735273
Precision: 0.527336860670194
Recall: 0.7994652406417112
F1: 0.6354941551540914
ROC-AUC: 0.8545118706244026


# **Hyperparameter tuning of Logistic Regression**

In [56]:
from sklearn.model_selection import GridSearchCV

log_reg = LogisticRegression(max_iter=5000)

param_grid_lr = {
    "C": [0.01, 0.1, 1, 5, 10],
    "penalty": ["l2"],
    "solver": ["lbfgs", "liblinear"],
    "class_weight": [None, "balanced"]
}

grid_lr = GridSearchCV(
    log_reg,
    param_grid_lr,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

grid_lr.fit(X_train_scaled, y_train)

best_lr = grid_lr.best_estimator_

print("Best Logistic Params:", grid_lr.best_params_)

Best Logistic Params: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}


In [59]:
from sklearn.metrics import classification_report

lr_preds = best_lr.predict(X_test_scaled)
lr_probs = best_lr.predict_proba(X_test_scaled)[:, 1]

lr_accuracy = accuracy_score(y_test, lr_preds)
lr_precision = precision_score(y_test, lr_preds)
lr_recall = recall_score(y_test, lr_preds)
lr_f1 = f1_score(y_test, lr_preds)
lr_roc_auc = roc_auc_score(y_test, lr_probs)

print("Tuned Logistic Regression")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1:", lr_f1)
print("ROC-AUC:", lr_roc_auc)

Tuned Logistic Regression
Accuracy: 0.7430801987224982
Precision: 0.5104895104895105
Recall: 0.7807486631016043
F1: 0.6173361522198731
ROC-AUC: 0.847862254256116


# **Hyperparameter tuning of AdaBoost**

In [53]:
ada = AdaBoostClassifier(random_state=42)

ada_param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.5, 1.0, 1.5]
}

grid_ada = GridSearchCV(
    estimator=ada,
    param_grid=ada_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    verbose=1
)

grid_ada.fit(X_train, y_train)

print("Best Ada Params:", grid_ada.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Ada Params: {'learning_rate': 1.5, 'n_estimators': 100}


In [54]:
best_ada = grid_ada.best_estimator_

ada_preds = best_ada.predict(X_test)
ada_probs = best_ada.predict_proba(X_test)[:,1]

ada_accuracy = accuracy_score(y_test, ada_preds)
ada_precision = precision_score(y_test, ada_preds)
ada_recall = recall_score(y_test, ada_preds)
ada_f1 = f1_score(y_test, ada_preds)
ada_auc = roc_auc_score(y_test, ada_probs)

print("Tuned AdaBoost")
print("Accuracy:", ada_accuracy)
print("Precision:", ada_precision)
print("Recall:", ada_recall)
print("F1:", ada_f1)
print("ROC-AUC:", ada_auc)

Tuned AdaBoost
Accuracy: 0.78708303761533
Precision: 0.6114457831325302
Recall: 0.5427807486631016
F1: 0.5750708215297451
ROC-AUC: 0.8473455785476246


In [65]:
final_results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "KNN",
        "SVM",
        "Decision Tree",
        "Random Forest",
        "AdaBoost",
        "XGBoost"
    ],
    "Accuracy": [
        lr_accuracy,
        knn_accuracy,
        svm_accuracy,
        dt_accuracy,
        rf_accuracy,
        ada_accuracy,
        xgb_accuracy
    ],
    "Precision": [
        lr_precision,
        knn_precision,
        svm_precision,
        dt_precision,
        rf_precision,
        ada_precision,
        xgb_precision
    ],
    "Recall": [
        lr_recall,
        knn_recall,
        svm_recall,
        dt_recall,
        rf_recall,
        ada_recall,
        xgb_recall
    ],
    "F1 Score": [
        lr_f1,
        knn_f1,
        svm_f1,
        dt_f1,
        rf_f1,
        ada_f1,
        xgb_f1
    ],
    "ROC-AUC": [
        lr_roc_auc,
        knn_auc,
        svm_auc,
        dt_auc,
        rf_auc,
        ada_auc,
        xgb_auc
    ]
})

final_results.sort_values(
    by=["F1 Score", "Recall"],
    ascending=False
)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC-AUC
6,XGBoost,0.756565,0.527337,0.799465,0.635494,0.854512
2,SVM,0.755855,0.526882,0.786096,0.630901,0.831776
4,Random Forest,0.766501,0.544379,0.737968,0.626561,0.83616
0,Logistic Regression,0.74308,0.51049,0.780749,0.617336,0.847862
1,KNN,0.781405,0.587766,0.590909,0.589333,0.825491
5,AdaBoost,0.787083,0.611446,0.542781,0.575071,0.847346
3,Decision Tree,0.735983,0.502591,0.518717,0.510526,0.667106


In [71]:
# Model saving
import joblib

joblib.dump(best_xgb, "/content/drive/MyDrive/Colab Notebooks/best_model.pkl")

['/content/drive/MyDrive/Colab Notebooks/best_model.pkl']