In [15]:
import pandas as pd

churn= pd.read_csv(f"final_churn_data.csv")

In [16]:
X = churn.drop('Exited', axis=1)
y = churn['Exited']


In [17]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Stratified split ensures both classes appear in train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,          # ✅ critical line
    random_state=42
)

# Apply SMOTE safely
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score



In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("🔹 Random Forest Results:")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))


🔹 Random Forest Results:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.77      0.45      0.57       404

    accuracy                           0.86      1997
   macro avg       0.82      0.71      0.74      1997
weighted avg       0.85      0.86      0.85      1997

ROC-AUC: 0.8511899212520122


In [20]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Get predicted probabilities
y_proba = rf.predict_proba(X_test)[:, 1]

# Set new threshold
threshold = 0.4
y_pred_new = np.where(y_proba >= threshold, 1, 0)

# Evaluate again
print(classification_report(y_test, y_pred_new))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Optional: check confusion matrix
print(confusion_matrix(y_test, y_pred_new))


              precision    recall  f1-score   support

           0       0.89      0.92      0.91      1593
           1       0.64      0.53      0.58       404

    accuracy                           0.85      1997
   macro avg       0.76      0.73      0.74      1997
weighted avg       0.84      0.85      0.84      1997

ROC-AUC: 0.8511899212520122
[[1472  121]
 [ 188  216]]


In [21]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

y_proba = rf.predict_proba(X_test)[:, 1]
y_pred_threshold = np.where(y_proba >= 0.4, 1, 0)

print(classification_report(y_test, y_pred_threshold))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(confusion_matrix(y_test, y_pred_threshold))


              precision    recall  f1-score   support

           0       0.89      0.92      0.91      1593
           1       0.64      0.53      0.58       404

    accuracy                           0.85      1997
   macro avg       0.76      0.73      0.74      1997
weighted avg       0.84      0.85      0.84      1997

ROC-AUC: 0.8511899212520122
[[1472  121]
 [ 188  216]]


In [22]:
import pandas as pd
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(feature_importances.head(10))


                          Feature  Importance
2                             Age    0.164838
5                   NumOfProducts    0.101025
16  NumOfProducts_EstimatedSalary    0.088794
4                         Balance    0.071350
13                CreditScore_Age    0.069776
12                    TenureByAge    0.068008
11             BalanceSalaryRatio    0.067891
14        Balance_EstimatedSalary    0.064061
0                     CreditScore    0.063950
8                 EstimatedSalary    0.060475
