In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
final_data = pd.read_csv('/Users/surajmathapati/Prep/Customer_churn_prediction/notebooks/final_data.csv')
final_data.head()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_Germany,gender_Male
0,15634602,619,42,2,0.0,1,1,1,101348.88,1,False,False
1,15619304,502,42,8,159660.8,3,1,0,113931.57,1,False,False
2,15701354,699,39,1,0.0,2,0,0,93826.63,0,False,False
3,15592531,822,50,7,0.0,2,1,1,10062.8,0,False,True
4,15656148,376,29,4,115046.74,4,1,0,119346.88,1,True,False


In [4]:
X = final_data.drop(columns=['churn', 'customer_id'])
y = final_data['churn']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
scaler = StandardScaler()
X_train_smote = scaler.fit_transform(X_train_smote)
X_test = scaler.transform(X_test)

In [6]:
joblib.dump(scaler, "/Users/surajmathapati/Prep/Customer_churn_prediction/models/scaler.pkl")


['/Users/surajmathapati/Prep/Customer_churn_prediction/models/scaler.pkl']

In [None]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, class_weight='balanced', random_state=42)
rf_model.fit(X_train_smote, y_train_smote)

In [7]:
y_pred_rf = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
y_pred_adjusted = (y_pred_proba > 0.35).astype(int)

In [None]:
accuracy = accuracy_score(y_test, y_pred_adjusted)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_adjusted))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted))

In [None]:
#USING XGBoost here

In [8]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.8,
    scale_pos_weight=3,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train_smote, y_train_smote)
y_pred_xgb = xgb_model.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [12]:
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Accuracy: 0.7654

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.77      0.84      1180
           1       0.47      0.74      0.58       325

    accuracy                           0.77      1505
   macro avg       0.69      0.76      0.71      1505
weighted avg       0.82      0.77      0.78      1505


Confusion Matrix:
 [[910 270]
 [ 83 242]]
