# Customer Churn Prediction 

## Step 1: Importing necessary libraries

In [79]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [80]:
# importing libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Step 2: Importing the dataset

In [81]:
# importing dataset
churn_data = pd.read_csv('Churn_Modelling.csv')

## Step 3: Preprocessing

In [82]:
# preprocessing data
churn_data.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [83]:
# preprocessing data
churn_data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [84]:
# preprocessing data
churn_data.duplicated().sum()

0

In [85]:
# converting categorical values to numerical values
from sklearn.preprocessing import LabelEncoder
le  = LabelEncoder()
churn_data['Geography'] = le.fit_transform(churn_data['Geography'])
churn_data['Gender'] = le.fit_transform(churn_data['Gender'])

In [86]:
# extracting column names
churn_data.columns.values

array(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'], dtype=object)

## Step 4: Selecting features and target variable

In [87]:
# Selecting feature and target variables
sf = ['CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']
X = churn_data[sf]
Y = churn_data['Exited']

## Step 5: Splitting the dataset into training and testing sets

In [88]:
# Splitting the dataset 
X_train, X_test, Y_train , Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

## Step 6 : Initializing and training the model

In [89]:
# training random forest model 
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(X_train, Y_train)

## Step 7 : Predicting on test data

In [90]:
# predicting on test data
y_pred1 = rf_model.predict(X_test)
# model performance
print('Accuracy Score: ')
print(accuracy_score(Y_test, y_pred1))
print('Confusion Matrix: ')
print(confusion_matrix(Y_test, y_pred1))
print('classification report: ')
print(classification_report(Y_test, y_pred1))

Accuracy Score: 
0.8645
Confusion Matrix: 
[[1546   61]
 [ 210  183]]
classification report: 
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.57       393

    accuracy                           0.86      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



#### 1.  The model has an accuracy of 86.45%, meaning it correctly predicts whether a customer will churn or not in 86.45% of cases.
#### 2.  The model has higher precision for non-churn cases, meaning it’s good at identifying non-churn customers.
#### 3.   Recall for churn cases is low (0.47), indicating it misses many true churn cases.

### The model performs well but could be improved, especially in identifying churn cases.


## Step 8 : Scaling the dataset 

In [91]:
# scaling the dataset using standard scaler 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Step 9 : Handle class imbalance using SMOTE

In [92]:
# balancing the dataset using smote
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, Y_resampled = smote.fit_resample(X_scaled, Y)

## Step 10: Splitting the resampled data

In [93]:
# splitting the dataset for resampled data
X_train, X_test, Y_train , Y_test = train_test_split(X_resampled, Y_resampled, test_size = 0.2, random_state = 42)

## Step 11 : Hyperparameter tuning

In [94]:
# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

## Step 12 :  Implementing the GridSearchCV

In [96]:
#fitting the gridsearch 
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


## Step 13: Evaluating gridsearch 

In [97]:
# Evaluate the best model
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

## Step 14: Model Performance

In [98]:
# evaluating model performance
print('Tuned Model Accuracy Score:', accuracy_score(Y_test, y_pred))
print('Confusion Matrix:', confusion_matrix(Y_test, y_pred))
print('Classification Report:', classification_report(Y_test, y_pred))

Tuned Model Accuracy Score: 0.9099183929692404
Confusion Matrix: [[1489  144]
 [ 143 1410]]
Classification Report:               precision    recall  f1-score   support

           0       0.91      0.91      0.91      1633
           1       0.91      0.91      0.91      1553

    accuracy                           0.91      3186
   macro avg       0.91      0.91      0.91      3186
weighted avg       0.91      0.91      0.91      3186



#### 1. The tuned model has improved significantly from the previous iteration, achieving a 91% accuracy. 
#### 2. The precision, recall, and F1-score are well-balanced for both churn and non-churn classes.
#### 3. The model is now more effective at predicting both types of cases with minimal false positives and false negatives, showing a substantial improvement in its ability to detect churn.