In [2]:
# Importing pandas as pd and numpy as np

import pandas as pd
import numpy as np

In [3]:
# Loading the dataset into a pandas DataFrame

df = pd.read_csv("bank_churn_data.csv")

In [4]:
# Displaying the first row of the dataset

df.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [6]:
# Checking the information related to dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [7]:
# Describing the dataset

df.describe

<bound method NDFrame.describe of        RowNumber  CustomerId    Surname  CreditScore Geography  Gender   Age  \
0              1    15634602   Hargrave          619    France  Female  42.0   
1              2    15647311       Hill          608     Spain  Female  41.0   
2              3    15619304       Onio          502    France  Female  42.0   
3              4    15701354       Boni          699    France  Female  39.0   
4              5    15737888   Mitchell          850     Spain  Female  43.0   
...          ...         ...        ...          ...       ...     ...   ...   
9997        9998    15584532        Liu          709    France  Female  36.0   
9998        9999    15682355  Sabbatini          772   Germany    Male  42.0   
9999        9999    15682355  Sabbatini          772   Germany    Male  42.0   
10000      10000    15628319     Walker          792    France  Female  28.0   
10001      10000    15628319     Walker          792    France  Female  28.0   

     

In [8]:
# Checking number of rows and columns

df.shape

(10002, 14)

In [9]:
# Displaying the last five rows of the dataset

df.tail()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9997,9998,15584532,Liu,709,France,Female,36.0,7,0.0,1,0.0,1.0,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
9999,9999,15682355,Sabbatini,772,Germany,Male,42.0,3,75075.31,2,1.0,0.0,92888.52,1
10000,10000,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0
10001,10000,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0


In [11]:
# Counting the value of having chances of Exited

df["Exited"].value_counts()

0    7964
1    2038
Name: Exited, dtype: int64

In [10]:
# Step:1 Checking NaN values

df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [12]:
# We know that if there are NaN values in the dataset that we can drop the 5% of data. 
# But if NaN values are more that 5% of total dataset then we can't drop it.
# So we can drop Geography, Age, HasCrCard, IsActiveMember because there is only 1 NaN value in these columns.

In [13]:
# Let's drop the columns

df = df.dropna(subset=["Geography"])
df = df.dropna(subset=["Age"])
df = df.dropna(subset=["HasCrCard"])
df = df.dropna(subset=["IsActiveMember"])

In [14]:
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [15]:
df.head(1)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1


In [16]:
# Encoding of Categorical Data 

In [17]:
# 1.) Using Map: We use map function for encoding the Sex and Homeownership_Status columns.

df['Gender'] = df['Gender'].map({"Male":0, "Female":1})

In [18]:
# 2.) Using Label Encode: We use LabelEncoder for Geography

from sklearn.preprocessing import LabelEncoder

In [19]:
df["Geography"] = LabelEncoder().fit_transform(df["Geography"])

In [20]:
# I are not doing encoding on Surname because I will not include it in my features.

In [22]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,0,1,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,2,1,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,0,1,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,0,1,39.0,1,0.0,2,0.0,0.0,93826.63,0
5,6,15574012,Chu,645,2,0,44.0,8,113755.78,2,1.0,0.0,149756.71,1


In [23]:
# Normalization

# We will normalize the column Age, CreditScore, Balance, Tenure, NumOfProducts and EstimatedSalary in the range of 0 to 1.

In [24]:
# Importing MinMaxScaler for data preprocessing

from sklearn.preprocessing import MinMaxScaler

In [25]:
# Normalizing the Age, CreditScore, Balance, Tenure, NumOfProducts and EstimatedSalary columns.

df["CreditScore"] = MinMaxScaler().fit_transform(df[["CreditScore"]])
df["Age"] = MinMaxScaler().fit_transform(df[["Age"]])
df["Tenure"] = MinMaxScaler().fit_transform(df[["Tenure"]])
df["Balance"] = MinMaxScaler().fit_transform(df[["Balance"]])
df["NumOfProducts"] = MinMaxScaler().fit_transform(df[["NumOfProducts"]])
df["EstimatedSalary"] = MinMaxScaler().fit_transform(df[["EstimatedSalary"]])

In [26]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,0.538,0,1,0.324324,0.2,0.0,0.0,1.0,1.0,0.506735,1
1,2,15647311,Hill,0.516,2,1,0.310811,0.1,0.334031,0.0,0.0,1.0,0.562709,0
2,3,15619304,Onio,0.304,0,1,0.324324,0.8,0.636357,0.666667,1.0,0.0,0.569654,1
3,4,15701354,Boni,0.698,0,1,0.283784,0.1,0.0,0.333333,0.0,0.0,0.46912,0
5,6,15574012,Chu,0.59,2,0,0.351351,0.8,0.453394,0.333333,1.0,0.0,0.748797,1


In [27]:
# We will separate the Feature Varible and Target Variable

features = ["CreditScore", "Geography", "Gender", "Age", "Tenure", "Balance", "NumOfProducts", "HasCrCard", "IsActiveMember", "EstimatedSalary"]
X = df[features].values
Y = df["Exited"].values

In [28]:
# Displaying features columns 

df[features]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,0.538,0,1,0.324324,0.2,0.000000,0.000000,1.0,1.0,0.506735
1,0.516,2,1,0.310811,0.1,0.334031,0.000000,0.0,1.0,0.562709
2,0.304,0,1,0.324324,0.8,0.636357,0.666667,1.0,0.0,0.569654
3,0.698,0,1,0.283784,0.1,0.000000,0.333333,0.0,0.0,0.469120
5,0.590,2,0,0.351351,0.8,0.453394,0.333333,1.0,0.0,0.748797
...,...,...,...,...,...,...,...,...,...,...
9997,0.718,0,1,0.243243,0.7,0.000000,0.000000,0.0,1.0,0.210390
9998,0.844,1,0,0.324324,0.3,0.299226,0.333333,1.0,0.0,0.464429
9999,0.844,1,0,0.324324,0.3,0.299226,0.333333,1.0,0.0,0.464429
10000,0.884,0,1,0.135135,0.4,0.518708,0.000000,1.0,0.0,0.190914


In [29]:
# Get the features variable in numpy array

df[features].values

array([[0.538     , 0.        , 1.        , ..., 1.        , 1.        ,
        0.50673489],
       [0.516     , 2.        , 1.        , ..., 0.        , 1.        ,
        0.56270874],
       [0.304     , 0.        , 1.        , ..., 1.        , 0.        ,
        0.56965435],
       ...,
       [0.844     , 1.        , 0.        , ..., 1.        , 0.        ,
        0.46442905],
       [0.884     , 0.        , 1.        , ..., 1.        , 0.        ,
        0.19091423],
       [0.884     , 0.        , 1.        , ..., 1.        , 0.        ,
        0.19091423]])

In [30]:
# Now, Let's train the model

from sklearn.model_selection import train_test_split

In [31]:
# Split the data in 70% and 30% for training and testing the model

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.03)

In [32]:
# Algorithms abbrevations full form

# LOR: Logistic Regression
# KNN: K-Nearest Neighbour
# DT: Decision Tree
# RF: Random Forest
# GB: Gradient Boosting
# XGB: Extreme Gradient Boosting

In [33]:
# In this we are using the LOR, KNN, DT, RF, GB and XGB alogorithm for taining the model

from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [34]:
# Here we are creating a model and training the model

model_LOR = LogisticRegression(max_iter=500).fit(X_train, Y_train) 
model_KNN = KNeighborsClassifier(n_neighbors=12).fit(X_train, Y_train)
model_DT = DecisionTreeClassifier().fit(X_train, Y_train)
model_RF = RandomForestClassifier().fit(X_train, Y_train)
model_GB = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, Y_train)
model_XGB = xgb.XGBClassifier().fit(X_train, Y_train)

In [35]:
# Predicting the Test set results

Y_pred_LOR = model_LOR.predict(X_test)
Y_pred_KNN = model_KNN.predict(X_test)
Y_pred_DT = model_DT.predict(X_test)
Y_pred_RF = model_RF.predict(X_test)
Y_pred_GB = model_GB.predict(X_test)
Y_pred_XGB = model_XGB.predict(X_test)

In [36]:
# Importing accuracy_score from sklearn.metrics

from sklearn.metrics import accuracy_score

In [53]:
# Calculating the model accuracy
# accuracy_score: It is ratio of number of correct output and total number of output

acc_LOR = accuracy_score(Y_test, Y_pred_LOR)*100
acc_KNN = accuracy_score(Y_test, Y_pred_KNN)*100
acc_DT = accuracy_score(Y_test, Y_pred_DT)*100
acc_RF = accuracy_score(Y_test, Y_pred_RF)*100
acc_GB = accuracy_score(Y_test, Y_pred_GB)*100
acc_XGB = accuracy_score(Y_test, Y_pred_XGB)*100

In [54]:
# Printing the accuracy of all algorithms

print("The accuracy for LOR is: ", acc_LOR)
print("The accuracy for KNN is: ", acc_KNN)
print("The accuracy for Decision Tree is: ", acc_DT)
print("The accuracy for Random Forest is: ", acc_RF)
print("The accuracy for Gradient Boosting is: ", acc_GB)
print("The accuracy for Extreme Gradient Boosting is: ", acc_XGB)

The accuracy for LOR is:  79.0
The accuracy for KNN is:  83.33333333333334
The accuracy for Decision Tree is:  77.33333333333333
The accuracy for Random Forest is:  86.66666666666667
The accuracy for Gradient Boosting is:  86.33333333333333
The accuracy for Extreme Gradient Boosting is:  85.0


In [55]:
# Importing precision_score, recall_score and f1_score

from sklearn.metrics import precision_score, recall_score, f1_score

In [56]:
# Calculating the model precision_score, recall_score and f1_score values for LOR

precision_LOR = precision_score(Y_test, Y_pred_LOR)
recall_LOR = recall_score(Y_test, Y_pred_LOR)
f1_LOR = f1_score(Y_test, Y_pred_LOR)

In [57]:
# Printing the precision_score, recall_score and f1_score values for LOR algorithms

print("Precision of LOR:", precision_LOR*100)
print("Recall of LOR:", recall_LOR*100)
print("F1 Score of LOR:", f1_LOR*100)

Precision of LOR: 30.0
Recall of LOR: 5.084745762711865
F1 Score of LOR: 8.695652173913043


In [58]:
# Calculating the model precision_score, recall_score and f1_score values for KNN

precision_KNN = precision_score(Y_test, Y_pred_KNN, zero_division=1)
recall_KNN = recall_score(Y_test, Y_pred_KNN, zero_division=1)
f1_KNN = f1_score(Y_test, Y_pred_KNN, zero_division=1)


In [59]:
# Printing the precision_score, recall_score and f1_score values for KNN algorithms

print("Precision of KNN:", precision_KNN*100)
print("Recall of KNN:", recall_KNN*100)
print("F1 Score of KNN:", f1_KNN*100)

Precision of KNN: 76.47058823529412
Recall of KNN: 22.033898305084744
F1 Score of KNN: 34.210526315789465


In [60]:
# Calculating the model precision_score, recall_score and f1_score values for DT

precision_DT = precision_score(Y_test, Y_pred_DT)
recall_DT = recall_score(Y_test, Y_pred_DT)
f1_DT = f1_score(Y_test, Y_pred_DT)

In [61]:
# Printing the precision_score, recall_score and f1_score values for DT algorithms

print("Precision of DT:", precision_DT*100)
print("Recall of DT:", recall_DT*100)   
print("F1 Score of DT:", f1_DT*100)

Precision of DT: 42.857142857142854
Recall of DT: 45.76271186440678
F1 Score of DT: 44.26229508196722


In [62]:
# Calculating the model precision_score, recall_score and f1_score values for RF

precision_RF = precision_score(Y_test, Y_pred_RF)
recall_RF = recall_score(Y_test, Y_pred_RF)
f1_RF = f1_score(Y_test, Y_pred_RF)

In [63]:
# Printing the precision_score, recall_score and f1_score values for RF algorithms

print("Precision of RF:", precision_RF*100)
print("Recall of RF:", recall_RF*100)
print("F1 Score of RF:", f1_RF*100)

Precision of RF: 80.64516129032258
Recall of RF: 42.3728813559322
F1 Score of RF: 55.55555555555555


In [64]:
# Calculating the model precision_score, recall_score and f1_score values for GB

precision_GB = precision_score(Y_test, Y_pred_GB)
recall_GB = recall_score(Y_test, Y_pred_GB)
f1_GB = f1_score(Y_test, Y_pred_GB)

In [65]:
# Printing the precision_score, recall_score and f1_score values for GB algorithms

print("Precision of GB:", precision_GB*100)
print("Recall of GB:", recall_GB*100)
print("F1 Score of GB:", f1_GB*100)

Precision of GB: 80.0
Recall of GB: 40.67796610169492
F1 Score of GB: 53.93258426966292


In [66]:
# Calculating the model precision_score, recall_score and f1_score values for XGB

precision_XGB = precision_score(Y_test, Y_pred_XGB)
recall_XGB = recall_score(Y_test, Y_pred_GB)
f1_XGB = f1_score(Y_test, Y_pred_XGB)

In [67]:
# Printing the precision_score, recall_score and f1_score values for XGB algorithms

print("Precision of XGB:", precision_XGB*100)
print("Recall of XGB:", recall_XGB*100)
print("F1 Score of XGB:", f1_XGB*100)

Precision of XGB: 69.44444444444444
Recall of XGB: 40.67796610169492
F1 Score of XGB: 52.63157894736842


In [None]:
# Random Forest is the best choice due to:
# Accuracy: Highest for Random Forest (86.67%).
# Precision: Highest for Random Forest (80.65%).
# Recall: Competitive but lower than Decision Tree (42.37%).
# F1 Score: Highest for Random Forest (55.56%).
# It balances well between precision and recall.
#  It effectively handles overfitting, and performs robustly on complex data, making it ideal for predicting customer churn.

In [68]:
# Creating some plots

from matplotlib import pyplot as plt
import seaborn as sns

In [69]:
model_RF

In [70]:
pwd

'C:\\Users\\Lenovo\\OneDrive\\Desktop\\Juypter_projects'

In [71]:
# Saving the model

import joblib

In [73]:
# Save the model as pickel file

joblib.dump(model_GB, "bank_customer_churn_model.pkl")

['bank_customer_churn_model.pkl']

In [74]:
# load the model from the file

loaded_model = joblib.load("bank_customer_churn_model.pkl")