In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#importing churn_model dataset for classification problems
churn=pd.read_csv("Churn_Modelling.csv")
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
#Drop the unwanted variables
#Since it is in series that is the variables to be dropped are together we can use iloc as well
churn1=churn.iloc[:,3:]
churn1.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
#Pre-Processing
#1.Missing value
churn1.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
#No  missing data


In [6]:
#2.Encoding
churn1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [9]:
#2.a)Label_encoder
churn1['Geography']=churn1['Geography'].astype('category')
churn1['Geography']=churn1['Geography'].cat.codes

churn1['Gender']=churn1['Gender'].astype('category')
churn1['Gender']=churn1['Gender'].cat.codes

In [10]:
churn1.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [11]:
#2.b)One-Hot-Encoder
churn1=pd.get_dummies(data=churn1,columns=['Geography','Gender'])

In [13]:
churn1.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_0,Geography_1,Geography_2,Gender_0,Gender_1
0,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [22]:
#2.c)Dummy Variables
churn1.drop(['Geography_0','Gender_0'],inplace=True,axis=1)


In [16]:
churn1.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_1,Geography_2,Gender_1
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [17]:
#3.Outlier
Q1=churn1.quantile(0.25)
Q3=churn1.quantile(0.75)
IQR=Q3-Q1

churn2=churn1[~((churn1<(Q1-1.5*IQR))|(churn1>(Q3+1.5*IQR))).any(axis=1)]

In [18]:
churn2.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_1,Geography_2,Gender_1
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
6,822,50,7,0.0,2,1,1,10062.8,0,0,0,1
8,501,44,4,142051.07,2,0,1,74940.5,0,0,0,1
9,684,27,2,134603.88,1,1,1,71725.73,0,0,0,1
10,528,31,6,102016.72,2,0,0,80181.12,0,0,0,1


In [20]:
print(churn1.shape)
print(churn2.shape)

(10000, 12)
(5689, 12)


In [None]:
#Here almost half of the data got dropped because of outlier treatment and since its classification problem we just need to predict yes/No etc rather 
#than predicting exact number as in regression, so suggest not to do outlier treatment ---so dont do outlier part

In [23]:
#So taking orgibal dataset
#4.Feature Scaling(Instead of outlier treatment we can do Feature Scaling,so we are not dropping any data)
#But since we are doing ensemble model like RF-Bagging method feature scaling is also not required

In [24]:
#5.Imbalance Dataset
churn1['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [None]:
#here majority class>=2*minority class

In [None]:
#Do oversampling

In [25]:
#Split data into x and y
x=churn1.drop(['Exited'],axis=1)
y=churn1['Exited']

In [26]:
x.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_1,Geography_2,Gender_1
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [27]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [None]:
#Since we have imbalance dataset so we have to use oversampling method to balance the data

In [28]:
import imblearn
#status of imbalance
#majority class-->0  -->  7963
#minority class-->1  -->  2037

#Status of balance
#majority class-->0  -->  7963
#minority class-->1  -->  7963

from imblearn.over_sampling import RandomOverSampler
over=RandomOverSampler()
x_over,y_over=over.fit_resample(x,y)

In [32]:
print(x_over.shape)
print(y_over.shape)

(15926, 11)
(15926,)


In [33]:
7963*2

15926

In [None]:
#Preprocessing DOne

In [None]:
#Split the data into training and test for model building and evaluation

In [35]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_over,y_over,train_size=0.75,random_state=505)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(11944, 11)
(11944,)
(3982, 11)
(3982,)


In [None]:
#Bagging Model


In [36]:
from sklearn.ensemble import BaggingClassifier
bagging=BaggingClassifier()
bagging.fit(x_train,y_train)

BaggingClassifier()

In [38]:
y_pred_train=bagging.predict(x_train)
y_pred_test=bagging.predict(x_test)

In [39]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


In [41]:
print(classification_report(y_train,y_pred_train))
print("####"*20)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5982
           1       1.00      1.00      1.00      5962

    accuracy                           1.00     11944
   macro avg       1.00      1.00      1.00     11944
weighted avg       1.00      1.00      1.00     11944

################################################################################
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      1981
           1       0.91      0.97      0.94      2001

    accuracy                           0.94      3982
   macro avg       0.94      0.94      0.94      3982
weighted avg       0.94      0.94      0.94      3982



In [42]:
print(confusion_matrix(y_train,y_pred_train))
print("####"*20)
print(confusion_matrix(y_test,y_pred_test))

[[5964   18]
 [  13 5949]]
################################################################################
[[1793  188]
 [  70 1931]]


In [43]:
print(accuracy_score(y_train,y_pred_train))
print("####"*20)
print(accuracy_score(y_test,y_pred_test))

0.9974045545880776
################################################################################
0.9352084379708689


In [None]:
#No high variance as 6% diiference in accuracy between train and test and no high bias as both is 70%problem

In [None]:
#Random Forest Model

In [None]:
#With entropy

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=200,criterion='entropy')
rf.fit(x_train,y_train)


RandomForestClassifier(criterion='entropy', n_estimators=200)

In [45]:
y_predict_rf_train=rf.predict(x_train)
y_predict_rf_test=rf.predict(x_test)

In [47]:
print(accuracy_score(y_train,y_predict_rf_train))
print("####"*20)
print(accuracy_score(y_test,y_predict_rf_test))

1.0
################################################################################
0.9500251130085384


In [None]:
#Compare with Bagging ,we can see that RF-with entropy is better

In [None]:
#With Gini

In [49]:
from sklearn.ensemble import RandomForestClassifier
rf1=RandomForestClassifier(n_estimators=200,criterion='gini')
rf1.fit(x_train,y_train)


RandomForestClassifier(n_estimators=200)

In [50]:
y_predict_rf_gini_test=rf1.predict(x_test)
y_predict_rf_gini_train=rf1.predict(x_train)

In [51]:
print(accuracy_score(y_train,y_predict_rf_gini_train))
print("####"*20)
print(accuracy_score(y_test,y_predict_rf_gini_test))

1.0
################################################################################
0.9502762430939227


In [None]:
#Compare with Bagging and RF-with entropy ,we can see that RF-with gini is better

In [None]:
#DecisionTree Classifier

In [52]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(x_train,y_train)

DecisionTreeClassifier()

In [53]:
y_pred_dt_train=dt.predict(x_train)
y_pred_dt_test=dt.predict(x_test)

In [54]:
print(accuracy_score(y_train,y_pred_dt_train))
print('####'*20)
print(accuracy_score(y_test,y_pred_dt_test))

1.0
################################################################################
0.9085886489201407


In [None]:
#RF and bagging is better than DT

In [None]:
#Logistic Regression

In [55]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(x_train,y_train)

LogisticRegression()

In [56]:
y_pred_lr_train=lr.predict(x_train)
y_pred_lr_test=lr.predict(x_test)

In [57]:
print(accuracy_score(y_train,y_pred_lr_train))
print('####'*20)
print(accuracy_score(y_test,y_pred_lr_test))

0.6575686537173476
################################################################################
0.6627322953289804


In [None]:
#there is high bias --reject it-- worst 

In [None]:
#Check whether it is because we have oversampled data so instead of x_over and y_over while splitting , just use x and y
#still its not a good approach as classification preoblem we have to handle imbalance dataset

In [58]:
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1=train_test_split(x,y,train_size=0.75,random_state=505)
print(x_train1.shape)
print(y_train1.shape)
print(x_test1.shape)
print(y_test1.shape)

(7500, 11)
(7500,)
(2500, 11)
(2500,)


In [59]:
from sklearn.linear_model import LogisticRegression
lr1=LogisticRegression()
lr1.fit(x_train1,y_train1)

LogisticRegression()

In [60]:
y_pred_lr_train1=lr.predict(x_train1)
y_pred_lr_test1=lr.predict(x_test1)

In [61]:
print(accuracy_score(y_train1,y_pred_lr_train1))
print('####'*20)
print(accuracy_score(y_test1,y_pred_lr_test1))

0.6550666666666667
################################################################################
0.6608


In [None]:
#it still worst