In [3]:
import pandas  as pd
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns 

## Data Collection and Exploration

In [4]:
df = pd.read_csv(r"Bank_churn_modelling.csv")
df.shape

(10000, 14)

In [5]:
df.head(2)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [6]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


## Data Cleaning

In [7]:
#check for missing values
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [8]:
# check for duplicates
df.duplicated().sum()

0

In [9]:
# check for outliers
df.skew()

  df.skew()


RowNumber          0.000000
CustomerId         0.001149
CreditScore       -0.071607
Age                1.011320
Tenure             0.010991
Balance           -0.141109
NumOfProducts      0.745568
HasCrCard         -0.901812
IsActiveMember    -0.060437
EstimatedSalary    0.002085
Exited             1.471611
dtype: float64

In [10]:
# drop unwanted columns
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [11]:
df.drop(columns=['RowNumber','CustomerId','Surname'],inplace=True)
df.shape

(10000, 11)

## Feature Selection

### ANOVA
- to assess whether the distribution of a numeric attribute is similar across groups or not (exited and not exited)

NUll Hypothesis: distribution of numeric attribute across two groups is similar
consider CI = 95%, significance level, alpha = 5% = 0.05

- if pvalue>alpha = we fail to reject null hypothesis = feature is not informative
- if pvalue<alpha = we successfully reject null hypothesis = feature is informative


In [13]:
# ANOVA 
nums = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
xnum = df[nums]
y = df['Exited']

from sklearn.feature_selection import f_classif
fscore,pvalue = f_classif(xnum,y)

for i in range(len(nums)):print(nums[i],pvalue[i])

CreditScore 0.006738213892258643
Age 1.2399313093415039e-186
Tenure 0.1615268494952801
Balance 1.275563319153163e-32
NumOfProducts 1.7173330048040421e-06
EstimatedSalary 0.22644042802376574


### Chi Square test
- to assess whether the distribution of a categoric attribute is similar across groups or not (exited and not exited)

NUll Hypothesis: distribution of categoric attribute across two groups is similar
consider CI = 95%, significance level, alpha = 5% = 0.05

- if pvalue>alpha = we fail to reject null hypothesis = feature is not informative
- if pvalue<alpha = we successfully reject null hypothesis = feature is informative


In [14]:
cats = [ 'Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
xcat = df[cats]
y = df['Exited']
from sklearn.preprocessing import LabelEncoder
xcat['Gender'] = LabelEncoder().fit_transform(xcat['Gender'])
xcat['Geography'] = LabelEncoder().fit_transform(xcat['Geography'])

from sklearn.feature_selection import chi2
cscore,pvalue = chi2(xcat,y)
for i in range(len(cats)):print(cats[i],pvalue[i])

Geography 0.0005756078382573235
Gender 7.015574513879596e-13
HasCrCard 0.6984962089530451
IsActiveMember 1.5680362405434552e-27


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xcat['Gender'] = LabelEncoder().fit_transform(xcat['Gender'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xcat['Geography'] = LabelEncoder().fit_transform(xcat['Geography'])


In [15]:
x = df[['CreditScore', 'Geography', 'Gender', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember']]
y =df['Exited']

In [16]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,stratify=y,random_state=5)
print(x.shape,xtrain.shape,xtest.shape)
print(y.shape,ytrain.shape,ytest.shape)

(10000, 7) (8000, 7) (2000, 7)
(10000,) (8000,) (2000,)


### Preprocessing

In [17]:
x.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Balance,NumOfProducts,IsActiveMember
0,619,France,Female,42,0.0,1,1
1,608,Spain,Female,41,83807.86,1,1
2,502,France,Female,42,159660.8,3,0
3,699,France,Female,39,0.0,2,0
4,850,Spain,Female,43,125510.82,1,1


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

transformer = ColumnTransformer([('ohe',OneHotEncoder(drop="first"),[1,2]),],remainder='passthrough')
transformer.fit(xtrain)
xtrain = transformer.transform(xtrain)
xtest = transformer.transform(xtest)
print(xtrain.shape)

(8000, 8)


# Modelling

In [21]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='gini',min_samples_leaf=10,max_depth=12,
                               class_weight='balanced',random_state=5)
model.fit(xtrain,ytrain)

In [23]:
from sklearn import metrics
print(metrics.classification_report(ytest,model.predict(xtest)))

              precision    recall  f1-score   support

           0       0.92      0.77      0.84      1593
           1       0.46      0.74      0.56       407

    accuracy                           0.77      2000
   macro avg       0.69      0.76      0.70      2000
weighted avg       0.83      0.77      0.78      2000

