In [22]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import numpy as np # linear algebra

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [23]:
df = pd.read_csv("Churn_Modelling.csv")

In [24]:
df.head(5) # Generally done to display get first look of the dataset in first five rows by default.

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [25]:
#Gives one an idea of number of features 
# and rows that will be trained further in the model.
df.shape

(10000, 14)

In [26]:
#Returns all statiscal details for each feature present in the data.

df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [27]:
# Returns the data type of each column .
# Commmon idea - One gets to decide which columns are categorical

df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [28]:
#Returns null values present in the dataset


df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [29]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


* The main idea from here onwards is to convert numeric values into categorical ones since we are trying to classify customers who churned or not.

* Methods that we will try in this dataset -

* 1) Binning
* 2) One Hot Encoding

In [30]:
#Returns "what it says" - counts of a value in a particular column.
#bins - is a very underrated parameter but it can be used to get some really good
#grouping of data values

df.CreditScore.value_counts(bins=3)

(516.667, 683.333]    5350
(683.333, 850.0]      3741
(349.499, 516.667]     909
Name: CreditScore, dtype: int64

In [31]:
# To get number of unique values in Geography column

df.Geography.nunique()

3

In [32]:
# Creation of dummy variables for Gender and Geography 
# Idea behind this - 'Geography' if label encoded in 0, 1 ,2 will have no meaning , thus the model wont't be able to understand the importance of this column . 
#However label encoding 'Gender' could have made sense in a way and can also be done alternatively.

gender_cat = pd.get_dummies(df['Gender'] , drop_first=True)
geo_cat = pd.get_dummies(df['Geography'] , drop_first=True)

df=pd.concat([df , gender_cat ,geo_cat] , axis=1)

In [33]:
# Practice of checking the update done by above cell
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Male,Germany,Spain
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [34]:
#Dropping "of no use" columns to eliminate redundancy.

df.drop(columns=['Gender', 'Geography' , 'RowNumber', 'CustomerId' , 'Surname'] , axis = 1 , inplace = True )

In [35]:
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Male,Germany,Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [38]:
# Data that needs to trained goes in X and respective labels into y 

from sklearn.model_selection import train_test_split
y = df['Exited']
X = df.drop(['Exited'] , axis = 1)

#Splitting the data into training and testing by specifying it in the test_size ,using other parameters i.e. random_state and shuffle depends on you solely.

X_train, X_test, y_train, y_test = train_test_split(X , y , test_size=0.3 , random_state=0 , shuffle=False)


* Standardization - we center the feature columns at mean 0 with standard deviation 1 so that the feature columns take the form of a normal distribution, which makes it easier to learn the weights.


In [40]:
# Scaling is a crucial step to get apt results when your data value range across columns differ in large scale.

# StandardScaler and MinMaxScaler are more common when dealing with continuous numerical data.
from sklearn.preprocessing import StandardScaler # It will take mean =0 & std.deviation  = 1
ss = StandardScaler()

#Below columns get scaled for train and test respectively.
X_train[['Age',"Tenure" , 'Balance' , 'CreditScore' , "EstimatedSalary" , 'NumOfProducts']] = ss.fit_transform(X_train[['Age',"Tenure" , 'Balance' , 'CreditScore' , "EstimatedSalary" , 'NumOfProducts']])
X_test[['Age',"Tenure" , 'Balance' , 'CreditScore' , "EstimatedSalary" , 'NumOfProducts']] = ss.fit_transform(X_test[['Age',"Tenure" , 'Balance' , 'CreditScore' , "EstimatedSalary" , 'NumOfProducts']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age',"Tenure" , 'Balance' , 'CreditScore' , "EstimatedSalary" , 'NumOfProducts']] = ss.fit_transform(X_train[['Age',"Tenure" , 'Balance' , 'CreditScore' , "EstimatedSalary" , 'NumOfProducts']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/

In [41]:
#Importing the libraries we need to build a neural network

from tensorflow import keras
from tensorflow.keras import layers , Sequential 
from keras.layers import Dense

# We can piece it all together by adding each layer:

* The model expects sample of data with 11 features mentioned in the input_dim = 11 argument

* The first and the second hidden layer comprises of 8 nodes and uses the relu activation function.

* The output layer has 1 node and uses sigmoid activation function.

* Choosing number of nuerons for each hidden layer is intutive Using too many neurons in the hidden layers may result in overfitting.

# Few rules of thumb that one can consider for determining acceptable number of nuerons to use in the hidden layer -

* No. of hidden nuerons should be between the size of input layer and size of the output layer.

* No. of hidden layer neurons should be 2/3 the size of the input layer , plus the size of the output layer.

* No. of hidden nuerons should be less than twice the size of the input layer

* These three rules can give you a good start and it can eventually come down to intutive reasoning of trial and error in selecting the no. of neurons.

In [42]:
model=Sequential([
    layers.Dense(8, activation = 'relu' , input_shape = [11]),
    layers.Dense(8 , activation = 'relu' ),
    layers.Dense(1 , activation = 'sigmoid')
])
      

### NOTE - The most confusing thing here is that the shape of the input to the model is defined as an argument on the first hidden layer. This means that the line of code that adds the first Dense layer is doing 2 things, defining the input or visible layer and the first hidden layer.

* Once the model gets defined it can now be compiled.

* optimizer - adam ( stochasticc gradient descent algorithm ) , because it automatically tunes itself and give good results.

* loss - binary crossentropy ( to evaluate the error in current state of the model which will be estimated repeatedly )

* metrics = reports the classification accuracy



In [43]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

* Once the model gets compiled its ready to be trained.

* Epoch can be thought of as a nested for-loop that iterates over each batch of samples, where one batch has the specified “batch size” number of samples.

* Batch is analogous to a for-loop iterating over one or more samples and making predictions.

* These configurations can be chosen by trial and error.

In [44]:
model.fit(X_train, y_train, batch_size = 25, epochs = 100,verbose = 0)

<keras.callbacks.History at 0x28f3d290820>

In [45]:
# Predicting on train data
y_pred = model.predict(X_train)
score, acc = model.evaluate(X_train, y_train,batch_size=10)
print('Train score:', score)
print('Train accuracy:', acc*100)

Train score: 0.3242236375808716
Train accuracy: 86.68571710586548


In [46]:
# Predicting on test data
y_pred = model.predict(X_test)
score, acc = model.evaluate(X_test, y_test,batch_size=10)
print('Test score:', score)
print('Test accuracy:', acc*100)

Test score: 0.3327031433582306
Test accuracy: 86.33333444595337


In [47]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error


y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)*1
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
print("Confusion Matrix Accuracy: "+ str(accuracy*100)+"%")

#F1 score
recall=(cm[0][0])/(cm[0][0]+cm[0][1])
precision=(cm[0][0])/(cm[0][0]+cm[1][0])
F1=(2*recall*precision)/(precision+recall)
print("F1 Score:"+str(F1))

#MAE
mae=mean_absolute_error(y_test, y_pred)
print("MAE:"+str(mae))

[[2310   96]
 [ 314  280]]
Confusion Matrix Accuracy: 86.33333333333333%
F1 Score:0.9184890656063619
MAE:0.13666666666666666
