In [2]:
# Import the libraries
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix

In [40]:
# Read the dataset
ds = pd.read_csv("Churn_Modelling.csv")

In [41]:
ds.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [42]:
# Drop the columns which are unique for all users like IDs
ds['Geography'].value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [43]:
# Rownumber, Customer id, and surname are unique so we drop them
ds = ds.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [44]:
# Distinguish the features and target set
X = ds.iloc[:, 0: 10].values
y = ds.iloc[:, 10].values

In [45]:
#Encoding categorical (string based) data. Country: there are 3 options: France, Spain and Germany. This will convert those strings into scalar values for analysis.
print(X[:8, 1], '... will become: ')
lable_X_country_encoder = LabelEncoder()
X[:, 1] = lable_X_country_encoder.fit_transform(X[:, 1])
print(X[:8, 1])

['France' 'Spain' 'France' 'France' 'Spain' 'Spain' 'France' 'Germany'] ... will become: 
[0 2 0 0 2 2 0 1]


In [46]:
# We will do the same thing for gender, this will be binary in the dataset
print(X[6, 1], '... will become: ')
label_X_gender_encoder = LabelEncoder()
X[:, 2] = label_X_gender_encoder.fit_transform(X[:, 2])
print(X[:6, 1])

0 ... will become: 
[0 2 0 0 2 2]


In [47]:
#The Problem here is that we are treating the countries as one variable with ordinal values (0 < 1 < 2). Therefore, one way to get rid of that problem is to split the countries into respective dimensions. Gender does not need this as it is binary
#Converting the string features into their own dimensions. Gender doesn't matter here because its binary
countryhotencoder = OneHotEncoder(categorical_features=[1])
X = countryhotencoder.fit_transform(X).toarray()

In [48]:
X.shape

(10000, 12)

In [49]:
X

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085580e+04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888520e+04],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190780e+04]])

In [50]:
# A 0 on two countries means that the country has to be the one variable which wasn't included 
# This will save us from the problem of using too many dimensions
X = X[:,1:] # Got rid of Spain as a dimension.

In [53]:
# Divide the data into train and test
# Splitting the data into trainand test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 10)

In [None]:
# Normalize the train and test data
from sklearn.preprocessing import StandardScaler()
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X)

In [71]:
classifier = Sequential()

In [72]:
classifier.add(Dense(activation='relu', input_dim = 11, units=11, kernel_initializer='uniform'))

In [73]:
classifier.add(Dense(6, activation='sigmoid', kernel_initializer='uniform'))

In [76]:
classifier.add(Dense(1, activation='sigmoid', kernel_initializer='uniform'))

In [77]:
classifier.compile(optimizer='sgd', loss='mse', metrics=['accuracy'])

In [78]:
classifier.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 11)                132       
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 72        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 211
Trainable params: 211
Non-trainable params: 0
_________________________________________________________________


In [79]:
classifier.fit(X_train, y_train,
               validation_data = (X_test, y_test),
               epochs = 100,
               batch_size = 32)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<tensorflow.python.keras.callbacks.History at 0x15575c410>

In [82]:
# Predict the result as 0.5 as threshold
y_pred = classifier.predict(X_test)
print(y_pred)

[[0.20187521]
 [0.20187521]
 [0.20187521]
 ...
 [0.20187521]
 [0.20187521]
 [0.20187521]]


In [83]:
y_pred = (y_pred > 0.5)
print(y_pred)

[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [85]:
cm1 = confusion_matrix(y_test,y_pred)
print(cm1)

[[1578    0]
 [ 422    0]]


In [86]:
accuracy_model1 = ((cm1[0][0]+cm1[1][1])*100)/(cm1[0][0]+cm1[1][1]+cm1[0][1]+cm1[1][0])
print (accuracy_model1, '% of testing data was classified correctly')

78.9 % of testing data was classified correctly


In [87]:
Optimize the model
Some important parameters to look out for while optimizing neural networks are:

-Type of architecture

-Number of Layers

-Number of Neurons in a layer

-Regularization parameters

-Learning Rate

-Type of optimization / backpropagation technique to use

-Dropout rate

-Weight sharing

Number of Layers:
We will keep it similar to the above model so that we can compare the accuracy. 1 hidden layer.

Activation:
input layer: relu becasue we are in an input layer. uses the ReLu activation function for ϕ output layer: sigmoid becasue we are in an output layer. uses the Sigmoid activation function for ϕ . This is used instead of the ReLu function becasue it generates probabilities for the outcome. We want the probability that each customer leaves the bank.

Type of optimization / backpropagation technique to use:
We will use Adam. Adam is a very efficeint variation of Stochastic Gradient Descent. For Adam and its variant, learning rate or the decay rate does not really matter too much.

Learning Rate:
default learning rate 0.001.

Number of Neurons in a layer:
We will keep it 6 as per our initial calculation above.

Weight sharing / kernel_initializer:
uniform the distribution with which we randomly initialize weights for the nodes in this layer.

Loss:
loss: binary_crossentropy This is the loss function used within adam. This should be the logarthmic loss. If our dependent (output variable) is Binary, it is binary_crossentropy. If Categorical, then it is called categorical_crossentropy

Rebuilding the model using these optimised parameters¶

SyntaxError: invalid syntax (<ipython-input-87-e82a908f5a06>, line 1)

In [103]:
classifier = Sequential()

In [104]:
classifier.add(Dense(activation='relu', input_dim = 11, units=5, kernel_initializer='uniform'))

In [105]:
classifier.add(Dense(activation='relu', units=6, kernel_initializer='uniform'))

In [106]:
classifier.add(Dense(activation='sigmoid', units=1, kernel_initializer='uniform'))

In [107]:
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [108]:
classifier.fit(X_train, y_train,
               validation_data=(X_test, y_test),
               epochs=100,
               batch_size=200)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<tensorflow.python.keras.callbacks.History at 0x1562d5d10>

In [109]:
y_pred = classifier.predict(X_test)
print(y_pred)

[[0.20227776]
 [0.0753125 ]
 [0.23014206]
 ...
 [0.20357516]
 [0.1756397 ]
 [0.16867915]]


In [110]:
y_pred = (y_pred > 0.5)

In [111]:
cm2 = confusion_matrix(y_test, y_pred)
print(cm2)

[[1578    0]
 [ 422    0]]


In [112]:
accuracy_model2 = ((cm2[0][0]+cm2[1][1])*100)/(cm2[0][0]+cm2[1][1]+cm2[0][1]+cm2[1][0])
print (accuracy_model2, '% of testing data was classified correctly')

78.9 % of testing data was classified correctly
