In [1]:
#Import Libraries
import pandas as pd
import numpy as np

%matplotlib inline 

import matplotlib
import matplotlib.pyplot as plt 
from keras.layers import Dropout
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn import metrics

Using TensorFlow backend.


## 1. Read the dataset

In [2]:
# reading the CSV file into pandas dataframe
Loan = pd.read_csv("C:/Users/Tanu/Desktop/Greatleaning/Feb'19/Project_Neuralnet/bank.csv")  

print('Getting the glimpse of the data')
Loan.head(5)

print('Dimensions')
Loan.shape

print('variable type')
Loan.info()


Getting the glimpse of the data
Dimensions
variable type
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [3]:
# To check if any missing value in the dataset

Loan.isnull().values.any()

# No missing value in the data

False

# 2. Drop the columns which are unique for all users like IDs 

In [4]:
# dropping id, date & zipcode columns  as these columns are unique for all user
Loan1 = Loan.drop({'RowNumber','CustomerId','Surname'}, axis=1)

# Seeing output
Loan1.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# 3. Distinguish the feature and target set 

In [5]:
# From the problem statement of the project, it is clear that bank want to predict if the person will stay or it will exit
# Bank want to build the classifier to make the prediction

# Basis this information, 'Exit' is the target variable which shows if the customer is stayed or leave. 
#Exit = 1 menas customer has left

#Rest of the variable from Loan1 dataframe can be used for feature set  

# The task of creating Feature set & target set is done below:

In [6]:
# creating the feature set 
X = Loan1.drop('Exited', axis = 1)


# creating the target set 
y = Loan1['Exited']

In [7]:
# Seeing output of X

X.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58


In [8]:
# From the above output, we can see that 'Geogarphy' & 'Gender' are categorical 
#we need to convert categorical variables into numerical vars so that we can process them in the model

In [9]:
# Converting 'Gender' into numeric

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
X['Gender'] = labelencoder.fit_transform(X.Gender)

X['Gender'] = X['Gender'].astype('int64')

In [10]:
# Converting 'Geography' into numeric
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
X['Geography'] = labelencoder.fit_transform(X.Geography)

In [11]:
# As 'Geography' has three labels, changing these labels by creating dummy variables for each label & storing them in specific columns 
# using one hot coding 

# Importing one-hot encoder
from sklearn.preprocessing import OneHotEncoder

# creating dummy columns 
countryhotencoder = OneHotEncoder(categorical_features = [1]) 
X = countryhotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
# Shape
X.shape

(10000, 12)

In [13]:
# Dropping one label of geography so that multicolinerty won't occure
X = X[:,1:]

In [14]:
# Seeing output of y
y.head(5)

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

# 4. Divide the data set into Train and test sets

In [15]:
#Test train split into 80:20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=201)


# 5. Normalize the train and test data 

In [16]:
# Scaling 
#Importing Standardscaler 
from sklearn.preprocessing import StandardScaler

# Standardizing 
sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 6. Initialize & build the model 

In [17]:
# Importing Keras & corresponding packages
import keras
from keras.models import Sequential 
from keras.layers import Dense

In [18]:
# Initializing Neural net & setting up the parameters for its layer

#Initializing Neural Network
classifier = Sequential()

# Creating the first hidden layer
classifier.add(Dense(activation = 'relu', input_dim = 11, units=6, kernel_initializer='uniform'))

# Creating the second hidden layer
classifier.add(Dense(activation = 'relu', units=6, kernel_initializer='uniform')) 

#Creating output layer
classifier.add(Dense(activation = 'sigmoid', units=1, kernel_initializer='uniform')) 

# 7.Optimize the model 

In [19]:
#Compiling the ANN using 'adam' optimizer
classifier.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [20]:
# Checking length of Test -train 
print(len(X_train))
print(len(y_train))

8000
8000


In [21]:
#Fitting the classifier 
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0xef7dac8>

# 8.Predict the results using 0.5 as a threshold 

In [26]:
#Predicting the test results without any thresh hold
y_pred = classifier.predict(X_test)
print(y_pred)

#Predicting the test results with 5% threshhold
y_pred = (y_pred > 0.5)
print(y_pred1)

[[0.07451803]
 [0.22839832]
 [0.3333406 ]
 ...
 [0.06693814]
 [0.07258716]
 [0.01427286]]
[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


# 9.Print confusion matrix  and Accuracy score

In [30]:
# Calculating confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1515   62]
 [ 244  179]]


In [31]:
# Accuracy Score
print ('Accuracy Score')
print (((cm[0][0]+cm[1][1])*100)/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0]))

Accuracy Score
84.7


In [None]:
# From the above accuracy score, model is able to classify 84.7% observation correctly