In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn import preprocessing
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from tensorflow.keras import optimizers

In [2]:
from google.colab import drive

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that files stored on Drive can be opened by path.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Drive folder holding this project's files. The trailing slash matters: the file name is appended directly to it.
project_path = '/content/drive/My Drive/aiml/'

In [5]:
# Full path of the CSV inside the project folder.
dataset_file = f'{project_path}bank.csv'

### 1. Read the dataset

In [6]:
# Load the CSV at `dataset_file` into a DataFrame.
data = pd.read_csv(dataset_file)
# Alternative for when bank.csv sits in the current working directory instead of on Drive:
#data = pd.read_csv('bank.csv')

In [7]:
# Peek at the first rows to confirm the CSV parsed into the expected columns.
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### 2. Drop the columns that are unique to each user, such as IDs

In [8]:
# Remove the per-customer identifier columns before modelling.
id_columns = ['RowNumber', 'CustomerId', 'Surname']
df1 = data.drop(columns=id_columns)

In [9]:
# One-hot encode the two categorical columns. drop_first omits the first level of each
# column, so that level is represented by all of its dummies being 0.
df2 = pd.get_dummies(
    df1,
    columns=['Geography', 'Gender'],
    drop_first=True,
)

### 3. Distinguish the feature and target sets

In [10]:
# Check the encoded frame: Geography and Gender are replaced by dummy columns.
df2.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [11]:
# Per-column summary statistics; shows how differently the features are scaled.
df2.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037,0.2509,0.2477,0.5457
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769,0.433553,0.431698,0.497932
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,0.0,0.0,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0,0.0,0.0,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0,0.0,0.0,1.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0,1.0,0.0,1.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0,1.0,1.0,1.0


In [12]:
# Column dtypes and non-null counts, to check for missing values.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Age                10000 non-null  int64  
 2   Tenure             10000 non-null  int64  
 3   Balance            10000 non-null  float64
 4   NumOfProducts      10000 non-null  int64  
 5   HasCrCard          10000 non-null  int64  
 6   IsActiveMember     10000 non-null  int64  
 7   EstimatedSalary    10000 non-null  float64
 8   Exited             10000 non-null  int64  
 9   Geography_Germany  10000 non-null  uint8  
 10  Geography_Spain    10000 non-null  uint8  
 11  Gender_Male        10000 non-null  uint8  
dtypes: float64(2), int64(7), uint8(3)
memory usage: 732.5 KB


In [13]:
# Target vector: the 'Exited' column.
y_data = df2['Exited']

In [14]:
# Display the target Series for a quick visual check.
y_data

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [15]:
# Class balance of the target; a skewed split means accuracy alone can be misleading.
y_data.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [16]:
# Feature matrix: every column except the target.
X_data = df2.drop(columns=['Exited'])

In [17]:
# Confirm that 'Exited' is no longer among the feature columns.
X_data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [18]:
# (rows, feature columns) of the feature matrix.
X_data.shape

(10000, 11)

### 4. Divide the data set into training and test set

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=7,
)

In [20]:
# Shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(8000, 11)
(2000, 11)


### 5. Normalize the train and test data

In [21]:
# Feature-scaling history:
#   - preprocessing.normalize (first attempt) gave an F1 score of 0.
#   - preprocessing.normalize(..., axis=0) improved the F1 score to 0.47.
#   - StandardScaler reached an F1 score of about 0.59, so it is used here.
#
# The scaler is fitted on the training split ONLY. The test split is then transformed with
# the mean/std learned from the training data, so the model sees test features on the same
# scale it was trained on and no test-set statistics leak into preprocessing.
# (The scores quoted in this notebook were recorded when the test split was scaled with its
# own statistics, so re-running may shift them slightly.)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### 6. Initialize & build the model. Identify the points of improvements and implement the same.

In [22]:
# Start from an empty Sequential model; layers are appended to it with model.add().
model = Sequential()

In [23]:
# Architecture search notes:
#   * Activations tried: relu, tanh, swish and sigmoid. relu and swish performed best.
#   * Depth tried: one, two and three hidden layers. Two hidden layers performed best;
#     adding a third hidden layer made the numbers worse.
#   * Widths tried: several unit counts per hidden layer. 6 units in the first hidden
#     layer followed by 3 units in the second performed best.
# The stack below is the best model found. Some of the measured results are listed at
# the end of the notebook.
layer_stack = [
    Dense(6, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(3, activation='relu'),
    # Dense(4, activation='relu'),  # third-hidden-layer experiment, left disabled
    Dense(1, activation='sigmoid'),
]
for layer in layer_stack:
    model.add(layer)

In [24]:

# Optimizer comparison by F1 score: adam 0.59; sgd and adamax 0.58; adadelta was the worst.
# Adam also did well on val_accuracy relative to the others, so it is kept.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [25]:
# Print layer output shapes and parameter counts to verify the architecture.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 21        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 4         
Total params: 97
Trainable params: 97
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Class distribution within the training split.
y_train.value_counts()

0    6374
1    1626
Name: Exited, dtype: int64

In [27]:
# Epoch counts of 40, 100 and 200 were tried; 200 gave the best F1 score.
# Training uses batches of 32 rows and holds out 10% of the training data for validation.
model.fit(
    x=X_train,
    y=y_train.values,
    batch_size=32,
    epochs=200,
    verbose=1,
    validation_split=0.1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f03eedd2e10>

### 7. Predict the results using 0.5 as a threshold

In [28]:
# Convert the sigmoid outputs into hard 0/1 labels using a 0.5 decision threshold.
# `Sequential.predict_classes` is deprecated and is removed in later TensorFlow releases;
# thresholding `model.predict` is the documented replacement for a binary (sigmoid)
# output layer and yields the same int32 labels.
Y_train_pred = (model.predict(X_train, batch_size=32, verbose=0) > 0.5).astype("int32")
Y_test_pred = (model.predict(X_test, batch_size=32, verbose=0) > 0.5).astype("int32")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


### 8. Print Accuracy Score and Confusion Matrix

In [29]:
# model.evaluate returns [loss, accuracy] for the compiled metrics; only accuracy is reported.
_, accuracy_train = model.evaluate(X_train, y_train.values, verbose=0)
_, accuracy_test = model.evaluate(X_test, y_test.values, verbose=0)

print('Accuracy Train : %.2f' % (accuracy_train*100))
print('Accuracy Test : %.2f' % (accuracy_test*100))

Accuracy Train : 86.29
Accuracy Test : 85.85


In [30]:

# Recall, precision and F1 on the training split, followed by its confusion matrix
# (the matrix is the cell's displayed value).
train_truth = y_train.values
train_recall = recall_score(train_truth, Y_train_pred)
train_precision = precision_score(train_truth, Y_train_pred)
train_f1 = f1_score(train_truth, Y_train_pred)

print('Recall_score (Train): ' + str(train_recall))
print('Precision_score (Train): ' + str(train_precision))
print('F-score (Train):' + str(train_f1))
confusion_matrix(train_truth, Y_train_pred)

Recall_score (Train): 0.523370233702337
Precision_score (Train): 0.7254901960784313
F-score (Train):0.6080743122543766


array([[6052,  322],
       [ 775,  851]])

In [31]:

# Recall, precision and F1 on the held-out test split, followed by its confusion matrix
# (the matrix is the cell's displayed value).
test_truth = y_test.values
test_recall = recall_score(test_truth, Y_test_pred)
test_precision = precision_score(test_truth, Y_test_pred)
test_f1 = f1_score(test_truth, Y_test_pred)

print('Recall_score (Test): ' + str(test_recall))
print('Precision_score (Test): ' + str(test_precision))
print('F-score (Test): ' + str(test_f1))
confusion_matrix(test_truth, Y_test_pred)

Recall_score (Test): 0.5255474452554745
Precision_score (Test): 0.7105263157894737
F-score (Test): 0.6041958041958042


array([[1501,   88],
       [ 195,  216]])

**Summary and Additional Results**

- Tried various optimizers; Adam gave the best results.
- Tried 1, 2 and 3 hidden layers. Two hidden layers with ReLU gave the best results.
- A first hidden layer with 6 units and a second hidden layer with 3 units gave the best results.
- Tried ReLU, tanh and Swish activation functions in the hidden layers. ReLU and Swish gave the best results; ReLU was chosen for the final model.
- Sigmoid is the best activation function for the output layer.
- Some additional results (Average over 3 runs):
  - 1st HL activation function (1st HL Units): 2nd HL activation function (2nd HL Units): F1 Score
  - Relu (6) : Relu (6) : 0.586
  - Swish (6) : Relu (6) : 0.581
  - Swish (6) : Swish(6): 0.587
  - Relu (12) : Relu(6)  :0.585
  - Swish (12) : Relu(6) : 0.589
  - Swish (12) : Swish(6) : 0.584
  - ***Relu (6) : Relu (3) : 0.595***
  - Relu (16) : Relu (8) : Relu (4) : 0.575
  - Relu (16) : Relu (8) : 0.587
  - Relu (8) : Relu (4) : 0.584
  - ***Swish (6) : Swish (3) : 0.592***
  - Relu (6) : tanh (3): 0.577