### Bank Customer Satisfaction Prediction Using CNN and Feature Selection

In [1]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, MaxPool1D, Flatten, Dense, Dropout, BatchNormalization

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

### Data

In [4]:
# Load the training data; 'train.csv' is resolved relative to the working directory
data = pd.read_csv('train.csv')
# Preview the first rows
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [5]:
# Data shape as (rows, columns)
data.shape

(76020, 371)

In [6]:
# drop the ID column - not useful as a feature.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after ID has already been removed
# no longer raises a KeyError.
data = data.drop(columns='ID', errors='ignore')

In [34]:
# Split the frame into the label column and the remaining feature columns
y = data['TARGET']
X = data.drop(columns='TARGET')

X.shape, y.shape

((76020, 369), (76020,))

#### Train Test Split

In [53]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [54]:
# (rows, columns) of the train and test feature sets
X_train.shape, X_test.shape

((60816, 369), (15204, 369))

#### Remove Constant, Quasi Constant and Duplicate Features

In [55]:
# Remove constant and quasi-constant features with a variance filter (threshold 0.01).
# The filter is fitted on the training split only and then applied to both splits.
filter_cq = VarianceThreshold(threshold=0.01)
filter_cq.fit(X_train)
X_train = filter_cq.transform(X_train)
X_test = filter_cq.transform(X_test)

In [56]:
# (rows, columns) of both splits after removal of constant and quasi-constant features
X_train.shape, X_test.shape

((60816, 273), (15204, 273))

In [57]:
# Remove duplicate features

# pandas' duplicated() compares rows, so transpose to turn every feature into a row
X_train_T = X_train.transpose()
X_test_T = X_test.transpose()

In [58]:
# Wrap the transposed data in DataFrames so the pandas duplicate detection can be used
X_train_T, X_test_T = pd.DataFrame(X_train_T), pd.DataFrame(X_test_T)

In [19]:
# Count the features (rows of the transposed frame) that repeat an earlier feature
duplicate_mask = X_train_T.duplicated()
dup_num = duplicate_mask.sum()
print(f'There are {dup_num} of duplicate features')

There are 17 of duplicate features


In [59]:
# identify the duplicated features: True marks a row (feature) that repeats an earlier row
is_duplicated = X_train_T.duplicated()
is_duplicated

0      False
1      False
2      False
3      False
4      False
       ...  
268    False
269    False
270    False
271    False
272    False
Length: 273, dtype: bool

In [60]:
# Index labels of the features to keep, i.e. those not flagged as duplicates
unique_columns_index = is_duplicated.index[~is_duplicated.to_numpy()]

In [61]:
# Keep only the unique feature rows, then transpose back to samples x features
X_train = X_train_T.loc[unique_columns_index].transpose()
X_train.shape

(60816, 256)

In [62]:
# Apply the same selection (derived from the training split) to the test split
X_test = X_test_T.loc[unique_columns_index].transpose()
X_test.shape

(15204, 256)

In [63]:
# Preview the training features that remain after the duplicate filter
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,263,264,265,266,267,268,269,270,271,272
0,2.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
1,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85472.34
2,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,317769.24
3,2.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76209.96
4,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,302754.0


#### Standardising 

In [64]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same transformation is applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [65]:
# Reshaping the data for the convolutional NN: add a trailing axis of size 1 so
# each sample has shape (features, 1), matching the Conv1D input.
# The sizes are read from the arrays instead of being hard-coded
# (60816 / 15204 / 256), so the cell keeps working when the split ratio or the
# number of selected features changes.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# target: np.asarray accepts both a pandas Series and an ndarray, so re-running
# this cell does not fail the way the Series-only .to_numpy() call would
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

### Build CNN model

In [66]:
# Per-sample input shape, taken from the prepared training array instead of
# hard-coding (256, 1), so the network follows the number of features that
# survive feature selection.
input_shape = X_train.shape[1:]

model = Sequential()

# 1st CNN layer: conv -> batch norm -> max pooling (halves the length) -> dropout
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape))
model.add(BatchNormalization())
model.add(MaxPool1D(2))
model.add(Dropout(0.3))

# 2nd CNN layer
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPool1D(2))
model.add(Dropout(0.3))

# 3rd CNN layer
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPool1D(2))
model.add(Dropout(0.3))

# Flatten the convolutional feature maps for the dense classifier head
model.add(Flatten())

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

# output layer: a single sigmoid unit for the binary target
model.add(Dense(units=1, activation='sigmoid'))

In [48]:
# Print each layer with its output shape and parameter count
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 254, 32)           128       
_________________________________________________________________
batch_normalization_3 (Batch (None, 254, 32)           128       
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 127, 32)           0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 127, 32)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 125, 64)           6208      
_________________________________________________________________
batch_normalization_4 (Batch (None, 125, 64)           256       
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 62, 64)           

In [67]:
# compile the model
from tensorflow.keras.optimizers import Adam

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# in tf.keras and is rejected by newer Keras releases.
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [68]:
# Model Training: 10 epochs, with the test split passed as validation data so
# its loss and accuracy are reported after every epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    verbose=1,
)

Train on 60816 samples, validate on 15204 samples
Epoch 1/10
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
# Prediction: the single sigmoid output yields one score in [0, 1] per test row
y_pred = model.predict(X_test)
y_pred

array([[0.00297648],
       [0.05829933],
       [0.00730246],
       ...,
       [0.01596017],
       [0.02166818],
       [0.00112749]], dtype=float32)

In [71]:
# AUC score, computed from the predicted scores rather than hard class labels
from sklearn.metrics import roc_auc_score, accuracy_score

auc_score = roc_auc_score(y_test, y_pred)
print(f'AUC Score: {auc_score}')

AUC Score: 0.7989576474528362


In [72]:
# Accuracy
# Sequential.predict_classes was removed from tf.keras in TF 2.6. For a single
# sigmoid output, the equivalent is thresholding the predicted score at 0.5.
# NOTE(review): accuracy can look high on an imbalanced target simply by always
# predicting the majority class - compare against the class balance of y_test.
y_pred_class = (model.predict(X_test) > 0.5).astype('int32')
print(f'ACCURACY: {accuracy_score(y_test, y_pred_class)}')

ACCURACY: 0.9604051565377533
