
### The dataset contains a bank's customer data. Our aim is to predict whether a customer will stay with the bank or leave it.

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import datetime
import dataframe_image as dfi
%load_ext tensorboard

In [None]:
# Importing the dataset: parse the file as CSV and use its 'RowNumber' column as the index.
# NOTE(review): the path ends in `.xls` but is read with `read_csv` — confirm the file is plain CSV text.
df = pd.read_csv('./Dataset/Churn_Modelling.csv.xls', index_col='RowNumber')
# Display the first 10 rows as the cell output.
df.head(10)

In [None]:
# Save the first 10 rows of df as the image file 'churntable.png' (via dataframe_image).
dfi.export(df.head(10), 'churntable.png')

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Display descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Remove the CustomerId and Surname columns from df in place so they are not
# carried into the encoded frame built from df below.
df.drop(columns=['CustomerId', 'Surname'], inplace=True)

In [None]:
# Display the first rows to confirm the two columns were dropped.
df.head()

In [None]:
# One-hot encode the 'Geography' column; the generated columns are named 'Geo_<value>'.
df_dummies = pd.get_dummies(df, columns=['Geography'], prefix='Geo')

In [None]:
# Display the first rows of the one-hot encoded frame.
df_dummies.head()

In [None]:
# Binary-encode 'Gender' (Female -> 1, Male -> 0); all other columns are left untouched.
gender_codes = {'Female': 1, 'Male': 0}
df_encoded = df_dummies.replace({'Gender': gender_codes})

In [None]:
# Display the first 10 rows of the fully encoded frame.
df_encoded.head(10)

In [None]:
# Bar chart of how many rows fall into each value of the 'Exited' target.
sns.countplot(data=df_encoded, x=df_encoded['Exited'])
plt.xlabel("Target classes")
plt.ylabel("Count of each Target class")
plt.show()

In [None]:
# One histogram per column of the encoded frame, drawn as a grid of subplots.
df_encoded.hist(figsize=(15, 12), bins=15)
# `plt.title` would only (re)title the current subplot of the grid, so use a
# figure-level title for the whole set of histograms instead.
plt.suptitle("Features Distribution")
plt.show()

In [None]:
# Pairwise correlation between all columns of the encoded frame.
corrMtrx = df_encoded.corr()
# Mask is 1 on and above the diagonal, so only the lower triangle is drawn.
mask = np.triu(np.ones(corrMtrx.shape))

fig, axs = plt.subplots(figsize=(15, 15))
axs.set_title('Feature Correlation')

cmap = sns.diverging_palette(260, 10, n=10, as_cmap=True)
sns.heatmap(
    corrMtrx,
    mask=mask,
    vmax=1.2,
    cmap=cmap,
    ax=axs,
    annot=True,      # write the coefficient inside every visible cell
    fmt='0.2g',
    linewidths=1,
    square=False,
)
plt.show()

In [None]:
# One box plot per column, each on its own independently scaled subplot (8x4 grid).
df_encoded.plot.box(
    subplots=True,
    layout=(8, 4),
    sharex=False,
    sharey=False,
    fontsize=12,
    figsize=(15, 20),
)
plt.show()

In [None]:
# Box plot of every column split by the 'Exited' target, laid out on a 4x4 grid.
fig, axs = plt.subplots(nrows=4, ncols=4, figsize=(30, 40))
fig.subplots_adjust(wspace=0.15, hspace=0.2)
# Flatten the 2-D grid of axes so it can be indexed by column position.
axs = axs.ravel()

for i, col in enumerate(df_encoded.columns):
    sns.boxplot(x=df_encoded['Exited'], y=df_encoded[col], ax=axs[i])

In [None]:
# All columns of the (unscaled) encoded frame side by side on one axis,
# which makes the difference in their value ranges visible.
fig, ax = plt.subplots(nrows=1, figsize=(20, 8))
sns.boxplot(ax=ax, data=df_encoded.iloc[:, :])
plt.show()

In [None]:
# Target vector: the 'Exited' column; feature matrix: every other column.
y = df_encoded['Exited']
X = df_encoded.drop(columns=['Exited'])

In [None]:
# Display the first 10 rows of the feature matrix.
X.head(10)

In [None]:
# Hold out 33% of the rows as the test split; random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits (the results replace the original frames).
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

In [None]:
# Box plot of every standardized feature in the training split.
fig, ax = plt.subplots(1, figsize=(20, 8))
g = sns.boxplot(data=X_train, ax=ax,)
# X_train is a bare array after scaling, so tick labels must be supplied.
# Use the feature frame's own columns: `df_encoded.columns[:-1]` drops the
# last column rather than 'Exited', so it does not match the columns of X.
g.set_xticklabels(X.columns, rotation=0)
plt.show()

In [None]:
# Box plot of every standardized feature in the test split.
fig, ax = plt.subplots(1, figsize=(20, 8))
g = sns.boxplot(data=X_test, ax=ax)
# X_test is a bare array after scaling, so tick labels must be supplied.
# Use the feature frame's own columns: `df_encoded.columns[:-1]` drops the
# last column rather than 'Exited', so it does not match the columns of X.
g.set_xticklabels(X.columns, rotation=0)
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [None]:
# Baseline classifier: logistic regression on the standardized features.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f'Logistic Regression Accuracy: {accuracy_score(y_test, y_pred)}')
# Cross-entropy is defined on predicted probabilities; feeding hard 0/1 labels
# treats every prediction as fully confident and distorts the loss.
print(f'Cross Entropy model loss: {log_loss(y_test, lr.predict_proba(X_test))}')

In [None]:
# Fit PCA on the training split (no n_components given) and project the split onto the components.
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
# Display the shape of the projected training data.
X_train_pca.shape


In [None]:
# Scatter of the training rows on the first two principal components,
# coloured by target class (blue = 0, red = 1).
plt.figure(figsize=(6, 6))
for cls, fmt, name in ((0, 'bo', '0'), (1, 'ro', '1')):
    members = X_train_pca[y_train == cls]
    plt.plot(members[:, 0], members[:, 1], fmt, label=name, alpha=0.7, markeredgecolor='k')

plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.legend()
plt.show()

In [None]:
# Fit LDA on the training split (it is supervised, so the labels are passed too)
# and project both splits with the fitted transform.
lda = LDA()
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)
# Display the shape of the projected training data.
X_train_lda.shape

In [None]:
# Training rows placed along the first LDA component (y fixed at 0),
# coloured by target class (blue = 0, red = 1).
plt.figure(figsize=(6, 6))
for cls, fmt, name in ((0, 'bo', '0'), (1, 'ro', '1')):
    projected = X_train_lda[:, 0][y_train == cls]
    baseline = np.zeros(projected.shape[0])
    plt.plot(projected, baseline, fmt, label=name, alpha=0.7, markeredgecolor='k')

plt.legend()
plt.show()

In [None]:
from sklearn.metrics import log_loss, mean_squared_error

# Accuracy of the fitted LDA classifier on both splits.
train_acc, test_acc = lda.score(X_train, y_train), lda.score(X_test, y_test)
print(f'Train accuracy: {train_acc:.2f}, Test accuracy: {test_acc:.2f}')

# Log loss is computed from the predicted class probabilities ...
print(f'Log loss: {log_loss(y_test, lda.predict_proba(X_test)):.2f}')
# ... while the MSE is computed from the hard class predictions.
y_pred = lda.predict(X_test)
print(f'MSE: {mean_squared_error(y_test, y_pred):.2f}')

In [None]:
# Fit quadratic discriminant analysis on the training split and report its
# accuracy on both splits.
qda = QDA().fit(X_train, y_train)
train_acc, test_acc = qda.score(X_train, y_train), qda.score(X_test, y_test)
print(f'Train accuracy: {train_acc:.2f}, Test accuracy: {test_acc :.2f}')

In [None]:
def Model(num_neurons:list, learning_rate:float, input_dim:int = 12):
    """Build and compile a fully connected binary classifier.

    Args:
        num_neurons: Units of each hidden layer, in order. Must contain at
            least one entry; the first entry also defines the input layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        input_dim: Number of input features. Defaults to 12, the value that
            was previously hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        accuracy metric, single sigmoid output unit).

    Raises:
        ValueError: If ``num_neurons`` is empty.
    """
    if len(num_neurons) == 0:
        raise ValueError("num_neurons must contain at least one hidden-layer size")

    model = Sequential()
    # Adding the input layer and the first hidden layer (no weight regularizer here)
    model.add(Dense(units=num_neurons[0], kernel_initializer='uniform', activation='relu', input_dim=input_dim))

    # Adding the remaining hidden layers, each with an L2 penalty on its kernel
    for units in num_neurons[1:]:
        model.add(Dense(units=units, kernel_initializer='uniform', activation='relu', kernel_regularizer=l2(0.01)))

    # Adding the output layer
    model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

    # Compiling the ANN
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    # Model Summary: summary() prints by itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    model.summary()

    return model

In [None]:
# TensorBoard: every run logs into its own timestamped directory under logs/fit/.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(histogram_freq=1, log_dir=log_dir)

# Build the network. Other layouts that were tried:
#   Model([5, 5], 0.001)
#   Model([20, 40, 60, 80, 60, 40, 20], 0.001)
model = Model(num_neurons=[20, 20, 10, 6], learning_rate=0.001)

# Fit on the training split; the test split is passed as validation data so
# its loss/accuracy are reported after each epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=1,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[tensorboard_callback],
)

In [None]:
# Loss and accuracy of the trained model on the training split.
score, acc = model.evaluate(X_train, y_train, batch_size=10)
print(f'Train score: {score: .3f}')
print(f'Train accuracy: {acc: .3f}')

# Hard class predictions for the test split: threshold the sigmoid output at 0.5.
y_pred = model.predict(X_test) > 0.5

print('-----'*20)
# Loss and accuracy on the test split.
score, acc = model.evaluate(X_test, y_test, batch_size=10)
print(f'Test score: {score: .3f}')
print(f'Test accuracy: {acc: .3f}')

# Confusion matrix of the thresholded test predictions.
cm = confusion_matrix(y_test, y_pred)

Evaluation

In [None]:
# Annotated heatmap of the confusion matrix (rows: actual, columns: predicted).
c_matrix = sns.heatmap(pd.DataFrame(cm), cmap="YlGnBu", annot=True, fmt='g')
c_matrix.set_title('Confusion matrix', y=1.1)
c_matrix.set_ylabel('Actual label')
c_matrix.set_xlabel('Predicted label')

In [None]:
# Print per-class precision, recall, F1 and support for the current y_pred against y_test.
print(classification_report(y_test,y_pred))