# Part 1 - preprocessing of data

In [1]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 

#Import Dataset

dataset = pd.read_csv("./Data/Churn_Modelling.csv")

X = dataset.iloc[:,3:-1]
y = dataset.iloc[:,-1]

In [2]:
#Coding categorical data
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Geography","Gender"]

column_transformer = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(drop='first'), categorical_features) #We avoid multilinearity using "drop=first"
    ],
    remainder='passthrough'
)

X_transformed = column_transformer.fit_transform(X)

#Split dataset into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=0 )

#Variable scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


# Part 2 - Building the ANN


In [None]:
#Import Keras and additional libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

#Initialise RNA
classifier= Sequential()
#Input layer and first hidden layer
classifier.add(Dense(units = 6, kernel_initializer="uniform",
                     activation="relu", input_dim= X_train.shape[1]))
#Add second hidden layer
classifier.add(Dense(units = 6, kernel_initializer="uniform",
                     activation="relu"))
#Exit layer
classifier.add(Dense(units = 1, kernel_initializer="uniform",
                     activation="sigmoid"))
#Compiling RNA
classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics= ["accuracy"])

#Fitting the ANN to the training dataset
classifier.fit(X_train,y_train, batch_size= 10, epochs= 100 , verbose=0)


# Part 3 - Evaluate the model and calculate final predictions

In [4]:
# Using the model to predict
y_pred = classifier.predict(X_test)
y_pred = (y_pred>0.5)
 
#Developing a confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)

# Data of the function matrix (right now it is not important that it looks nice)
print(cm)
print((cm[0][0]+cm[1][1])/cm.sum())

[[1544   51]
 [ 267  138]]
0.841


In [5]:
X.tail()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
9995,771,France,Male,39,5,0.0,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.0,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52
9999,792,France,Female,28,4,130142.79,1,1,0,38190.78


In [6]:
# Particular case of use
person = np.array([[600,"France","Male","40",3,60000,2,1,1,50000]])
person = pd.DataFrame(person, columns = X.columns)

person_1 = column_transformer.transform(person)
person_2 = sc_X.transform(person_1)
y_person = classifier.predict(person_2)
print(y_person)

[[0.08894467]]


# Par 4 - Reconstruction part 2  adding cross validation 

In [7]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import cross_val_score

def build_classifier():
    #Initialise RNA
    classifier= Sequential()
    #Input layer and first hidden layer
    classifier.add(Dense(units = 6, kernel_initializer="uniform",
                        activation="relu", input_dim= X_train.shape[1]))
    classifier.add(Dropout(rate = 0.1)) #10% of the layers are disabled. Better to go from 10% to 10% to test.
    #Add second hidden layer
    classifier.add(Dense(units = 6, kernel_initializer="uniform",
                        activation="relu"))
    classifier.add(Dropout(rate = 0.1))
     
    #Exit layer
    classifier.add(Dense(units = 1, kernel_initializer="uniform",
                        activation="sigmoid"))
    #Compiling the ANN
    classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics= ["accuracy"])
    
    #Return the classifier
    return  classifier

classifier = KerasClassifier(build_fn = build_classifier, batch_size= 10, epochs=100 )
accuracies = cross_val_score(estimator=classifier, X=X_train, y= y_train, cv=10, n_jobs = -1, verbose=0) #Validación cruzada de 10 muestras

#Priting cross validation score accuracies
print("Accuracies: ", accuracies)
mean = accuracies.mean()
variance = accuracies.std()
print("Mean: ",mean)
print("Variance: ", variance)


Accuracies:  [0.84    0.8375  0.8775  0.84125 0.83375 0.825   0.84    0.825   0.8175
 0.84125]
Mean:  0.837875
Variance:  0.015381096352341073


# IMPROVING THE NNR

In [8]:
from sklearn.model_selection import GridSearchCV

def build_classifier(optimizer= "adam"):
    #Initialise RNA
    classifier= Sequential()
    #Input layer and first hidden layer
    classifier.add(Dense(units = 6, kernel_initializer="uniform",
                        activation="relu", input_dim= X_train.shape[1]))
    #Add second hidden layer
    classifier.add(Dense(units = 6, kernel_initializer="uniform",
                        activation="relu"))
    #classifier.add(Dropout(rate = 0.1))
    #Exit layer
    classifier.add(Dense(units = 1, kernel_initializer="uniform",
                        activation="sigmoid"))
    #Compiling RNA
    classifier.compile(optimizer=optimizer, loss="binary_crossentropy", metrics= ["accuracy"])
    
    #Devolver el Return classifier
    return  classifier

classifier = KerasClassifier(build_fn = build_classifier)

parameters = {
    "batch_size": [10,25,32],
    "epochs": [100, 250],
    "optimizer" : ["adam", "rmsprop"],
}

In [None]:
grid_search = GridSearchCV(estimator = classifier, 
                            param_grid = parameters,
                            scoring="accuracy",
                            cv = 10 )
grid_search = grid_search.fit(X_train, y_train, verbose=0)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

In [None]:
print("Best accuracy: ",best_accuracy)
print("Best parameters: ",best_parameters)

0.853375
{'batch_size': 10, 'epochs': 250, 'optimizer': 'adam'}
