In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# NOTE(review): none of the cells visible in this notebook read from the mounted
# drive (the dataset is loaded from a URL) - confirm whether this is still needed.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
import pandas as pd
import numpy as np
# Read the HR CSV straight from its raw GitHub URL into a DataFrame.
DATA_URL = "https://raw.githubusercontent.com/mwitiderrick/kerasDO/master/HR_comma_sep.csv"
dataset = pd.read_csv(DATA_URL)


In [77]:
# Peek at the last five rows to sanity-check the load.
dataset.tail(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
14994,0.4,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low
14998,0.37,0.52,2,158,3,0,1,0,support,low


In [78]:
# The model cannot consume strings, so the two categorical columns
# (department and salary) are one-hot encoded into indicator columns.
# drop_first=True omits the first level of each column.
feats = ["department", "salary"]
final = pd.get_dummies(
    dataset,
    columns=feats,
    drop_first=True,
)
final.head()



Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,0,0,0,0,1,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,0,0,0,0,1,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,0,0,0,0,1,0,0,1,0


In [0]:
# Prepare the model inputs: the `left` column (whether the employee left) is the
# target, every other column is a feature.

from sklearn.model_selection import train_test_split
X = final.drop(columns=['left']).values
y = final['left'].values
# Hold out 25% of the rows for testing (a 75 - 25 split).
# random_state pins the shuffle so that Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [0]:
# Standardise the features with scikit-learn's scaler so that no column
# outweighs the others purely because of its scale. The scaler is fitted on
# the training split only and then applied to both splits.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [0]:
# We will proceed with the construction of our model
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [0]:
# Baseline network: one hidden ReLU layer of 9 units feeding a single sigmoid
# output unit for the binary target.
# The input width is read from the training matrix instead of being hard-coded
# (it was 18), so the model still builds if the encoded feature set changes.
classifier = Sequential()
classifier.add(Dense(9, kernel_initializer="uniform", activation="relu", input_dim=X_train.shape[1]))
classifier.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))
classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [83]:
# Train for one epoch over the training split in mini-batches of 10.
classifier.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=1,
)


Epoch 1/1


<keras.callbacks.History at 0x7fec373ae940>

In [0]:
# Predict on the held-out rows and threshold the sigmoid outputs at 0.5 to
# obtain boolean class predictions.
y_pred = classifier.predict(X_test) > 0.5

In [85]:
# Tabulate true labels (rows) against thresholded predictions (columns).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2764,   94],
       [ 565,  327]])

### With a single hidden layer we get
These results say that we have (2764+327)/3750 ≈ 82.4% accuracy
### But we can improve our model by adding another layer, a dropout layer, so it can generalize better


### Adding a Dropout layer helps the model generalize better by reducing overfitting

In [86]:

# Same 9-unit hidden layer and sigmoid output as before, with 10% dropout
# inserted after the hidden layer.
# The input width is read from the training matrix instead of being hard-coded
# (it was 18), so the model still builds if the encoded feature set changes.
classifier = Sequential()
classifier.add(Dense(9, kernel_initializer="uniform", activation="relu", input_dim=X_train.shape[1]))
classifier.add(Dropout(rate=0.1))
classifier.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))
classifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
classifier.fit(X_train, y_train, batch_size=10, epochs=1)

# Evaluate on the held-out split: threshold the sigmoid outputs at 0.5 and
# tabulate them against the true labels.
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

Epoch 1/1


array([[2765,   93],
       [ 546,  346]])

## (2765+346)/3750 ≈ 83.0%

Now we will use K-Folds cross-validation to get a more reliable estimate of our accuracy by training and validating on different parts of our dataset

In [0]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
def make_classifier(input_dim=18, dropout_rate=0.1):
    """Build and compile the dropout-regularised binary classifier.

    The network is a 9-unit ReLU hidden layer, a dropout layer, and a single
    sigmoid output unit, compiled with the Adam optimizer, binary
    cross-entropy loss and accuracy as the reported metric.

    Args:
        input_dim: Number of input features per sample. Defaults to 18, the
            value previously hard-coded here.
        dropout_rate: Dropout rate applied after the hidden layer. Defaults
            to 0.1, the value previously hard-coded here.

    Returns:
        A freshly built, compiled ``Sequential`` model.
    """
    # A distinct local name avoids shadowing the notebook-level `classifier`.
    model = Sequential()
    model.add(Dense(9, kernel_initializer="uniform", activation="relu", input_dim=input_dim))
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [0]:
# Wrap the model factory so scikit-learn utilities can train and score it.
# The training argument is spelled `epochs`, matching the direct `fit` calls
# elsewhere in this notebook; `nb_epoch` is the legacy Keras 1 spelling.
classifier = KerasClassifier(build_fn=make_classifier, batch_size=10, epochs=1)

In [0]:
# 10-fold cross-validation on the training split; n_jobs=-1 asks scikit-learn
# to use all available cores.
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)


In [92]:
# Summarise the per-fold accuracies: average score and its spread.
mean = accuracies.mean()
variance = accuracies.var()
print("MEAN: ", mean)
print("Variance:", variance)

MEAN:  0.832961877882952
Variance: 0.001537391599627102


### As we can see, we have improved the performance of our model in small steps. With K-Folds we
### have the ability to train on and validate with different parts of the data, even if we don't have
### a huge dataset, like ours