# HR Churn - Fast Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlretrieve

file = 'turnover.csv'
url  = 'https://assets.datacamp.com/production/course_6221/datasets/' + file
urlretrieve(url, file)
hr_df = pd.read_csv(file)
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction            14999 non-null float64
evaluation              14999 non-null float64
number_of_projects      14999 non-null int64
average_montly_hours    14999 non-null int64
time_spend_company      14999 non-null int64
work_accident           14999 non-null int64
churn                   14999 non-null int64
promotion               14999 non-null int64
department              14999 non-null object
salary                  14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [2]:
cat_cols = ['department', 'salary']
for col in cat_cols:
    hr_df[col] = hr_df[col].astype('category')

X = pd.get_dummies(hr_df.drop('churn', axis='columns'))
y = hr_df.churn

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [3]:
X_train.shape, X_test.shape

((11249, 20), (3750, 20))

In [4]:
# Logistic model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Model Score: ", clf.score(X_test, y_test))
print("F1 Score: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Model Score:  0.7941333333333334
F1 Score:  0.4340175953079179
[[2682  186]
 [ 586  296]]


In [5]:
# Random model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Model Score: ", clf.score(X_test, y_test))
print("F1 Score: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Model Score:  0.9904
F1 Score:  0.979310344827586
[[2862    6]
 [  30  852]]


  from numpy.core.umath_tests import inner1d


In [24]:
# Neural Network
from keras.models import Sequential
from keras.layers import Dense
from keras.activations import relu, softmax, sigmoid
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values
y_test_np = y_test.values

def create_model(optimizer='rmsprop', init='glorot_uniform'):
    model = Sequential()
    model.add(Dense(50, input_dim=20, kernel_initializer=init, activation='relu'))
    model.add(Dense(10, input_dim=20, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

# grid search epochs, batch size and optimizer
# This takes nearly 4 hours...
# smallest batch, largest epoch's (no surprise) - but interesting
# normal init, and rmsprop optimizer (rather than adam). 
# Best: 0.904880 using {'batch_size': 5, 'epochs': 15, 'init': 'normal', 'optimizer': 'rmsprop'}
optimizers = ['rmsprop', 'adam']
inits = ['glorot_uniform', 'normal', 'uniform']
epochs = [5, 10, 15]
batches = [5, 10, 20]

param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=inits)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train_np, y_train_np)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.904880 using {'batch_size': 5, 'epochs': 15, 'init': 'normal', 'optimizer': 'rmsprop'}
0.781136 (0.042954) with: {'batch_size': 5, 'epochs': 5, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}
0.761934 (0.003740) with: {'batch_size': 5, 'epochs': 5, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.832785 (0.025148) with: {'batch_size': 5, 'epochs': 5, 'init': 'normal', 'optimizer': 'rmsprop'}
0.831452 (0.037644) with: {'batch_size': 5, 'epochs': 5, 'init': 'normal', 'optimizer': 'adam'}
0.855632 (0.015386) with: {'batch_size': 5, 'epochs': 5, 'init': 'uniform', 'optimizer': 'rmsprop'}
0.791270 (0.047921) with: {'batch_size': 5, 'epochs': 5, 'init': 'uniform', 'optimizer': 'adam'}
0.664504 (0.299629) with: {'batch_size': 5, 'epochs': 10, 'init': 'glorot_uniform', 'optimizer': 'rmsprop'}
0.810116 (0.066414) with: {'batch_size': 5, 'epochs': 10, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.902036 (0.005230) with: {'batch_size': 5, 'epochs': 10, 'init': 'normal', 'optimizer': 

In [26]:
# Use best hyper-parameters - increase epoch
model = create_model(init='uniform')
model.fit(X_train_np, y_train_np, epochs=25, batch_size=5)
y_pred = (model.predict(X_test_np) > 0.5)

print(model.evaluate(X_train_np, y_train_np))
print("F1 Score: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
[0.31072746631804865, 0.8902124633300738]
F1 Score:  0.7553699284009547
[[2707  161]
 [ 249  633]]


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.activations import relu, sigmoid
from keras.optimizers import RMSprop
from keras.initializers import normal

def create_model(optimizer='rmsprop', init='normal'):
    model = Sequential()
    model.add(Dense(25, input_dim=20, kernel_initializer=init, activation='relu'))
    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# explore tensorboard of a very simple model
from keras.callbacks import TensorBoard
tb = TensorBoard(log_dir='./logs', histogram_freq=0,
            batch_size=32, write_graph=True,
            write_grads=False, write_images=False,
            embeddings_freq=0, embeddings_layer_names=None,
            embeddings_metadata=None, embeddings_data=None)

model = create_model()
model.fit(X_train_np, y_train_np, epochs=25, batch_size=5, callbacks=[tb])
y_pred = (model.predict(X_test_np) > 0.5)

print(model.evaluate(X_train_np, y_train_np))
print("F1 Score: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25