In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing, cross_validation, neighbors, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier, \
  RandomForestRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

from math import sqrt

import pickle
# Directory that holds the raw input CSV files (used when loading the HR data).
DIR_DATA = "data"
# NOTE(review): DIR_PROCESSED, LABEL and NON_PREDICTORS are not referenced by any
# cell visible in this notebook, and LABEL is "rating" although the column that is
# modelled is `left` -- they look like leftovers; confirm before removing.
DIR_PROCESSED = "processed"
LABEL = "rating"
NON_PREDICTORS = [LABEL]  # previously also considered: "name", "anime_id"
# Default number of cross-validation folds.
CV_FOLDS = 5



In [2]:
# HR_comma_sep.csv

# Employee satisfaction level
# Last evaluation
# Number of projects
# Average monthly hours
# Time spent at the company
# Whether they have had a work accident
# Whether they have had a promotion in the last 5 years
# Department
# Salary
# Whether the employee has left

In [3]:
# Load the HR dataset from DIR_DATA and preview its first rows.
hr = pd.read_csv(DIR_DATA + '/HR_comma_sep.csv')
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
# Mean of the `left` column expressed as a percentage, i.e. the share of
# employees who left (the column holds 0/1 flags in the preview above).
print('Percent who left: {:.2f}'.format(hr['left'].mean() * 100))

Percent who left: 23.81


In [5]:
# Replace the two string columns with integer category codes so that the
# estimators used later can consume them.
# NOTE(review): the codes follow the sorted label order (the next preview shows
# 'low' -> 1 and 'medium' -> 2), so the `salary` codes are not in
# low < medium < high order -- confirm this is acceptable for the
# distance-based and linear models.
hr['sales'] = hr['sales'].astype('category').cat.codes
hr['salary'] = hr['salary'].astype('category').cat.codes
# Work_accident, promotion_last_5years and left are deliberately left as the
# integer columns they already are (a category conversion was tried and disabled).


In [6]:
# Check the dtype of `salary` after the category-code conversion.
hr.salary.dtype

dtype('int8')

In [7]:
# Preview the frame again now that `sales` and `salary` hold integer codes.
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,7,1
1,0.8,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1


In [8]:
def predict_left(df, clf, test_size=0.2, random_state=None):
    """Fit ``clf`` on a random train/test split of ``df`` and print both scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the predictor columns plus the ``left`` target column.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in place.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split`` so a split can be reproduced.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The training and testing scores are printed, not returned. The metric
        is whatever ``clf.score`` implements.
    """
    # Pass the axis by keyword: pandas 2.x no longer accepts it positionally.
    X = df.drop(['left'], axis=1)
    y = df['left']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf.fit(X_train, y_train)

    # Only the scores are reported, so the unused predict() calls were removed.
    print('Training Score: {:.3f}'.format(clf.score(X_train, y_train)))
    print('Testing Score: {:.3f}'.format(clf.score(X_test, y_test)))

    print()

In [9]:
# Baseline sweep: fit every estimator with near-default settings on its own
# random split and print its train/test score. The printed index is the
# estimator's position in this list.
# NOTE(review): LinearRegression is a regressor, so its score is R^2 rather
# than accuracy and is not comparable with the other entries -- confirm it
# belongs in this list.
classifiers = [
    RandomForestClassifier(n_jobs=-1),
    RandomForestClassifier(criterion='entropy', n_jobs=-1),
    svm.SVC(),
    LogisticRegression(),
    LinearRegression(),
    AdaBoostClassifier(),
    xgb.XGBClassifier(),
    neighbors.KNeighborsClassifier(n_jobs=-1),
]
# Alternatives the author left disabled: GradientBoostingClassifier(), and
# 500-estimator variants of the two random forests and of xgb.XGBClassifier.

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    predict_left(hr, clf)

Classifier  0
Training Score: 0.998
Testing Score: 0.991

Classifier  1
Training Score: 0.999
Testing Score: 0.989

Classifier  2
Training Score: 0.962
Testing Score: 0.949

Classifier  3
Training Score: 0.765
Testing Score: 0.770

Classifier  4
Training Score: 0.194
Testing Score: 0.195

Classifier  5
Training Score: 0.961
Testing Score: 0.961

Classifier  6
Training Score: 0.976
Testing Score: 0.976

Classifier  7
Training Score: 0.954
Testing Score: 0.929



In [10]:
def cross_val_left(hr, clf, cv_folds=CV_FOLDS, drop=['left']):
    """Cross-validate ``clf`` on predicting ``left`` and print the scores.

    Parameters
    ----------
    hr : pandas.DataFrame
        Data holding the predictor columns plus the ``left`` target column.
    clf : estimator
        Estimator handed to ``cross_val_score``.
    cv_folds : int, default CV_FOLDS
        Forwarded as ``cv`` to ``cross_val_score``.
    drop : list of str, default ['left']
        Columns removed from ``hr`` to build the feature matrix. Include
        ``'left'`` in it, otherwise the target stays among the predictors.
        The default list is only read, never mutated.

    Returns
    -------
    None
        The mean score and the per-fold scores are printed, not returned.
    """
    # Pass the axis by keyword: pandas 2.x no longer accepts it positionally.
    X = hr.drop(drop, axis=1)
    y = hr['left']
    scores = cross_val_score(clf, X, y, cv=cv_folds, n_jobs=-1)

    # Average over the scores actually returned rather than dividing by
    # `cv_folds`, so the mean stays right whatever is passed as `cv`.
    print('Cross val score: ', sum(scores) / len(scores))
    print(scores)

    print()
    

In [19]:
# Cross-validate two 500-tree random forests (default split criterion vs.
# entropy) using every column except `left` as a predictor.
classifiers = [
    RandomForestClassifier(n_estimators=500, n_jobs=-1),
    RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1),
]
# Candidates the author left disabled: svm.SVC() and
# xgb.XGBClassifier(n_estimators=500, nthread=-1).

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    cross_val_left(hr, clf)

Classifier  0
Cross val score:  0.991999666496
[ 0.99866711  0.98133333  0.98633333  0.99899967  0.99466489]

Classifier  1
Cross val score:  0.991866333163
[ 0.99866711  0.98133333  0.98566667  0.99899967  0.99466489]



In [12]:
# Fit a 500-tree random forest on a random 80/20 split and inspect how much
# each predictor contributes.
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
# Pass the axis by keyword: pandas 2.x no longer accepts it positionally.
X = hr.drop(['left'], axis=1)
y = hr.left
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print(X.columns)
# The importances are listed in the same order as the columns printed above.
clf.feature_importances_
# Next step: drop the predictors whose importance is roughly 0.01 or lower.

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')


array([ 0.33944067,  0.1151839 ,  0.18123384,  0.14789144,  0.18944206,
        0.0061087 ,  0.00099434,  0.0120704 ,  0.00763465])

In [13]:
# Re-run the cross-validation without the low-importance predictors; the mean
# score barely moves (.9920 -> .9916).
# NOTE: `sales` is dropped as well, although its recorded importance (~0.012)
# sits slightly above the 0.01 cut-off.
drop = [
    'left',
    'promotion_last_5years',
    'Work_accident',
    'sales',
    'salary',
]
for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    cross_val_left(hr, clf, drop=drop)

Classifier  0
Cross val score:  0.991599644237
[ 0.99800067  0.981       0.98633333  0.99833278  0.99433144]

Classifier  1
Cross val score:  0.991599577563
[ 0.99833389  0.98166667  0.986       0.99833278  0.99366455]



In [14]:
# Correlation of every column with the `left` target (pandas' default method).
hr.corr()["left"]

satisfaction_level      -0.388375
last_evaluation          0.006567
number_project           0.023787
average_montly_hours     0.071287
time_spend_company       0.144822
Work_accident           -0.154622
left                     1.000000
promotion_last_5years   -0.061788
sales                    0.032105
salary                  -0.001294
Name: left, dtype: float64

In [15]:
# Drop every predictor whose absolute correlation with `left` is below 0.05
# (last_evaluation, number_project, sales, salary).
# Result: the cross-validated score gets worse (about 0.992 -> 0.979).
drop = [
    'left',
    'last_evaluation',
    'number_project',
    'sales',
    'salary',
]
classifiers = [
    RandomForestClassifier(n_estimators=500, n_jobs=-1),
    RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1),
]
# Candidates the author left disabled: svm.SVC() and
# xgb.XGBClassifier(n_estimators=500, nthread=-1).

for i, clf in enumerate(classifiers):
    print('Classifier ', i)
    cross_val_left(hr, clf, drop=drop)

Classifier  0
Cross val score:  0.979398999118
[ 0.98767078  0.965       0.97166667  0.99033011  0.98232744]

Classifier  1
Cross val score:  0.979132354681
[ 0.98767078  0.96433333  0.97066667  0.98999667  0.98299433]



In [16]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: keep the 5 predictors that a 500-tree random
# forest ranks highest.
model = RandomForestClassifier(n_estimators=500, n_jobs=-1)
# Current scikit-learn only accepts `n_features_to_select` by keyword.
clf = RFE(model, n_features_to_select=5)

# Pass the axis by keyword: pandas 2.x no longer accepts it positionally.
X = hr.drop(['left'], axis=1)
y = hr.left
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print(clf.support_)  # True marks a selected column
print(clf.ranking_)  # 1 marks a selected column; larger ranks were eliminated earlier
# Selects the same five columns that had the highest feature_importances_,
# which makes sense since both rely on the forest's importances.

[ True  True  True  True  True False False False False]
[1 1 1 1 1 4 5 2 3]


In [17]:
# from sklearn.feature_selection import RFE
# model = svm.SVC(kernel='linear')
# clf = RFE(model,5 )

# X = hr.drop(['left'],1)
# y = hr.left 
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    
# clf.fit(X_train, y_train)
# print(clf.support_)
# print(clf.ranking_)
# # Gets same result as feature_importance