# Project HR

Predict attrition of your valuable employees

[IBM HR Analytics Employee Attrition & Performance](https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset)

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix

In [15]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [16]:
from IPython.display import HTML
pd.set_option('display.max_colwidth', -1)
display(HTML(df.head().to_html()))
pd.reset_option('display.max_colwidth')

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EmployeeCount               1470 non-null int64
EmployeeNumber              1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome         

In [18]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [19]:
features = ['Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']

In [20]:
df.Attrition.value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [21]:
X = df[features]
y = df.Attrition

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, 
                                                    stratify=y)

In [23]:
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [24]:
num = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',             
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager']

cat = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 
       'MaritalStatus', 'Over18', 'OverTime']

In [25]:
le_bt = preprocessing.LabelEncoder()
le_dept = preprocessing.LabelEncoder()
le_edu = preprocessing.LabelEncoder()
le_gender = preprocessing.LabelEncoder()
le_job = preprocessing.LabelEncoder()
le_marri = preprocessing.LabelEncoder()
le_over18 = preprocessing.LabelEncoder()
le_ot = preprocessing.LabelEncoder()

enc_bt = OneHotEncoder(sparse=False)
enc_dept = OneHotEncoder(sparse=False)
enc_edu = OneHotEncoder(sparse=False)
enc_gender = OneHotEncoder(sparse=False)
enc_job = OneHotEncoder(sparse=False)
enc_marri = OneHotEncoder(sparse=False)
enc_over18 = OneHotEncoder(sparse=False)
enc_ot = OneHotEncoder(sparse=False)

bt = le_bt.fit_transform(X_train.BusinessTravel)
dept = le_dept.fit_transform(X_train.Department)
edu = le_edu.fit_transform(X_train.EducationField)
gender = le_gender.fit_transform(X_train.Gender)
job = le_job.fit_transform(X_train.JobRole)
marri = le_marri.fit_transform(X_train.MaritalStatus)
over18 = le_over18.fit_transform(X_train.Over18)
ot = le_ot.fit_transform(X_train.OverTime)

bt = enc_bt.fit_transform(bt.reshape(-1,1))
dept = enc_dept.fit_transform(dept.reshape(-1,1))
edu = enc_edu.fit_transform(edu.reshape(-1,1))
gender = enc_gender.fit_transform(gender.reshape(-1,1))
job = enc_job.fit_transform(job.reshape(-1,1))
marri = enc_marri.fit_transform(marri.reshape(-1,1))
over18 = enc_over18.fit_transform(over18.reshape(-1,1))
ot = enc_ot.fit_transform(ot.reshape(-1,1))

In [26]:
X_train = pd.concat([X_train[num],
                     pd.DataFrame(np.hstack((bt, dept, edu, gender,
                                             job, marri, over18, ot)))], axis=1)

In [27]:
X_train.shape

(1333, 55)

In [28]:
comb = pd.concat([X_train, y_train], axis=1)
comb = comb.dropna()
comb.shape

(725, 56)

In [29]:
y_train = comb['Attrition']

In [30]:
X_train = comb[X_train.columns]

In [31]:
test_bt = le_bt.transform(X_test.BusinessTravel)
test_dept = le_dept.transform(X_test.Department)
test_edu = le_edu.transform(X_test.EducationField)
test_gender = le_gender.transform(X_test.Gender)
test_job = le_job.transform(X_test.JobRole)
test_marri = le_marri.transform(X_test.MaritalStatus)
test_over18 = le_over18.transform(X_test.Over18)
test_ot = le_ot.transform(X_test.OverTime)

test_bt = enc_bt.fit_transform(test_bt.reshape(-1,1))
test_dept = enc_dept.fit_transform(test_dept.reshape(-1,1))
test_edu = enc_edu.fit_transform(test_edu.reshape(-1,1))
test_gender = enc_gender.fit_transform(test_gender.reshape(-1,1))
test_job = enc_job.fit_transform(test_job.reshape(-1,1))
test_marri = enc_marri.fit_transform(test_marri.reshape(-1,1))
test_over18 = enc_over18.fit_transform(test_over18.reshape(-1,1))
test_ot = enc_ot.fit_transform(test_ot.reshape(-1,1))

In [32]:
X_test = pd.concat([X_test[num],
                     pd.DataFrame(np.hstack((bt, dept, edu, gender,
                                             job, marri, over18, ot)))], axis=1)

In [33]:
X_test.shape

(1166, 55)

In [34]:
test_comb = pd.concat([X_test, y_test], axis=1)
test_comb = test_comb.dropna()
test_comb.shape

(304, 56)

In [35]:
y_test = test_comb['Attrition']
X_test = test_comb[X_test.columns]

In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Model 1:

In [37]:
clf = LogisticRegression(solver="lbfgs")
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[610 111]
 [  2   2]]
0.8441379310344828
[[244  57]
 [  1   2]]
0.8092105263157895


In [38]:
y_test.value_counts()

No     245
Yes     59
Name: Attrition, dtype: int64

# Model 2:

In [39]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[611  25]
 [  1  88]]
0.9641379310344828
[[238  50]
 [  7   9]]
0.8125


# Model 3:

In [40]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[612   0]
 [  0 113]]
1.0
[[200  37]
 [ 45  22]]
0.7302631578947368


# Model 4:

In [41]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[610  15]
 [  2  98]]
0.976551724137931
[[238  53]
 [  7   6]]
0.8026315789473685


***