# Employee Attrition Prediction using ML


In [621]:
# Importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot  as plt

In [622]:
# Reading Csv File as data frame
df = pd.read_csv(r"../CSV/Attrition.csv")

In [623]:
# Shape of Dataframe
df.shape

(1470, 35)

In [624]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [625]:
pd.set_option("display.max_rows", None,      "display.max_columns", None)
df.drop('EmployeeCount',inplace=True, axis = 1)
df.drop('Over18',inplace=True, axis = 1)
df.drop('StandardHours',inplace=True, axis = 1)
df.drop('MonthlyRate',inplace=True, axis = 1)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,94,3,2,Sales Executive,4,Single,5993,8,Yes,11,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,Male,61,2,2,Research Scientist,2,Married,5130,1,No,23,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,6,Yes,15,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,Female,56,3,1,Research Scientist,3,Married,2909,1,Yes,11,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,9,No,12,3,4,1,6,3,3,2,2,2,2


In [626]:
# for col in df.columns:
#     print('---------------------')
#     print(df[col]
#     print('---------------------')

In [627]:
# Converting categorical data into numbers using label encoder
from sklearn.preprocessing import LabelEncoder 
le  = LabelEncoder() 
df[df.select_dtypes(include=['object']).columns]  = df[df.select_dtypes(include=['object']).columns].apply(le.fit_transform)

In [628]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,1,2,0,94,3,2,7,4,2,5993,8,1,11,3,1,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,2,3,1,61,2,2,6,2,1,5130,1,0,23,4,4,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,4,4,1,92,2,1,2,3,2,2090,6,1,15,3,2,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,5,4,0,56,3,1,6,3,1,2909,1,1,11,3,3,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,7,1,1,40,3,1,2,2,1,3468,9,0,12,3,4,1,6,3,3,2,2,2,2


In [629]:
# Spliting the data into test-train data frames
from sklearn.model_selection  import train_test_split
df_train , df_test  = train_test_split(df ,  test_size  =  .2)

In [630]:
# Not Selecting the columns that have same information for all enteries
#-'DailyRate','MonthlyIncome', 'EmployeeNumber',
#-'Over18', 'StandardHours', 'MonthlyRate'
x_variable_list=['Age', 'BusinessTravel',  'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
        'NumCompaniesWorked',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager','DailyRate','MonthlyIncome', 'EmployeeNumber']
y_variable_list=['Attrition']

In [631]:
# Splitting the data frames into X and Y variables for Train and Test data set
df_train_x = df_train.loc[: ,x_variable_list]
df_train_y = df_train.iloc[: , 1 ]
df_test_x = df_test.loc[:   ,x_variable_list]
df_test_y = df_test.iloc[: , 1]

In [632]:
df.Attrition.value_counts()
# Data imbalance is present in the data set for value 1. So we will fix this later.

0    1233
1     237
Name: Attrition, dtype: int64

# Logistic Regression Model

In [633]:
from sklearn.linear_model import LogisticRegression
logreg  = LogisticRegression()
logreg.fit(df_train_x , df_train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [634]:
# Scaling tne data using StandardScaler to fix the above issu
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [635]:
# Creating a model of Logistic Regression Type
log_reg = make_pipeline(StandardScaler(), LogisticRegression())

In [636]:
# Training the model on scaled data
log_reg.fit(df_train_x, df_train_y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [637]:
# Making the prediction using the created Log Reg Model
pred_value = log_reg.predict(df_test_x)

In [638]:
# Calculating the accuracy score for Log Reg Model
from sklearn.metrics import accuracy_score
as_lr = accuracy_score(pred_value, df_test_y) * 100
as_lr

88.43537414965986

In [639]:
# Creation of Confusion Matrix for Log Reg Model
from sklearn.metrics import confusion_matrix
tab1  = confusion_matrix(pred_value , df_test_y)
tab1

array([[234,  29],
       [  5,  26]])

In [640]:
# Checking the value of AUROC for Log Reg Model
from sklearn.metrics  import  roc_auc_score
from sklearn.metrics  import  roc_curve   
log_roc_auc =  roc_auc_score(df_test_y,pred_value )
log_roc_auc

0.7259033853176112

# Create balanced class Log Reg

In [641]:
# Create a new Log Reg Model with class weight balanced
log_reg_cb = make_pipeline(StandardScaler(), LogisticRegression(class_weight='balanced'))

In [642]:
# Training the Weight balanced model Log Reg
log_reg_cb.fit(df_train_x, df_train_y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced'))])

In [643]:
# Making the prediction on Weight balanced model Log Reg 
pred_value_cb = log_reg_cb.predict(df_test_x)

In [644]:
# Calculating the accuracy score for Weight balanced model Log Reg
as_lrcb = accuracy_score(pred_value_cb , df_test_y) * 100
as_lrcb

72.78911564625851

In [645]:
# Confusion Matrix for Weight balanced model Log Reg
tab_cb  = confusion_matrix(pred_value_cb , df_test_y)
tab_cb

array([[174,  15],
       [ 65,  40]])

# Decision Tree Model

In [646]:
# Create a new Decision Tree Model with class weight balanced
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(class_weight='balanced')

In [647]:
# Training the decision tree model
dt.fit(df_train_x , df_train_y)

DecisionTreeClassifier(class_weight='balanced')

In [648]:
# Making the prediction on decision tree model
pred_test_dt =  dt.predict(df_test_x)

In [649]:
# Creating confusion matrix for Decision Tree model
tab_dt = confusion_matrix(pred_test_dt , df_test_y)
tab_dt

array([[215,  39],
       [ 24,  16]])

In [650]:
# Finding the accuracy score of Decision Tree model
as_dt = accuracy_score( pred_test_dt,df_test_y ) * 100
as_dt

78.57142857142857

# Random Forest Model

In [651]:
# Create a Random Forest Model
from sklearn.ensemble import RandomForestClassifier
rfc  = RandomForestClassifier(n_estimators= 100)

In [652]:
# Training the RFC Model and Training it on Data set
rfc.fit(df_train_x , df_train_y)
pred_rfc  =  rfc.predict(df_test_x)

In [653]:
# Creating Confusion Matrix for Random Forest
tab_rfc = confusion_matrix(pred_rfc, df_test_y)
tab_rfc

array([[236,  44],
       [  3,  11]])

In [654]:
# Finding the accuracy score of RFC Model
as_rfc =accuracy_score(pred_rfc , df_test_y) * 100
as_rfc

84.01360544217688

# Handling the Class Imbalance Manually

In [655]:
df.Attrition.value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

In [656]:
# Splitting the new balanced data frame
bal_df_train , bal_df_test  = train_test_split(df ,  test_size  =  .2)

In [657]:
bal_df_train   = pd.concat([bal_df_train,       
                          df[df.Attrition == 1]  ,
                          df[df.Attrition == 1] ,
                          df[df.Attrition == 1] ,
                          df[df.Attrition == 1] ] ) 

In [658]:
bal_df_train.Attrition.value_counts()

1    1143
0     981
Name: Attrition, dtype: int64

In [659]:
# Splitting the data frames into X and Y variables for Train and Test data set
bal_df_train_x = bal_df_train.loc[: ,x_variable_list]
bal_df_train_y = bal_df_train.iloc[: , 1 ]
bal_df_test_x = bal_df_test.loc[:   ,x_variable_list]
bal_df_test_y = bal_df_test.iloc[: , 1]

In [660]:
# Creating a new RFC with for training on balanced data set
rfc_bal  = RandomForestClassifier(n_estimators= 100)

In [661]:
# Training the new RFC and making the prediction
rfc_bal.fit(bal_df_train_x , bal_df_train_y)
bal_pred_rfc  =  rfc_bal.predict(bal_df_test_x)

In [662]:
# Creating Confusion Matrix for new RFC
bal_tab_rfc = confusion_matrix(bal_pred_rfc, bal_df_test_y)
bal_tab_rfc

array([[244,   0],
       [  8,  42]])

In [663]:
# Finding the accuracy score of new RFC Model
as_rfcb = accuracy_score(bal_pred_rfc , bal_df_test_y) * 100
as_rfcb

97.27891156462584

# ADA Boost

In [664]:
from sklearn.ensemble import AdaBoostClassifier

In [665]:
# Creating a Ada Boost classifier with RFC
rfb  = RandomForestClassifier(n_estimators= 100)
ada_rfb  = AdaBoostClassifier( rfb,n_estimators=10) 

In [666]:
# Train the ADA+RFC model on balanced dataset
ada_rfb.fit(bal_df_train_x , bal_df_train_y)

AdaBoostClassifier(base_estimator=RandomForestClassifier(), n_estimators=10)

In [667]:
pred_ada_rfb  = ada_rfb.predict(bal_df_test_x)
tab_ada_rfb = confusion_matrix(pred_ada_rfb ,bal_df_test_y)
tab_ada_rfb

array([[246,   0],
       [  6,  42]])

In [668]:
# Finding the accuracy score of ADA+RFC
as_adrf = accuracy_score(pred_ada_rfb , bal_df_test_y) * 100
as_adrf

97.95918367346938

# Cross Validation

In [669]:
from sklearn.model_selection import cross_val_score

In [670]:
# Checking the score of ADA+RFC model using CV
scores  = cross_val_score(ada_rfb , bal_df_train_x , bal_df_train_y ,cv  = 10)
scores

array([0.98591549, 0.96713615, 0.97652582, 0.98122066, 0.98113208,
       0.96698113, 0.97641509, 0.97169811, 0.97169811, 0.99056604])

In [671]:
scores.mean()

0.9769288688103464

In [672]:
scores.max()

0.9905660377358491

# Compairing all the Models accuracy score and Confusion Matrix

In [673]:
compare_df = pd.DataFrame({"Data Balanced":["No","Yes","Yes","No","Yes","Yes"],"Accuracy Score":[as_lr,as_lrcb,as_dt,as_rfc,as_rfcb,as_adrf]},index=["1. Logarithmic Regression","2. Logarithmic Regression","3. Decision Tree","4. Random Forest","5. Random Forest","6. ADA Boost+RFC"])
compare_df.head(6)

Unnamed: 0,Data Balanced,Accuracy Score
1. Logarithmic Regression,No,88.435374
2. Logarithmic Regression,Yes,72.789116
3. Decision Tree,Yes,78.571429
4. Random Forest,No,84.013605
5. Random Forest,Yes,97.278912
6. ADA Boost+RFC,Yes,97.959184


In [674]:
# Confusion Matrix for all models
print('1. Logarithmic Regression '+str(round(as_lr))+'%')
print(tab1)
print()
print('2. Logarithmic Regression '+str(round(as_lrcb))+'%')
print(tab_cb)
print()
print('3. Decision Tree '+str(round(as_dt))+'%')
print(tab_dt)
print()
print('4. Random Forest '+str(round(as_rfc))+'%')
print(tab_rfc)
print()
print('5. Random Forest '+str(round(as_rfcb))+'%')
print(bal_tab_rfc)
print()
print('6. ADA Boost+RFC '+str(round(as_adrf))+'%')
print(tab_ada_rfb)

1. Logarithmic Regression 88%
[[234  29]
 [  5  26]]

2. Logarithmic Regression 73%
[[174  15]
 [ 65  40]]

3. Decision Tree 79%
[[215  39]
 [ 24  16]]

4. Random Forest 84%
[[236  44]
 [  3  11]]

5. Random Forest 97%
[[244   0]
 [  8  42]]

6. ADA Boost+RFC 98%
[[246   0]
 [  6  42]]


# Based on the Confusion Matrix and their respective Accuracies, Random Forest and Ada Boost + Random Forest models are giving accurate results.

In [1]:
pwd

'/home/adarsh/Videos'