In [51]:
msg = "Welcome to model generator."
print(msg)

Welcome to model generator.


## Load necessary packages

In [52]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Importing data

In [53]:
# Importing the csv file
data = pd.read_excel('employee_training_data.xls')
# data = pd.read_excel('employee_with_PR_randomness.xlsx')
# data = pd.read_excel('employee_training_data_with_ai.xlsx')

### Data Analysis

In [54]:
data.shape

(1200, 28)

In [55]:
data.columns

Index(['EmpNumber', 'Age', 'Gender', 'EducationBackground', 'MaritalStatus',
       'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency',
       'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
       'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel',
       'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
       'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
       'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
       'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'PerformanceRating'],
      dtype='object')

In [56]:
data.head()

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


In [57]:
# Looking for missing data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   EmpNumber                     1200 non-null   object
 1   Age                           1200 non-null   int64 
 2   Gender                        1200 non-null   object
 3   EducationBackground           1200 non-null   object
 4   MaritalStatus                 1200 non-null   object
 5   EmpDepartment                 1200 non-null   object
 6   EmpJobRole                    1200 non-null   object
 7   BusinessTravelFrequency       1200 non-null   object
 8   DistanceFromHome              1200 non-null   int64 
 9   EmpEducationLevel             1200 non-null   int64 
 10  EmpEnvironmentSatisfaction    1200 non-null   int64 
 11  EmpHourlyRate                 1200 non-null   int64 
 12  EmpJobInvolvement             1200 non-null   int64 
 13  EmpJobLevel       

## Feature Selection    
-     There are a lot of columns in the predictor variable. So, the correlation coeffecient is calculated to see which of them are important and these are then used for training methods. From there, we also get the top factors which affect performance. We can see that the most important features selectd were Department, Job Role, Environment Satisfaction, Last Salary Hike Percent, Work Life Balance, Experience Years At This Company, Experience Years In Current Role, Years Since Last Promotion, Years With Current Manager. These were selected because their correlation coeffecient with Performance Rating was more than 0.1. 
-     Standardization and Label Encoding was also used for feature transformation.
-     A separate analysis considering all the predictors was carried out but it resulted in decreasing the accuracy. Similarly, Principal Component Analysis also reduces the accuracy.
-     Top 3 factors which affect the employee performance are 1. Employee EnvironmentSatisfaction, 2. Employee Last Salary Hike Percent and 3. Years Since Last Promotion

In [58]:
# Encoding all the ordinal columns and creating a dummy variable for them to see if there are any effects on Performance Rating
enc = LabelEncoder()
for i in (2,3,4,5,6,7,16,26):
    data.iloc[:,i] = enc.fit_transform(data.iloc[:,i])
data.head()

# Dropping the first columns as it is of no use for analysis.
data.drop(['EmpNumber'],inplace=True,axis=1)

In [59]:
# Finding out the correlation coeffecient to find out which predictors are significant.
data_corr = data.corr()

In [60]:
data.head()

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,1,2,2,5,13,2,10,3,4,...,4,10,2,2,10,7,0,8,0,3
1,47,1,2,2,5,13,2,14,4,4,...,4,20,2,3,7,7,1,7,0,3
2,40,1,1,1,5,13,1,5,4,4,...,3,20,2,3,18,13,1,12,0,4
3,41,1,0,0,3,8,2,10,4,2,...,2,23,2,2,21,6,12,6,0,3
4,60,1,2,2,5,13,2,16,4,1,...,4,10,1,3,2,2,2,2,0,3


In [61]:
# Selecting only the important columns
y = data.PerformanceRating
# y = data.sd
X = data.iloc[:,[4,9,16,20,21,22,23,24]] # Taking only variables with correlation coeffecient greater than 0.1
X.head()

Unnamed: 0,EmpDepartment,EmpEnvironmentSatisfaction,EmpLastSalaryHikePercent,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,5,4,12,2,10,7,0,8
1,5,4,12,3,7,7,1,7
2,5,4,21,3,18,13,1,12
3,3,2,15,2,21,6,12,6
4,5,1,14,3,2,2,2,2


In [62]:
# Splitting into train and test for calculating the accuracy
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=10)

In [63]:
# Standardization technique is used
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_train.shape

(840, 8)

In [64]:
# Checking data used for testing
X_test.shape

(360, 8)

### ML Models

* we are using different models to find the most accurate one.
  * Logistic Regression
  * Support Vector Machine
  * Decision Tree
  * Random Forest
  * Naive Bayes
  * K-Nearest Neighbour
  * XGBoost Classifier
  * Artificial Neural Network

#### 1. Logistic Regression

In [65]:
# Training the model
from sklearn.linear_model import LogisticRegression
model_logr = LogisticRegression()
model_logr.fit(X_train,y_train)

In [66]:
# Predicting the model
y_predict_log = model_logr.predict(X_test)

In [67]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_log))
print(classification_report(y_test,y_predict_log))

0.8333333333333334
              precision    recall  f1-score   support

           2       0.66      0.49      0.56        63
           3       0.87      0.94      0.90       264
           4       0.79      0.67      0.72        33

    accuracy                           0.83       360
   macro avg       0.77      0.70      0.73       360
weighted avg       0.82      0.83      0.82       360



In [68]:
confusion_matrix(y_test,y_predict_log)

array([[ 31,  29,   3],
       [ 14, 247,   3],
       [  2,   9,  22]])

#### 2. Support Vector Machine

In [69]:
# Training the model
from sklearn.svm import SVC
rbf_svc = SVC(kernel='rbf', C=100, random_state=10).fit(X_train,y_train)

In [70]:
# Predicting the model
y_predict_svm = rbf_svc.predict(X_test)

In [71]:
# Predicting the model
y_predict_svm = rbf_svc.predict(X_test)

In [72]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_svm))
print(classification_report(y_test,y_predict_svm))

0.8666666666666667
              precision    recall  f1-score   support

           2       0.78      0.81      0.80        63
           3       0.93      0.90      0.92       264
           4       0.59      0.70      0.64        33

    accuracy                           0.87       360
   macro avg       0.77      0.80      0.78       360
weighted avg       0.87      0.87      0.87       360



In [73]:
confusion_matrix(y_test,y_predict_svm)

array([[ 51,  11,   1],
       [ 11, 238,  15],
       [  3,   7,  23]])

#### 3. Decision Tree with GridSearchCV

In [74]:
# Training the model
from sklearn.tree import DecisionTreeClassifier

classifier_dtg=DecisionTreeClassifier(random_state=42,splitter='best')
parameters=[{'min_samples_split':[2,3,4,5],'criterion':['gini']},{'min_samples_split':[2,3,4,5],'criterion':['entropy']}]

model_griddtree=GridSearchCV(estimator=classifier_dtg, param_grid=parameters, scoring='accuracy',cv=10)
model_griddtree.fit(X_train,y_train)

In [75]:
model_griddtree.best_params_

{'criterion': 'entropy', 'min_samples_split': 4}

In [76]:
# Predicting the model
y_predict_dtree = model_griddtree.predict(X_test)

In [77]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_dtree))
print(classification_report(y_test,y_predict_dtree))

0.8916666666666667
              precision    recall  f1-score   support

           2       0.85      0.79      0.82        63
           3       0.93      0.94      0.93       264
           4       0.69      0.73      0.71        33

    accuracy                           0.89       360
   macro avg       0.82      0.82      0.82       360
weighted avg       0.89      0.89      0.89       360



In [78]:
confusion_matrix(y_test,y_predict_dtree)

array([[ 50,  12,   1],
       [  7, 247,  10],
       [  2,   7,  24]])

#### 4. Random Forest with GridSearchCV

In [79]:
# Training the model
from sklearn.ensemble import RandomForestClassifier

classifier_rfg=RandomForestClassifier(random_state=33,n_estimators=23)
parameters=[{'min_samples_split':[2,3,4,5],'criterion':['gini','entropy'],'min_samples_leaf':[1,2,3]}]

model_gridrf=GridSearchCV(estimator=classifier_rfg, param_grid=parameters, scoring='accuracy',cv=10)
model_gridrf.fit(X_train,y_train)

In [80]:
model_gridrf.best_params_

{'criterion': 'entropy', 'min_samples_leaf': 3, 'min_samples_split': 2}

In [81]:
# Predicting the model
y_predict_rf = model_gridrf.predict(X_test)

In [82]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_rf))
print(classification_report(y_test,y_predict_rf))

0.925
              precision    recall  f1-score   support

           2       0.90      0.89      0.90        63
           3       0.94      0.96      0.95       264
           4       0.79      0.70      0.74        33

    accuracy                           0.93       360
   macro avg       0.88      0.85      0.86       360
weighted avg       0.92      0.93      0.92       360



In [83]:
confusion_matrix(y_test,y_predict_rf)

array([[ 56,   7,   0],
       [  4, 254,   6],
       [  2,   8,  23]])

#### 5. Naive Bayes Bernoulli

In [84]:
# Training the model
from sklearn.naive_bayes import BernoulliNB
model_nb = BernoulliNB()
model_nb.fit(X_train,y_train)

In [85]:
# Predicting the model
y_predict_nb = model_nb.predict(X_test)

In [86]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_nb))
print(classification_report(y_test,y_predict_nb))

0.8027777777777778
              precision    recall  f1-score   support

           2       0.77      0.52      0.62        63
           3       0.81      0.97      0.88       264
           4       0.00      0.00      0.00        33

    accuracy                           0.80       360
   macro avg       0.53      0.50      0.50       360
weighted avg       0.73      0.80      0.76       360



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
confusion_matrix(y_test,y_predict_nb)

array([[ 33,  30,   0],
       [  8, 256,   0],
       [  2,  31,   0]])

#### 6. K-Nearest Neighbor

In [88]:
# Training the model
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors=10,metric='euclidean') # Maximum accuracy for n=10
model_knn.fit(X_train,y_train)

In [89]:
# Predicting the model
y_predict_knn = model_knn.predict(X_test)

In [90]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_knn))
print(classification_report(y_test,y_predict_knn))

0.8333333333333334
              precision    recall  f1-score   support

           2       0.69      0.60      0.64        63
           3       0.87      0.93      0.90       264
           4       0.77      0.52      0.62        33

    accuracy                           0.83       360
   macro avg       0.78      0.68      0.72       360
weighted avg       0.83      0.83      0.83       360



In [91]:
confusion_matrix(y_test,y_predict_knn)

array([[ 38,  23,   2],
       [ 16, 245,   3],
       [  1,  15,  17]])

#### 7. XGBoost Classifier

In [92]:
# Training the model
from xgboost import XGBClassifier
model_xgb = XGBClassifier()
le = LabelEncoder()
xgboost_y = le.fit_transform(y_train)
model_xgb.fit(X_train,xgboost_y)

In [93]:
# Predicting the model
y_predict_xgb = model_xgb.predict(X_test)

In [94]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_xgb))
print(classification_report(y_test,y_predict_xgb))

0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00      63.0
           3       0.00      0.00      0.00     264.0
           4       0.00      0.00      0.00      33.0

    accuracy                           0.00     360.0
   macro avg       0.00      0.00      0.00     360.0
weighted avg       0.00      0.00      0.00     360.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
confusion_matrix(y_test,y_predict_xgb)

array([[  0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0],
       [ 54,   9,   0,   0,   0],
       [  8, 250,   6,   0,   0],
       [  2,   6,  25,   0,   0]])

#### 8. Artificial Neural Network

In [96]:
# Training the model
from sklearn.neural_network import MLPClassifier
model_mlp = MLPClassifier(hidden_layer_sizes=(100,100,100),batch_size=10,learning_rate_init=0.01,max_iter=2000,random_state=10)
model_mlp.fit(X_train,y_train)

In [97]:
# Predicting the model
y_predict_mlp = model_mlp.predict(X_test)

In [98]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_mlp))
print(classification_report(y_test,y_predict_mlp))

0.9083333333333333
              precision    recall  f1-score   support

           2       0.84      0.84      0.84        63
           3       0.93      0.95      0.94       264
           4       0.82      0.70      0.75        33

    accuracy                           0.91       360
   macro avg       0.87      0.83      0.85       360
weighted avg       0.91      0.91      0.91       360



In [99]:
confusion_matrix(y_test,y_predict_mlp)

array([[ 53,  10,   0],
       [  8, 251,   5],
       [  2,   8,  23]])

In [100]:
# Exporting the trained model
import joblib
joblib.dump(model_gridrf,'employee_performance.ml')

['employee_performance.ml']