# Train Model

In [1]:
# Importing the necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# For machine learning
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#The data that was saved is now imported     
dataset = pd.read_csv('INX_Future_Inc_Employee_Performance_Exploratory.csv')
dataset.head()

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,1,2,2,5,13,2,10,3,4,...,4,10,2,2,10,7,0,8,0,3
1,47,1,2,2,5,13,2,14,4,4,...,4,20,2,3,7,7,1,7,0,3
2,40,1,1,1,5,13,1,5,4,4,...,3,20,2,3,18,13,1,12,0,4
3,41,1,0,0,3,8,2,10,4,2,...,2,23,2,2,21,6,12,6,0,3
4,60,1,2,2,5,13,2,16,4,1,...,4,10,1,3,2,2,2,2,0,3


## Machine learning models implementation

In [3]:
#check the data-type of our target variable
#It is a categorical variable, consisting of three values (2, 3 and 4),making it a typical classification problem.
dataset['PerformanceRating'].unique()

array([3, 4, 2], dtype=int64)

In the section below,we will use the following algorithms :<br> 
K-Nearest Neighbor, <br>
Support Vector Machine, <br>
Naive Bayes Bernoulli<br>
K-Nearest Neighbor,<br>
Random Forest with GridSearchCV<br>
XGBoost Classifier

## Splitting Data into Train and Test set

In [4]:
#Setting the target and the predictors variable
X=dataset.iloc[:,[4,5,9,16,20,21,22,23,24]]# The correlation coeffecient of each variable must be greater than 0.1
y = dataset.PerformanceRating # Target Performance Rating 
#Split data into train and test sets as well as for validation and testing
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=11)
print(X_train.shape,X_test.shape)

(840, 9) (360, 9)


## A. K-Nearest Neighbors¶
K Nearest Neighbour is a simple algorithm that stores all the available cases and classifies the new data or case based on a similarity measure. It is mostly used to classifies a data point based on how its neighbours are classified.

In [5]:
# The training of the model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7,metric='euclidean') 
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

In [6]:
#the model prediction 
predict_knn = knn.predict(X_test)

In [7]:
# Finding accuracy, classification report and confusion matrix
print(accuracy_score(y_test,predict_knn))
print(classification_report(y_test,predict_knn))

0.8222222222222222
              precision    recall  f1-score   support

           2       0.66      0.36      0.47        58
           3       0.83      0.97      0.90       261
           4       0.88      0.56      0.69        41

   micro avg       0.82      0.82      0.82       360
   macro avg       0.79      0.63      0.68       360
weighted avg       0.81      0.82      0.80       360



In [8]:
confusion_matrix(y_test,predict_knn)

array([[ 21,  36,   1],
       [  7, 252,   2],
       [  4,  14,  23]], dtype=int64)

## B. Support Vector Machine
SVM is a machine learning technique that can be used for both regression and classification problems. 
It constructs a hyperplane in multi-dimensional space to separate a dataset into different classes 
in the best possible way.

In [9]:
# Training model
svc_r = SVC(kernel='rbf', C=100, random_state=8).fit(X_train,y_train)
predict_svm = svc_r.predict(X_test)#the model prediction

In [10]:
# Finding accuracy, classification report and confusion matrix
print(accuracy_score(y_test,predict_svm))
print(classification_report(y_test,predict_svm))

0.775
              precision    recall  f1-score   support

           2       0.55      0.36      0.44        58
           3       0.81      0.91      0.86       261
           4       0.74      0.49      0.59        41

   micro avg       0.78      0.78      0.78       360
   macro avg       0.70      0.59      0.63       360
weighted avg       0.76      0.78      0.76       360



In [11]:
confusion_matrix(y_test,predict_svm)

array([[ 21,  37,   0],
       [ 16, 238,   7],
       [  1,  20,  20]], dtype=int64)

## C. Naive Bayes Bernoulli
A Naive Bayes classifier is a probabilistic machine learning model that’s used for classification task.The crux of the classifier is based on the Bayes theorem.

In [12]:
# Model training
nb = BernoulliNB()
nb.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [13]:
#model prediction
predict_nb = nb.predict(X_test)

# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,predict_nb))
print(classification_report(y_test,predict_nb))

0.725
              precision    recall  f1-score   support

           2       0.00      0.00      0.00        58
           3       0.72      1.00      0.84       261
           4       0.00      0.00      0.00        41

   micro avg       0.72      0.72      0.73       360
   macro avg       0.24      0.33      0.28       360
weighted avg       0.53      0.72      0.61       360



In [14]:
confusion_matrix(y_test,predict_nb)

array([[  0,  58,   0],
       [  0, 261,   0],
       [  0,  41,   0]], dtype=int64)

## E. Random Forest with GridSearchCV
A Random Forest is an ensemble technique capable of performing both regression 
and classification tasks with the use of multiple decision trees and a technique 
called Bootstrap Aggregation. We will use grid search cross validation method to determine the optimal values to be used for the hyperparameters of our model from a specified range of values.

In [15]:
# Training the model
cls=RandomForestClassifier(random_state=34,n_estimators=10)
parameters=[{'min_samples_split':[2,3,4,5],'criterion':['gini','entropy'],'min_samples_leaf':[1,2,3]}]

model_grf=GridSearchCV(estimator=cls, param_grid=parameters, scoring='accuracy',cv=10)
model_grf.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=34, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'min_samples_split': [2, 3, 4, 5], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 3]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [16]:
model_grf.best_params_

{'criterion': 'entropy', 'min_samples_leaf': 2, 'min_samples_split': 2}

In [17]:
# model prediction
y_predict_rf = model_grf.predict(X_test)

In [18]:
# Finding accuracy, precision, recall and confusion matrix
print(accuracy_score(y_test,y_predict_rf))
print(classification_report(y_test,y_predict_rf))

0.9222222222222223
              precision    recall  f1-score   support

           2       0.94      0.79      0.86        58
           3       0.91      0.99      0.95       261
           4       1.00      0.68      0.81        41

   micro avg       0.92      0.92      0.92       360
   macro avg       0.95      0.82      0.87       360
weighted avg       0.93      0.92      0.92       360



In [19]:
confusion_matrix(y_test,y_predict_rf)

array([[ 46,  12,   0],
       [  3, 258,   0],
       [  0,  13,  28]], dtype=int64)

## F. XGBoost Classifier
XGBoost is an open source library providing a high-performance implementation of gradient boosted decision trees. An underlying C++ codebase combined with a Python interface sitting on top makes for an extremely powerful yet easy to implement package.

In [20]:
# Model training
# Training the model
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [22]:
#model prediction
y_pred_xgb = xgb_model.predict(X_test)

In [23]:
# Finding accuracy, Classification reportand confusion matrix
print(accuracy_score(y_test,y_pred_xgb ))
print(classification_report(y_test,y_pred_xgb ))

0.9138888888888889
              precision    recall  f1-score   support

           2       0.87      0.81      0.84        58
           3       0.93      0.95      0.94       261
           4       0.85      0.80      0.83        41

   micro avg       0.91      0.91      0.91       360
   macro avg       0.88      0.86      0.87       360
weighted avg       0.91      0.91      0.91       360



In [24]:
confusion_matrix(y_test,y_pred_xgb)

array([[ 47,  10,   1],
       [  7, 249,   5],
       [  0,   8,  33]], dtype=int64)

In [25]:
#Saving the data
dataset.to_csv('INX_Future_Inc_Employee_Performance_Exploratory.csv',index=False)
dataset.head()

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,1,2,2,5,13,2,10,3,4,...,4,10,2,2,10,7,0,8,0,3
1,47,1,2,2,5,13,2,14,4,4,...,4,20,2,3,7,7,1,7,0,3
2,40,1,1,1,5,13,1,5,4,4,...,3,20,2,3,18,13,1,12,0,4
3,41,1,0,0,3,8,2,10,4,2,...,2,23,2,2,21,6,12,6,0,3
4,60,1,2,2,5,13,2,16,4,1,...,4,10,1,3,2,2,2,2,0,3
