Your client is a large MNC and they have 9 broad verticals across the organisation. One of the problem your client is facing is around identifying the right people for promotion (only for manager position and below) and prepare them in time. Currently the process, they are following is:

They have provided multiple attributes around Employee's past and current performance along with demographics.

Problem Statement: Predict whether a potential promotee at checkpoint in the test set will be promoted or not after the evaluation process.



In [1]:
# import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load the train data
hr_train=pd.read_csv(r"C:\Users\RBI\Downloads\train_hr.csv")
hr_train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [3]:
# check the shape of data
hr_train.shape

(54808, 14)

In [4]:
hr_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


In [5]:
hr_train.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


### Check for  Missing Values

In [27]:
hr_train.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

There are missing values present in the data

In [7]:
# Filling the missing values
for value in ["education",'previous_year_rating']:
    hr_train[value].fillna(hr_train[value].mode()[0],inplace=True)

In [8]:
hr_train.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

The missing values have been filled. Mode is used to fill the values.

### Convert Categorical Variable to Numerical 

In [9]:
colname=[]
for x in hr_train.columns:
    if hr_train[x].dtype=='object':
        colname.append(x)
colname

['department', 'region', 'education', 'gender', 'recruitment_channel']

In [10]:
# using label encoder for conversion
from sklearn.preprocessing import LabelEncoder
 
le=LabelEncoder()
for x in colname:
    hr_train[x]=le.fit_transform(hr_train[x])

In [11]:
hr_train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,2,0,2,1,35,5.0,8,1,0,49,0
1,65141,4,14,0,1,0,1,30,5.0,4,0,0,60,0
2,7513,7,10,0,1,2,1,34,3.0,7,0,0,50,0
3,2542,7,15,0,1,0,2,39,1.0,10,0,0,50,0
4,48945,8,18,0,1,0,1,45,3.0,2,0,0,73,0


### Scale the Data

In [13]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
scaler.fit(X)
X= scaler.transform(X)

### Split Data into Train & Test

In [12]:
# Create X & Y in cars train
X=hr_train.values[:,0:-1]
Y=hr_train.values[:,-1]

In [14]:
from sklearn.model_selection import train_test_split
  
# split into 70:30 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, 
                                                    random_state = 10)
  

In [15]:
print(X.shape)
print(Y.shape)

(54808, 13)
(54808,)


### Model Building
Logictic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

# create a model object
classifier = LogisticRegression()

# fitting training data to the model
classifier.fit(X_train, Y_train)


Y_pred=classifier.predict(X_test)
print(Y_pred)

[0. 0. 0. ... 0. 0. 0.]


In [32]:
Y_pred_prob=classifier.predict_proba(X_test)
Y_pred_prob

array([[0.98730288, 0.01269712],
       [0.99267707, 0.00732293],
       [0.96677285, 0.03322715],
       ...,
       [0.98928198, 0.01071802],
       [0.91229088, 0.08770912],
       [0.95768392, 0.04231608]])

In [33]:
#print(list(zip(Y_test,Y_pred)))

In [34]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[14994    70]
 [ 1276   103]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     15064
         1.0       0.60      0.07      0.13      1379

    accuracy                           0.92     16443
   macro avg       0.76      0.54      0.54     16443
weighted avg       0.89      0.92      0.89     16443

Accuracy of the model:  0.9181414583713434


Recall value for Class 0- 1.00

Recall value for Class 1- 0.07

It can be seen that the model predicts very well on the class 0  but has very bad prediction on class 1. This is due to less data present for the class 1. Apply SMOTE to get equal number of data for class1.

###  Applying SMOTE

In [89]:
print("Before OverSampling, counts of label '1': ", (sum(Y_train == 1)))
print("Before OverSampling, counts of label '0': ", (sum(Y_train == 0)))
  
# import SMOTE from imblearn library
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 10,k_neighbors=5)
X_train_new, Y_train_new = sm.fit_resample(X_train, Y_train)
  
print('After OverSampling, the shape of train_X: ', (X_train_new.shape))
print('After OverSampling, the shape of train_y: ', (Y_train_new.shape))
  
print("After OverSampling, counts of label '1': ", (sum(Y_train_new == 1)))
print("After OverSampling, counts of label '0': ", (sum(Y_train_new == 0)))

Before OverSampling, counts of label '1':  3289
Before OverSampling, counts of label '0':  35076
After OverSampling, the shape of train_X:  (70152, 13)
After OverSampling, the shape of train_y:  (70152,)
After OverSampling, counts of label '1':  35076
After OverSampling, counts of label '0':  35076


### Model Building After SMOTE
#### 1. Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

# create a model object
classifier = LogisticRegression()

# fitting training data to the model
classifier.fit(X_train_new, Y_train_new)


Y_pred=classifier.predict(X_test)
print(Y_pred)

[0. 0. 0. ... 0. 0. 0.]


In [46]:
Y_pred_prob=classifier.predict_proba(X_test)
Y_pred_prob

array([[0.92051971, 0.07948029],
       [0.95217854, 0.04782146],
       [0.7884158 , 0.2115842 ],
       ...,
       [0.94001393, 0.05998607],
       [0.53943896, 0.46056104],
       [0.7540198 , 0.2459802 ]])

In [18]:
print(list(zip(Y_test,Y_pred)))

[(0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (0.0, 0.0), (1.0, 0.0), (0.0, 0.0), (1.

In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[10801  4263]
 [  427   952]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.96      0.72      0.82     15064
         1.0       0.18      0.69      0.29      1379

    accuracy                           0.71     16443
   macro avg       0.57      0.70      0.56     16443
weighted avg       0.90      0.71      0.78     16443

Accuracy of the model:  0.7147722435078757


Now the model predicts well on class 1 data. 
* Recall value for class 0- 0.72
* Recall value for class 1- 0.69
* Accuracy- 71%

#### Tunning

In [72]:
# check the error for 0.4 to 0.6 threashold
for a in np.arange(0.4,0.61,0.01):
    predict_mine = np.where(y_pred_prob[:,1] > a, 1, 0)
    cfm=confusion_matrix(Y_test, predict_mine)
    total_err=cfm[0,1]+cfm[1,0]
    print("Errors at threshold ", a, ":",total_err, " , type 2 error :", 
          cfm[1,0]," , type 1 error:", cfm[0,1])

Errors at threshold  0.4 : 5934  , type 2 error : 245  , type 1 error: 5689
Errors at threshold  0.41000000000000003 : 5812  , type 2 error : 258  , type 1 error: 5554
Errors at threshold  0.42000000000000004 : 5693  , type 2 error : 276  , type 1 error: 5417
Errors at threshold  0.43000000000000005 : 5560  , type 2 error : 292  , type 1 error: 5268
Errors at threshold  0.44000000000000006 : 5435  , type 2 error : 308  , type 1 error: 5127
Errors at threshold  0.45000000000000007 : 5320  , type 2 error : 335  , type 1 error: 4985
Errors at threshold  0.4600000000000001 : 5171  , type 2 error : 349  , type 1 error: 4822
Errors at threshold  0.4700000000000001 : 5058  , type 2 error : 369  , type 1 error: 4689
Errors at threshold  0.4800000000000001 : 4934  , type 2 error : 393  , type 1 error: 4541
Errors at threshold  0.4900000000000001 : 4818  , type 2 error : 415  , type 1 error: 4403
Errors at threshold  0.5000000000000001 : 4690  , type 2 error : 427  , type 1 error: 4263
Errors at

In [71]:
# threashold 0.49

y_pred_prob = classifier.predict_proba(X_test)
print(y_pred_prob)

 

y_pred_class=[]
for value in y_pred_prob[:,1]: 
    if value > 0.49:                # accuracy for 0.4 threashold
        y_pred_class.append(1)
    else:
        y_pred_class.append(0)
#print(y_pred_class)

[[0.92051971 0.07948029]
 [0.95217854 0.04782146]
 [0.7884158  0.2115842 ]
 ...
 [0.94001393 0.05998607]
 [0.53943896 0.46056104]
 [0.7540198  0.2459802 ]]


In [70]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cfm=confusion_matrix(Y_test,y_pred_class)
print(cfm)
acc=accuracy_score(Y_test, y_pred_class)
print("Accuracy of the model: ",acc)
print(classification_report(Y_test, y_pred_class))

[[10661  4403]
 [  415   964]]
Accuracy of the model:  0.7069877759532932
              precision    recall  f1-score   support

         0.0       0.96      0.71      0.82     15064
         1.0       0.18      0.70      0.29      1379

    accuracy                           0.71     16443
   macro avg       0.57      0.70      0.55     16443
weighted avg       0.90      0.71      0.77     16443



For threashold 0.49 the Recall & Accuracy scores are:
* Recall vlaue Class 0- 0.71
* Recall value Class 1- 0.70
* Accuracy- 71%

####  2. Descision Tree

In [48]:
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree=DecisionTreeClassifier(random_state=10,criterion='gini')

# create a model object


# fitting training data to the model
model_DecisionTree.fit(X_train_new, Y_train_new)

Y_pred=model_DecisionTree.predict(X_test)

In [49]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

# confusion matric
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[13951  1113]
 [  738   641]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94     15064
         1.0       0.37      0.46      0.41      1379

    accuracy                           0.89     16443
   macro avg       0.66      0.70      0.67     16443
weighted avg       0.90      0.89      0.89     16443

Accuracy of the model:  0.8874293012224047


For Descision Trees: 
* Recall Value Class 0- 0.94
* Recall value Class 1- 0.46
* Accuracy- 0.887

The Recall value for Class 1 is not good even tought the over all accuracy is very good.

#### 3. Random Forest

In [75]:
#predicting using the Random_Forest_Classifier
from sklearn.ensemble import RandomForestClassifier
 
model_RandomForest=RandomForestClassifier(n_estimators=100,                  
                                          random_state=10, bootstrap=True,   
                                         n_jobs=-1)                          
 
#fit the model on the data and predict the values
model_RandomForest.fit(X_train_new,Y_train_new)
 
Y_pred=model_RandomForest.predict(X_test)

In [76]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[14758   306]
 [  875   504]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96     15064
         1.0       0.62      0.37      0.46      1379

    accuracy                           0.93     16443
   macro avg       0.78      0.67      0.71     16443
weighted avg       0.92      0.93      0.92     16443

Accuracy of the model:  0.9281761235784224


For Random Forest: 
* Recall Value Class 0- 0.98
* Recall value Class 1- 0.37
* Accuracy- 0.928

The Recall value for Class 1 is very bad even tought the over all accuracy is 93%.

## Conclusion

The Logistic Regression Model with 0.49 threashold value gives the best recall value and accuracy. Hence it is the best model

For threashold 0.49 the Recall & Accuracy scores are:
* Recall vlaue Class 0- 0.71
* Recall value Class 1- 0.70
* Accuracy- 71%

# Testing Data

In [106]:
hr_test=pd.read_csv(r"C:\Users\RBI\Downloads\test_hr.csv")

In [107]:
hr_test

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23485,53478,Legal,region_2,Below Secondary,m,sourcing,1,24,3.0,1,0,0,61
23486,25600,Technology,region_25,Bachelor's,m,sourcing,1,31,3.0,7,0,0,74
23487,45409,HR,region_16,Bachelor's,f,sourcing,1,26,4.0,4,0,0,50
23488,1186,Procurement,region_31,Bachelor's,m,sourcing,3,27,,1,0,0,70


In [108]:
hr_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23490 entries, 0 to 23489
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           23490 non-null  int64  
 1   department            23490 non-null  object 
 2   region                23490 non-null  object 
 3   education             22456 non-null  object 
 4   gender                23490 non-null  object 
 5   recruitment_channel   23490 non-null  object 
 6   no_of_trainings       23490 non-null  int64  
 7   age                   23490 non-null  int64  
 8   previous_year_rating  21678 non-null  float64
 9   length_of_service     23490 non-null  int64  
 10  KPIs_met >80%         23490 non-null  int64  
 11  awards_won?           23490 non-null  int64  
 12  avg_training_score    23490 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 2.3+ MB


In [109]:
hr_test.describe()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
count,23490.0,23490.0,23490.0,21678.0,23490.0,23490.0,23490.0,23490.0
mean,39041.399149,1.254236,34.782929,3.339146,5.810387,0.358834,0.022776,63.263133
std,22640.809201,0.60091,7.679492,1.263294,4.207917,0.479668,0.149191,13.41175
min,3.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0
25%,19370.25,1.0,29.0,3.0,3.0,0.0,0.0,51.0
50%,38963.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0
75%,58690.0,1.0,39.0,4.0,7.0,1.0,0.0,76.0
max,78295.0,9.0,60.0,5.0,34.0,1.0,1.0,99.0


In [110]:
# check for duplicate values
hr_test.duplicated().sum()

0

In [111]:
hr_test.dtypes

employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
dtype: object

### Check for Missing values

In [112]:
hr_test.isnull().sum()

employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

Missing values are present in the data. Therefore replace them.

In [113]:
for value in ["education",'previous_year_rating']:
    hr_test[value].fillna(hr_test[value].mode()[0],inplace=True)

In [114]:
hr_test.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

Now the missing values are replaced.

### Converting Categorical Variables to Numerical 



In [115]:
# display the columns that needs to be converted

colname=[]
for x in hr_test.columns:
    if hr_test[x].dtype=='object':
        colname.append(x)
colname

['department', 'region', 'education', 'gender', 'recruitment_channel']

In [116]:
# For preprocessing the data
from sklearn.preprocessing import LabelEncoder
 
le=LabelEncoder()
 
for x in colname:
    hr_test[x]=le.fit_transform(hr_test[x])

In [117]:
hr_test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,8,18,0,1,2,1,24,3.0,1,1,0,77
1,74430,2,28,0,0,0,1,31,3.0,5,0,0,51
2,72255,7,4,0,1,0,1,31,1.0,4,0,0,47
3,38562,5,11,0,0,0,3,31,2.0,9,0,0,65
4,64486,1,21,0,1,2,1,30,4.0,7,0,0,61


### Rename test data as X_test_rev

In [119]:
X_test_new=hr_test

### Scale Data into with scaler.transform

In [121]:
# Scaling the data

X_test_new=scaler.transform(X_test_new)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


### Find out the Y values for test data

In [122]:
Y_pred_new=model_DecisionTree.predict(X_test_new)

In [124]:
hr_test.columns=['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score']

In [125]:
hr_test["Predictions"]=Y_pred_new


In [126]:
Y_pred_new

array([0., 0., 0., ..., 0., 0., 1.])

In [None]:
hr_test.to_excel("HR_analysis_OP.xlsx", header=True, index=False)