### Importing necessary libraries

In [1]:
# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import statsmodels.api as sm

# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.feature_selection import RFE

# Importing train-test-split 
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import *

# Ignoring warning
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

### Importing the data

In [2]:
employee = pd.read_csv('employee.csv',index_col=0)
employee.head()

Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won,avg_training_score,is_promoted
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
65438,SalesMarketing,region_7,Masters,f,sourcing,1,35,5,8,1,0,49,0
65141,Operations,region_22,Bachelors,m,other,1,30,5,4,0,0,60,0
7513,SalesMarketing,region_19,Bachelors,m,sourcing,1,34,3,7,0,0,50,0
2542,SalesMarketing,region_23,Bachelors,m,other,2,39,1,10,0,0,50,0
48945,Technology,region_26,Bachelors,m,other,1,45,3,2,0,0,73,0


In [3]:
employee.nunique()

department               9
region                  34
education                4
gender                   2
recruitment_channel      3
no_of_trainings         10
age                     41
previous_year_rating     6
length_of_service       35
KPIs_met >80%            2
awards_won               2
avg_training_score      61
is_promoted              2
dtype: int64

In [4]:
employee.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54808 entries, 65438 to 51526
Data columns (total 13 columns):
department              54808 non-null object
region                  54808 non-null object
education               54808 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null int64
age                     54808 non-null int64
previous_year_rating    54808 non-null int64
length_of_service       54808 non-null int64
KPIs_met >80%           54808 non-null int64
awards_won              54808 non-null int64
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: int64(8), object(5)
memory usage: 5.9+ MB


The columns like `no_of_trainings, previous_year_rating , KPIs_met >80% , awards_won` are actualy categorical but given in the interger format ,so these columns need to be converted as objects.

### Converting numerical data into categorical

In [5]:
list_int_obj = ['no_of_trainings','previous_year_rating','KPIs_met >80%','awards_won']
for col in list_int_obj:
    employee[col] = employee[col].apply(lambda x: str(x))

In [6]:
employee.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54808 entries, 65438 to 51526
Data columns (total 13 columns):
department              54808 non-null object
region                  54808 non-null object
education               54808 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null object
age                     54808 non-null int64
previous_year_rating    54808 non-null object
length_of_service       54808 non-null int64
KPIs_met >80%           54808 non-null object
awards_won              54808 non-null object
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: int64(4), object(9)
memory usage: 5.9+ MB


### Onehot encoding for categorical variables

In [7]:
data = pd.get_dummies(employee,drop_first=True)
data.shape

(54808, 67)

In [8]:
data.head()

Unnamed: 0_level_0,age,length_of_service,avg_training_score,is_promoted,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_RandD,...,no_of_trainings_7,no_of_trainings_8,no_of_trainings_9,previous_year_rating_1,previous_year_rating_2,previous_year_rating_3,previous_year_rating_4,previous_year_rating_5,KPIs_met >80%_1,awards_won_1
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65438,35,8,49,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
65141,30,4,60,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
7513,34,7,50,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2542,39,10,50,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
48945,45,2,73,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Standardizing the data

In [9]:
df = data[['age','length_of_service','avg_training_score']]
transformed_data = (df-df.mean())/df.std()
data = data.drop(['age','length_of_service','avg_training_score'],1)
data = pd.concat([data,transformed_data],axis=1)

In [10]:
data.head()

Unnamed: 0_level_0,is_promoted,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_RandD,department_SalesMarketing,department_Technology,region_region_10,...,previous_year_rating_1,previous_year_rating_2,previous_year_rating_3,previous_year_rating_4,previous_year_rating_5,KPIs_met >80%_1,awards_won_1,age,length_of_service,avg_training_score
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65438,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0.025598,0.500455,-1.075922
65141,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,-0.627129,-0.437391,-0.25328
7513,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,-0.104947,0.265994,-1.001136
2542,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0.54778,0.969378,-1.001136
48945,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1.331052,-0.906313,0.718933


### Separating X-variables and Y-variable

In [11]:
x_data = data.drop('is_promoted',1)
y_data = data['is_promoted']

### Building base models

In [12]:
def model(clf,X_train,Y_train):
    x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.20, stratify = Y_train, random_state = 99)
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    print('Scores of the Model')
    print('Confusion Matrix of the model:')
    print(confusion_matrix(y_test,y_pred))
    print('-----------------------------------------')
    print('Accuracy Score:',accuracy_score(y_test,y_pred))
    print('-----------------------------------------')
    print('Classification report:')
    print(classification_report(y_test, y_pred))
    print('------------------------------------------')
    print('')
    print('Cross Validation using KFold:')
    kf = KFold(n_splits=5,random_state=99)
    print('Accuracy score using KFold cross validation:')
    score = cross_val_score(clf, X_train, Y_train, cv=kf, n_jobs=1)
    for i in score:
        print('cross_val_score:',i)
    print('Mean Acuuracy Score:',score.mean())

In [13]:
def r2(model):
    return 1-(model.deviance/model.null_deviance)

### Base model using Logistic Regression

In [14]:
model(LogisticRegression(),x_data,y_data)

Scores of the Model
Confusion Matrix of the model:
[[9977   51]
 [ 712  222]]
-----------------------------------------
Accuracy Score: 0.9303959131545338
-----------------------------------------
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     10028
           1       0.81      0.24      0.37       934

   micro avg       0.93      0.93      0.93     10962
   macro avg       0.87      0.62      0.67     10962
weighted avg       0.92      0.93      0.91     10962

------------------------------------------

Cross Validation using KFold:
Accuracy score using KFold cross validation:
cross_val_score: 0.9288451012588944
cross_val_score: 0.9351395730706076
cross_val_score: 0.9313993796752418
cross_val_score: 0.930389562996077
cross_val_score: 0.9294772374783322
Mean Acuuracy Score: 0.9310501708958305


### Base model using decision tree

In [15]:
model(DecisionTreeClassifier(),x_data,y_data)

Scores of the Model
Confusion Matrix of the model:
[[9385  643]
 [ 504  430]]
-----------------------------------------
Accuracy Score: 0.8953658091589126
-----------------------------------------
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     10028
           1       0.40      0.46      0.43       934

   micro avg       0.90      0.90      0.90     10962
   macro avg       0.67      0.70      0.69     10962
weighted avg       0.90      0.90      0.90     10962

------------------------------------------

Cross Validation using KFold:
Accuracy score using KFold cross validation:
cross_val_score: 0.8986498814085021
cross_val_score: 0.8991060025542784
cross_val_score: 0.8970078452837074
cross_val_score: 0.8971809141501688
cross_val_score: 0.8940790073898367
Mean Acuuracy Score: 0.8972047301572987


### Base model using Random Forest

In [16]:
model(RandomForestClassifier(),x_data,y_data)

Scores of the Model
Confusion Matrix of the model:
[[9943   85]
 [ 706  228]]
-----------------------------------------
Accuracy Score: 0.9278416347381865
-----------------------------------------
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96     10028
           1       0.73      0.24      0.37       934

   micro avg       0.93      0.93      0.93     10962
   macro avg       0.83      0.62      0.66     10962
weighted avg       0.92      0.93      0.91     10962

------------------------------------------

Cross Validation using KFold:
Accuracy score using KFold cross validation:
cross_val_score: 0.9278416347381865
cross_val_score: 0.9317642765918628
cross_val_score: 0.928024083196497
cross_val_score: 0.9264665632697746
cross_val_score: 0.9281087492017152
Mean Acuuracy Score: 0.9284410613996071


### Logistic regression model

In [17]:
data.columns

Index(['is_promoted', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_RandD', 'department_SalesMarketing',
       'department_Technology', 'region_region_10', 'region_region_11',
       'region_region_12', 'region_region_13', 'region_region_14',
       'region_region_15', 'region_region_16', 'region_region_17',
       'region_region_18', 'region_region_19', 'region_region_2',
       'region_region_20', 'region_region_21', 'region_region_22',
       'region_region_23', 'region_region_24', 'region_region_25',
       'region_region_26', 'region_region_27', 'region_region_28',
       'region_region_29', 'region_region_3', 'region_region_30',
       'region_region_31', 'region_region_32', 'region_region_33',
       'region_region_34', 'region_region_4', 'region_region_5',
       'region_region_6', 'region_region_7', 'region_region_8',
       'region_region_9', 'education_BelowSecondary', 'education_Maste

In [18]:
logm1 = sm.GLM(y_data,(sm.add_constant(x_data)), family = sm.families.Binomial())
model1 = logm1.fit()

In [19]:
r2(model1)

0.33111376922178914

In [20]:
model1.summary2()

0,1,2,3
Model:,GLM,AIC:,21486.3099
Link Function:,logit,BIC:,-575959.1175
Dependent Variable:,is_promoted,Log-Likelihood:,-10676.0
Date:,2019-04-06 12:56,LL-Null:,-15961.0
No. Observations:,54808,Deviance:,21352.0
Df Model:,66,Pearson chi2:,44000.0
Df Residuals:,54741,Scale:,1.0
Method:,IRLS,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.4052,0.2344,-44.3970,0.0000,-10.8646,-9.9459
department_Finance,7.0614,0.1604,44.0367,0.0000,6.7471,7.3757
department_HR,9.9529,0.2117,47.0164,0.0000,9.5380,10.3678
department_Legal,6.8573,0.2126,32.2473,0.0000,6.4405,7.2740
department_Operations,7.2777,0.1417,51.3680,0.0000,7.0000,7.5553
department_Procurement,4.4102,0.1058,41.6783,0.0000,4.2028,4.6176
department_RandD,-0.5515,0.1478,-3.7325,0.0002,-0.8411,-0.2619
department_SalesMarketing,10.4652,0.1883,55.5865,0.0000,10.0962,10.8342
department_Technology,1.7745,0.0766,23.1719,0.0000,1.6244,1.9246


Removing the variables that are less significant (i.e p-value < 5%).

In [21]:
x_data2 = x_data.drop(['region_region_10','region_region_11','region_region_12','region_region_13',
                      'region_region_14','region_region_15','region_region_16','region_region_21',
                      'region_region_18','region_region_19','region_region_20','region_region_2',
                      'region_region_24','region_region_26','region_region_27','region_region_3',
                      'region_region_30','region_region_31','region_region_33','region_region_5',
                       'region_region_6','region_region_8','education_BelowSecondary','gender_m',
                      'recruitment_channel_sourcing','no_of_trainings_10','no_of_trainings_5',
                      'no_of_trainings_6','no_of_trainings_7','no_of_trainings_8','no_of_trainings_9',
                      'previous_year_rating_3'],1)

In [22]:
logm2 = sm.GLM(y_data,(sm.add_constant(x_data2)), family = sm.families.Binomial())
model2 = logm2.fit()

In [23]:
r2(model2)

0.32917418210354643

In [24]:
model2.summary2()

0,1,2,3
Model:,GLM,AIC:,21484.2258
Link Function:,logit,BIC:,-576246.3726
Dependent Variable:,is_promoted,Log-Likelihood:,-10707.0
Date:,2019-04-06 12:56,LL-Null:,-15961.0
No. Observations:,54808,Deviance:,21414.0
Df Model:,34,Pearson chi2:,43900.0
Df Residuals:,54773,Scale:,1.0
Method:,IRLS,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.3998,0.1451,-71.6504,0.0000,-10.6843,-10.1153
department_Finance,7.1026,0.1598,44.4489,0.0000,6.7894,7.4158
department_HR,9.9932,0.2107,47.4235,0.0000,9.5801,10.4062
department_Legal,6.8917,0.2118,32.5459,0.0000,6.4766,7.3067
department_Operations,7.2777,0.1406,51.7480,0.0000,7.0020,7.5533
department_Procurement,4.4225,0.1044,42.3611,0.0000,4.2179,4.6271
department_RandD,-0.5375,0.1475,-3.6430,0.0003,-0.8266,-0.2483
department_SalesMarketing,10.4955,0.1879,55.8661,0.0000,10.1273,10.8637
department_Technology,1.7638,0.0751,23.4771,0.0000,1.6166,1.9110


In [25]:
x_data2 = x_data2.drop(['no_of_trainings_3','no_of_trainings_4','recruitment_channel_referred'],1)

In [26]:
logm3 = sm.GLM(y_data,(sm.add_constant(x_data2)), family = sm.families.Binomial())
model3 = logm3.fit()

In [27]:
r2(model3)

0.32891386568423187

In [28]:
model3.summary2()

0,1,2,3
Model:,GLM,AIC:,21486.5357
Link Function:,logit,BIC:,-576270.7975
Dependent Variable:,is_promoted,Log-Likelihood:,-10711.0
Date:,2019-04-06 12:56,LL-Null:,-15961.0
No. Observations:,54808,Deviance:,21423.0
Df Model:,31,Pearson chi2:,43700.0
Df Residuals:,54776,Scale:,1.0
Method:,IRLS,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.4167,0.1450,-71.8401,0.0000,-10.7009,-10.1325
department_Finance,7.1047,0.1598,44.4727,0.0000,6.7916,7.4178
department_HR,10.0017,0.2105,47.5131,0.0000,9.5891,10.4143
department_Legal,6.9002,0.2117,32.5932,0.0000,6.4853,7.3152
department_Operations,7.2822,0.1405,51.8154,0.0000,7.0068,7.5577
department_Procurement,4.4219,0.1044,42.3759,0.0000,4.2174,4.6265
department_RandD,-0.5427,0.1476,-3.6764,0.0002,-0.8321,-0.2534
department_SalesMarketing,10.4972,0.1877,55.9111,0.0000,10.1292,10.8652
department_Technology,1.7585,0.0749,23.4718,0.0000,1.6117,1.9054


### Feature Selection Using RFE

In [29]:
logreg = LogisticRegression()
rfe = RFE(logreg, 15)             # running RFE with 20 variables as output
rfe = rfe.fit(x_data2,y_data)
print(rfe.support_)           # Printing the boolean results
print(rfe.ranking_)           # Printing the ranking

[ True  True  True  True  True False  True  True False False False False
 False  True  True  True False False  True False False False  True False
 False False  True  True False False  True]
[ 1  1  1  1  1  2  1  1  6  8  7  5  9  1  1  1  3 10  1 15 11 14  1 12
 13  4  1  1 16 17  1]


In [30]:
x_data2.columns

Index(['department_Finance', 'department_HR', 'department_Legal',
       'department_Operations', 'department_Procurement', 'department_RandD',
       'department_SalesMarketing', 'department_Technology',
       'region_region_17', 'region_region_22', 'region_region_23',
       'region_region_25', 'region_region_28', 'region_region_29',
       'region_region_32', 'region_region_34', 'region_region_4',
       'region_region_7', 'region_region_9', 'education_Masters',
       'education_NotSpecified', 'no_of_trainings_2', 'previous_year_rating_1',
       'previous_year_rating_2', 'previous_year_rating_4',
       'previous_year_rating_5', 'KPIs_met >80%_1', 'awards_won_1', 'age',
       'length_of_service', 'avg_training_score'],
      dtype='object')

In [31]:
x_data3 = x_data2[['department_Finance', 'department_HR', 'department_Legal',
       'department_Operations', 'department_Procurement','department_SalesMarketing', 'department_Technology',
                  'region_region_29','region_region_32', 'region_region_34','region_region_9',
                  'previous_year_rating_1','KPIs_met >80%_1', 'awards_won_1','avg_training_score']]

In [32]:
logm4 = sm.GLM(y_data,(sm.add_constant(x_data3)), family = sm.families.Binomial())
model4 = logm4.fit()

In [33]:
r2(model4)

0.31534247925811376

In [34]:
model4.summary2()

0,1,2,3
Model:,GLM,AIC:,21887.764
Link Function:,logit,BIC:,-576012.1546
Dependent Variable:,is_promoted,Log-Likelihood:,-10928.0
Date:,2019-04-06 12:56,LL-Null:,-15961.0
No. Observations:,54808,Deviance:,21856.0
Df Model:,15,Pearson chi2:,45200.0
Df Residuals:,54792,Scale:,1.0
Method:,IRLS,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-10.3224,0.1392,-74.1444,0.0000,-10.5953,-10.0496
department_Finance,7.2525,0.1574,46.0767,0.0000,6.9440,7.5610
department_HR,10.1901,0.2077,49.0665,0.0000,9.7830,10.5971
department_Legal,7.0167,0.2091,33.5593,0.0000,6.6069,7.4265
department_Operations,7.3935,0.1375,53.7740,0.0000,7.1240,7.6630
department_Procurement,4.4651,0.1001,44.6063,0.0000,4.2689,4.6613
department_SalesMarketing,10.6130,0.1853,57.2623,0.0000,10.2498,10.9763
department_Technology,1.7225,0.0703,24.4993,0.0000,1.5847,1.8603
region_region_29,-0.6367,0.1744,-3.6504,0.0003,-0.9786,-0.2949


In [35]:
# UDF for calculating vif value
def vif_cal(input_data, dependent_col):
    vif_df = pd.DataFrame( columns = ['Var', 'Vif'])
    x_vars=input_data
    xvar_names=x_vars.columns
    for i in range(0,xvar_names.shape[0]):
        y=x_vars[xvar_names[i]] 
        x=x_vars[xvar_names.drop(xvar_names[i])]
        rsq=sm.OLS(y,x).fit().rsquared  
        vif=round(1/(1-rsq),2)
        vif_df.loc[i] = [xvar_names[i], vif]
    return vif_df.sort_values(by = 'Vif', axis=0, ascending=False, inplace=False)

In [36]:
vif_cal(x_data3,y_data)

Unnamed: 0,Var,Vif
14,avg_training_score,2.68
10,region_region_9,0.4
7,region_region_29,0.23
13,awards_won_1,0.2
8,region_region_32,0.19
9,region_region_34,0.12
0,department_Finance,0.1
1,department_HR,0.06
6,department_Technology,0.05
4,department_Procurement,0.04


- Variance influence factor is less for all variables.

### Building the final model

#### Splitting the data for validation

In [37]:
x_train, x_test, y_train, y_test = train_test_split(x_data3, y_data, test_size=0.20, stratify = y_data, random_state = 99)

In [38]:
log_final = LogisticRegression()
log_final.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Making Predictions

In [39]:
y_pred = log_final.predict(x_test)

In [40]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96     10028
           1       0.85      0.22      0.35       934

   micro avg       0.93      0.93      0.93     10962
   macro avg       0.89      0.61      0.66     10962
weighted avg       0.92      0.93      0.91     10962



In [41]:
accuracy_score(y_test,y_pred)

0.930122240467068

In [42]:
confusion_matrix(y_test,y_pred)

array([[9992,   36],
       [ 730,  204]], dtype=int64)