# Data loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv('Desktop/train_hr.csv')

In [3]:
data.head(5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


# Data preprocessing

Filling null values

In [4]:
data.isna().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [5]:
data['previous_year_rating'].value_counts()

3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: previous_year_rating, dtype: int64

In [6]:
data.previous_year_rating=data.previous_year_rating.fillna(0)

In [7]:
data.education=data.education.fillna("Bachelor's")

In [8]:
data.isna().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
employee_id             54808 non-null int64
department              54808 non-null object
region                  54808 non-null object
education               54808 non-null object
gender                  54808 non-null object
recruitment_channel     54808 non-null object
no_of_trainings         54808 non-null int64
age                     54808 non-null int64
previous_year_rating    54808 non-null float64
length_of_service       54808 non-null int64
KPIs_met >80%           54808 non-null int64
awards_won?             54808 non-null int64
avg_training_score      54808 non-null int64
is_promoted             54808 non-null int64
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


Converting categorical values to numerical values

In [10]:
data1=pd.get_dummies(data)

In [11]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 60 columns):
employee_id                     54808 non-null int64
no_of_trainings                 54808 non-null int64
age                             54808 non-null int64
previous_year_rating            54808 non-null float64
length_of_service               54808 non-null int64
KPIs_met >80%                   54808 non-null int64
awards_won?                     54808 non-null int64
avg_training_score              54808 non-null int64
is_promoted                     54808 non-null int64
department_Analytics            54808 non-null uint8
department_Finance              54808 non-null uint8
department_HR                   54808 non-null uint8
department_Legal                54808 non-null uint8
department_Operations           54808 non-null uint8
department_Procurement          54808 non-null uint8
department_R&D                  54808 non-null uint8
department_Sales & Marketing    54808 non

Balancing dataset

In [12]:
data1.shape

(54808, 60)

In [13]:
data1['is_promoted'].value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

# Oversampling dataset

In [14]:
X=data1.drop(columns=['is_promoted'])

In [15]:
X.shape

(54808, 59)

In [16]:
y=data1['is_promoted']

In [17]:
from imblearn.over_sampling import SMOTE
x_over_sample,y_over_sample=SMOTE().fit_sample(X,y)

In [18]:
x_over_sample.shape

(100280, 59)

In [19]:
y_over_sample.shape

(100280,)

In [20]:
y_over_sample.value_counts()

1    50140
0    50140
Name: is_promoted, dtype: int64

# Splitting dataset

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
x_over_sample=x_over_sample.drop(columns=['employee_id'])

In [23]:
X_train,X_test,y_train,y_test=train_test_split(x_over_sample,y_over_sample,test_size=0.3,random_state=5)

In [24]:
X_train.shape

(70196, 58)

In [25]:
X_test.shape

(30084, 58)

In [26]:
y_test.value_counts()

1    15052
0    15032
Name: is_promoted, dtype: int64

# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

In [29]:
logit=LogisticRegression()

In [30]:
logit.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
y_pred1=logit.predict(X_test)

In [32]:
f1_score(y_test,y_pred1)

0.9459626901631444

In [33]:
confusion_matrix(y_test,y_pred1)

array([[14772,   260],
       [ 1310, 13742]], dtype=int64)

# Random forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

In [36]:
rft=RandomForestClassifier(random_state=50,
                           max_depth= 20 ,
                           min_samples_split= 5 ,
                           n_estimators= 500,
                           n_jobs=-1, 
                           min_samples_leaf=8,  
                           criterion='entropy',
  
                           class_weight= {0: 0.8, 1:1})
rft.fit(X_train,y_train) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 0.8, 1: 1}, criterion='entropy',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=8,
                       min_samples_split=5, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=-1, oob_score=False,
                       random_state=50, verbose=0, warm_start=False)

In [37]:
y_pred2=rft.predict(X_test)

In [38]:
f1_score(y_test,y_pred2)

0.955014187833068

In [39]:
confusion_matrix(y_test,y_pred2)

array([[14985,    47],
       [ 1253, 13799]], dtype=int64)

In [40]:
pd.Series(rft.feature_importances_,index=X_train.columns).sort_values(ascending=False)*100

previous_year_rating            14.088861
avg_training_score              12.880429
recruitment_channel_other        6.405745
recruitment_channel_sourcing     5.974482
education_Master's & above       5.548891
education_Bachelor's             5.160021
gender_m                         5.138811
gender_f                         4.974519
department_Operations            4.361524
department_Sales & Marketing     4.267278
department_Procurement           3.685431
department_Technology            3.580374
KPIs_met >80%                    3.568230
department_Analytics             3.326496
region_region_2                  3.081683
age                              1.526835
department_Finance               1.365254
no_of_trainings                  1.176959
department_HR                    1.150125
length_of_service                1.071931
region_region_22                 1.030688
department_R&D                   0.765119
recruitment_channel_referred     0.759865
region_region_7                  0

# Xg boost

In [41]:
from xgboost import XGBClassifier
classifier = XGBClassifier( learning_rate =0.1,
                            n_estimators=200,
                            max_depth=10,
                            min_child_weight=5,
                            gamma=0,
                            subsample=0.8,
                            colsample_bytree=0.6,
                            objective= 'binary:logistic',
                            nthread=4,
                            scale_pos_weight=13,
                            reg_lambda=5,
                            max_delta_step=1,
                            alpha=0,
                            base_score=0.5,
                            seed=1029)

classifier.fit(X_train, y_train)

XGBClassifier(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=1, max_depth=10,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=1029, reg_alpha=0,
              reg_lambda=5, scale_pos_weight=13, seed=1029, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
threshold = 0.88
y_pred_prob = classifier.predict_proba(X_test)[:,1]
y_pred3 = (y_pred_prob > threshold).astype(int)
 
print('acc is',accuracy_score(y_test,y_pred3))

print(f1_score(y_test,y_pred3))

confusion_matrix(y_test,y_pred3)

acc is 0.964133758808669
0.9630770283680663


array([[14933,    99],
       [  980, 14072]], dtype=int64)

# Predicting the result

Loading dataset

In [43]:
value=pd.read_csv('Desktop/test_data.csv')

In [44]:
value.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score'],
      dtype='object')

In [45]:
values=value.drop(columns=['employee_id'])

Data preprocessing

In [46]:
values.isna().sum()

department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [47]:
values['education'].value_counts()

Bachelor's          15578
Master's & above     6504
Below Secondary       374
Name: education, dtype: int64

In [48]:
values.education=value.education.fillna("Bachelor's")

In [49]:
values.previous_year_rating=value.previous_year_rating.fillna(0)

In [50]:
values.isna().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [51]:
values1=pd.get_dummies(values)

In [52]:
values1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23490 entries, 0 to 23489
Data columns (total 58 columns):
no_of_trainings                 23490 non-null int64
age                             23490 non-null int64
previous_year_rating            23490 non-null float64
length_of_service               23490 non-null int64
KPIs_met >80%                   23490 non-null int64
awards_won?                     23490 non-null int64
avg_training_score              23490 non-null int64
department_Analytics            23490 non-null uint8
department_Finance              23490 non-null uint8
department_HR                   23490 non-null uint8
department_Legal                23490 non-null uint8
department_Operations           23490 non-null uint8
department_Procurement          23490 non-null uint8
department_R&D                  23490 non-null uint8
department_Sales & Marketing    23490 non-null uint8
department_Technology           23490 non-null uint8
region_region_1                 23490 non

Predicting values

In [53]:
result=classifier.predict(values1)

Copying to excel sheet

In [55]:
submit=pd.read_csv('result.csv')

In [56]:
submit['Employee id']=value.employee_id

In [57]:
submit['is_promoted']=result

In [58]:
submit

Unnamed: 0,Employee id,is_promoted
0,8724,1
1,74430,0
2,72255,0
3,38562,0
4,64486,0
...,...,...
23485,53478,0
23486,25600,0
23487,45409,0
23488,1186,0


In [59]:
pd.DataFrame(submit,columns=['Employee id','is_promoted']).to_csv('Desktop/result.csv',index=False)