In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn import metrics

In [5]:
#Reading in data
train = pd.read_csv("C:/Users/Aditya/Downloads/HR/aug_train.csv", delimiter = ",")
test = pd.read_csv("C:/Users/Aditya/Downloads/HR/aug_test.csv", delimiter = ",")
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [6]:
train.info()

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

# Data wrangling
Null values wherever they are above 15% of total data,respective columns are dropped since imputing large data is counter productive and changes nature of given data. Label encoding is also carried out

In [7]:
#missing data
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
company_type,6140,0.320493
company_size,5938,0.309949
gender,4508,0.235306
major_discipline,2813,0.146832
education_level,460,0.024011
last_new_job,423,0.02208
enrolled_university,386,0.020148
experience,65,0.003393
target,0,0.0
training_hours,0,0.0


In [8]:
#missing data
total = test.isnull().sum().sort_values(ascending=False)
percent = (test.isnull().sum()/test.isnull().count()).sort_values(ascending=False)
missing_datatest = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_datatest.head(20)

Unnamed: 0,Total,Percent
company_type,634,0.297792
company_size,622,0.292156
gender,508,0.23861
major_discipline,312,0.146548
education_level,52,0.024425
last_new_job,40,0.018788
enrolled_university,31,0.014561
experience,5,0.002349
training_hours,0,0.0
relevent_experience,0,0.0


In [9]:
train = train.drop((missing_data[missing_data['Percent'] > 0.15]).index,1)
test = test.drop((missing_data[missing_data['Percent'] > 0.15]).index,1)

In [10]:
catfeats_fillnamode = \
    ['major_discipline','education_level', 'last_new_job', 'enrolled_university', 'experience']
 
train.loc[:, catfeats_fillnamode] = \
    train[catfeats_fillnamode].fillna(train[catfeats_fillnamode].mode().iloc[0])

train.info()

catfeats_fillnamode1 = \
    ['major_discipline','education_level', 'last_new_job', 'enrolled_university', 'experience']

test.loc[:, catfeats_fillnamode1] = \
    test[catfeats_fillnamode1].fillna(test[catfeats_fillnamode1].mode().iloc[0])
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   relevent_experience     19158 non-null  object 
 4   enrolled_university     19158 non-null  object 
 5   education_level         19158 non-null  object 
 6   major_discipline        19158 non-null  object 
 7   experience              19158 non-null  object 
 8   last_new_job            19158 non-null  object 
 9   training_hours          19158 non-null  int64  
 10  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 1.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 10 columns):
 #   Column                  Non-

In [11]:
s = (train.dtypes == 'object')
r = (test.dtypes == 'object')
object_cols = list(s[s].index)
object_cols = list(r[r].index)
label_encoder = LabelEncoder()
for col in object_cols:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.fit_transform(test[col])
    
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  int32  
 2   city_development_index  19158 non-null  float64
 3   relevent_experience     19158 non-null  int32  
 4   enrolled_university     19158 non-null  int32  
 5   education_level         19158 non-null  int32  
 6   major_discipline        19158 non-null  int32  
 7   experience              19158 non-null  int32  
 8   last_new_job            19158 non-null  int32  
 9   training_hours          19158 non-null  int64  
 10  target                  19158 non-null  float64
dtypes: float64(2), int32(7), int64(2)
memory usage: 1.1 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 10 columns):
 #   Column                  Non-Nu

# Buiding model

In [12]:
X = train.drop(["enrollee_id","target"],axis=1)
Y =  train["target"] 
X1 = test.drop(["enrollee_id"],axis=1)

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
FC_DT = DecisionTreeClassifier(criterion="gini")
FC_DT.fit(X_train,Y_train)
predTree = FC_DT.predict(X_test)
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(Y_test, predTree))

DecisionTrees's Accuracy:  0.6946764091858038


In [14]:
FC_RFC = RandomForestClassifier(bootstrap=True,
                       criterion='gini',  random_state=42)
FC_RFC.fit(X_train,Y_train)
preds = FC_RFC.predict(X_test)
print("Random forest's Accuracy: ", metrics.accuracy_score(Y_test, preds))

Random forest's Accuracy:  0.7479123173277662


In [15]:
from xgboost import XGBClassifier
XGBCLF_FC= XGBClassifier()
XGBCLF_FC.fit(X_train,Y_train)
y_pred = XGBCLF_FC.predict(X_test)
print("XG Boost's Accuracy: ", metrics.accuracy_score(Y_test, y_pred))

XG Boost's Accuracy:  0.7719206680584552


In [16]:
import lightgbm as lgb
model_lgb = lgb.LGBMClassifier()
model_lgb.fit(X_train,Y_train)
Predsl = model_lgb.predict(X_test)
print("LGBM's Accuracy: ", metrics.accuracy_score(Y_test, Predsl))

LGBM's Accuracy:  0.7815762004175365


In [17]:
GBoost = GradientBoostingClassifier()
GBoost.fit(X_train,Y_train)
Predcsl = GBoost.predict(X_test)
print("GBM's Accuracy: ", metrics.accuracy_score(Y_test, Predcsl))

GBM's Accuracy:  0.782098121085595


In [18]:
from catboost import CatBoostClassifier, Pool
CBoost = CatBoostClassifier()
CBoost.fit(X_train,Y_train)
Predcstl = CBoost.predict(X_test)
print("CB's Accuracy: ", metrics.accuracy_score(Y_test, Predcstl))

Learning rate set to 0.033045
0:	learn: 0.6775674	total: 300ms	remaining: 4m 59s
1:	learn: 0.6641943	total: 312ms	remaining: 2m 35s
2:	learn: 0.6517846	total: 332ms	remaining: 1m 50s
3:	learn: 0.6404161	total: 350ms	remaining: 1m 27s
4:	learn: 0.6292871	total: 362ms	remaining: 1m 11s
5:	learn: 0.6191655	total: 372ms	remaining: 1m 1s
6:	learn: 0.6095504	total: 383ms	remaining: 54.4s
7:	learn: 0.6005082	total: 395ms	remaining: 48.9s
8:	learn: 0.5921601	total: 406ms	remaining: 44.7s
9:	learn: 0.5842155	total: 417ms	remaining: 41.3s
10:	learn: 0.5773140	total: 428ms	remaining: 38.5s
11:	learn: 0.5706938	total: 441ms	remaining: 36.3s
12:	learn: 0.5645950	total: 451ms	remaining: 34.2s
13:	learn: 0.5587540	total: 462ms	remaining: 32.5s
14:	learn: 0.5534304	total: 473ms	remaining: 31s
15:	learn: 0.5488551	total: 487ms	remaining: 29.9s
16:	learn: 0.5445612	total: 498ms	remaining: 28.8s
17:	learn: 0.5404072	total: 510ms	remaining: 27.8s
18:	learn: 0.5361285	total: 522ms	remaining: 26.9s
19:	lear