In [1]:
# Imports
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import cross_val_score
# from lightgbm import LGBMClassifier
import pandas as pd

In [2]:
# Read data and display sample
df = pd.read_csv('hr.csv')
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73,0


In [3]:
# Drop redundant employee_id column
df = df.drop(columns=['employee_id'])

In [4]:
# Initial Exploratory Data Analysis
for col in ['department', 'education', 'gender', 'recruitment_channel', 'no_of_trainings', 'awards_won?', 'is_promoted', 'previous_year_rating']:
    print(df[col].value_counts(dropna=False))
    print()

Sales & Marketing    16840
Operations           11348
Technology            7138
Procurement           7138
Analytics             5352
Finance               2536
HR                    2418
Legal                 1039
R&D                    999
Name: department, dtype: int64

Bachelor's          36669
Master's & above    14925
NaN                  2409
Below Secondary       805
Name: education, dtype: int64

m    38496
f    16312
Name: gender, dtype: int64

other       30446
sourcing    23220
referred     1142
Name: recruitment_channel, dtype: int64

1     44378
2      7987
3      1776
4       468
5       128
6        44
7        12
8         5
10        5
9         5
Name: no_of_trainings, dtype: int64

0    53538
1     1270
Name: awards_won?, dtype: int64

0    50140
1     4668
Name: is_promoted, dtype: int64

3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
NaN     4124
Name: previous_year_rating, dtype: int64



In [5]:
# Check for nulls in columns
for col in df.columns:
    null_count = df[col].isnull().sum()
    if null_count:
        print(col, null_count)

education 2409
previous_year_rating 4124


In [6]:
# Transform data
df['no_education'] = df.education.isna()
education_map = {'Below Secondary': 0, 'Secondary': 1, 'Bachelor\'s': 2, 'Master\'s & above': 3}
df.education = df.education.fillna('Secondary').map(education_map)

# df.gender = df.gender == 'm'
df.previous_year_rating = df.previous_year_rating.fillna(df.previous_year_rating.median())
df = pd.get_dummies(df, columns=['department', 'region', 'recruitment_channel', 'gender', 'education'])
df.head()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,no_education,department_Analytics,department_Finance,...,region_region_9,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing,gender_f,gender_m,education_0,education_1,education_2,education_3
0,1,35,5.0,8,0,49,0,False,0,0,...,0,0,0,1,1,0,0,0,0,1
1,1,30,5.0,4,0,60,0,False,0,0,...,0,1,0,0,0,1,0,0,1,0
2,1,34,3.0,7,0,50,0,False,0,0,...,0,0,0,1,0,1,0,0,1,0
3,2,39,1.0,10,0,50,0,False,0,0,...,0,1,0,0,0,1,0,0,1,0
4,1,45,3.0,2,0,73,0,False,0,0,...,0,1,0,0,0,1,0,0,1,0


In [7]:
# Extract target
X, y = df.drop(columns='is_promoted'), df.is_promoted

In [8]:
clf = BalancedRandomForestClassifier(n_jobs=-1, random_state=1, min_samples_leaf=2)
print("Cross-validation accuracy:%f" % cross_val_score(clf, X, y, scoring='balanced_accuracy').mean())

Cross-validation accuracy:0.733068
