In [440]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

In [419]:
df = pd.read_csv("train_LZdllcl.csv")

In [430]:
df1 = df.copy()
df1 = df1[df1['length_of_service']<=15]
df1 = df1[df1['age']<55]

In [432]:
from sklearn.preprocessing import LabelEncoder

encoder1 = LabelEncoder()
encoder2 = LabelEncoder()
encoder3 = LabelEncoder()
encoder4 = LabelEncoder()
encoder5 = LabelEncoder()
df1['department'] = encoder1.fit_transform(df1['department'])
df1['region'] = encoder2.fit_transform(df1['region'])
df1['education'] = encoder3.fit_transform(df1['education'])
df1['gender'] = encoder4.fit_transform(df1['gender'])
df1['recruitment_channel'] = encoder5.fit_transform(df1['recruitment_channel'])

In [433]:
## removing null values
df1 = df1[df1['education']!=3]
df1 = df1[df1['previous_year_rating'].isnull()==False]

In [435]:
df1.drop(columns=['employee_id'], inplace=True)
df1.reset_index(drop = True, inplace= True)

In [437]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_x = pd.DataFrame(scaler.fit_transform(df1.drop(columns=['is_promoted'])), columns=df1.drop(columns=['is_promoted']).columns)
df_x['is_promoted'] = df1['is_promoted']

In [None]:
for column in df_x.columns:
    if pd.api.types.is_numeric_dtype(df_x[column]):
        print(column)
        print(df_x[column].skew())
        fig, axs = plt.subplots(2,1, figsize=(8,8))
        axs[0].hist(df_x[column], density = True)
        axs[0].set_title('Histogram')

        x_mean, x_std = np.mean(df_x[column]), np.std(df_x[column])
        x_min, x_max = plt.xlim(np.min(df_x[column]), np.max(df_x[column]))
        x = np.linspace(x_min, x_max, 100)
        p = norm.pdf(x, x_mean, x_std)
        axs[0].plot(x, p, 'k', linewidth = 2)

        axs[1].boxplot(df_x[column], vert=False, patch_artist=True)
        axs[1].set_title('boxplot')

        plt.tight_layout()
        plt.show()

## XGB

In [462]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [463]:
classifier = XGBClassifier()

In [464]:
x= df_x.drop(columns=['is_promoted']).reset_index(drop=True)
y= df_x['is_promoted'].reset_index(drop=True)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [467]:
from imblearn.combine import SMOTETomek
os = SMOTETomek(sampling_strategy=0.75)
x_train_s, y_train_s  = os.fit_resample(x_train,y_train)

In [488]:
# Finding out best parameters for xgb

from sklearn.model_selection import GridSearchCV

param_grid2 = {
    'n_estimators':[50,150,250],
    'max_depth':[None, 5, 10, 15]
}

grid_search2 = GridSearchCV(estimator=classifier, param_grid=param_grid2, cv = 4, verbose=2)
grid_search2.fit(x_train_s,y_train_s)

Fitting 4 folds for each of 12 candidates, totalling 48 fits
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.2s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.1s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.1s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.1s
[CV] END ...................max_depth=None, n_estimators=150; total time=   0.3s
[CV] END ...................max_depth=None, n_estimators=150; total time=   0.3s
[CV] END ...................max_depth=None, n_estimators=150; total time=   0.2s
[CV] END ...................max_depth=None, n_estimators=150; total time=   0.3s
[CV] END ...................max_depth=None, n_estimators=250; total time=   0.6s
[CV] END ...................max_depth=None, n_estimators=250; total time=   0.5s
[CV] END ...................max_depth=None, n_estimators=250; total time=   0.5s
[CV] END ...................max_depth=None, n_es

In [489]:
grid_search2.best_params_

{'max_depth': 15, 'n_estimators': 50}

In [472]:
classifier_gs_xgb = grid_search2.estimator
classifier_gs_xgb.fit(x_train_s, y_train_s)

In [475]:
y_pred =classifier_gs_xgb.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9401427786930258

In [476]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      8323
           1       0.84      0.37      0.52       782

    accuracy                           0.94      9105
   macro avg       0.89      0.68      0.74      9105
weighted avg       0.94      0.94      0.93      9105



In [478]:
print(classification_report(y_train_s, classifier_gs_xgb.predict(x_train_s)))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     32910
           1       1.00      0.92      0.96     24614

    accuracy                           0.96     57524
   macro avg       0.97      0.96      0.96     57524
weighted avg       0.97      0.96      0.96     57524



In [477]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[8267   56]
 [ 489  293]]


In [479]:
df_sub = pd.read_csv('test_2umaH9m.csv')

In [400]:
print(encoder1.classes_)
print(encoder2.classes_)
print(encoder3.classes_)
print(encoder4.classes_)
print(encoder5.classes_)

['Analytics' 'Finance' 'HR' 'Legal' 'Operations' 'Procurement' 'R&D'
 'Sales & Marketing' 'Technology']
['region_1' 'region_10' 'region_11' 'region_12' 'region_13' 'region_14'
 'region_15' 'region_16' 'region_17' 'region_18' 'region_19' 'region_2'
 'region_20' 'region_21' 'region_22' 'region_23' 'region_24' 'region_25'
 'region_26' 'region_27' 'region_28' 'region_29' 'region_3' 'region_30'
 'region_31' 'region_32' 'region_33' 'region_34' 'region_4' 'region_5'
 'region_6' 'region_7' 'region_8' 'region_9']
["Bachelor's" 'Below Secondary' "Master's & above" nan]
['f' 'm']
['other' 'referred' 'sourcing']


In [480]:
df_sub

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23485,53478,Legal,region_2,Below Secondary,m,sourcing,1,24,3.0,1,0,0,61
23486,25600,Technology,region_25,Bachelor's,m,sourcing,1,31,3.0,7,0,0,74
23487,45409,HR,region_16,Bachelor's,f,sourcing,1,26,4.0,4,0,0,50
23488,1186,Procurement,region_31,Bachelor's,m,sourcing,3,27,,1,0,0,70


In [481]:
df_sub['department'] = encoder1.transform(df_sub['department'])
df_sub['region'] = encoder2.transform(df_sub['region'])
df_sub['education'] = encoder3.transform(df_sub['education'])
df_sub['gender'] = encoder4.transform(df_sub['gender'])
df_sub['recruitment_channel'] = encoder5.transform(df_sub['recruitment_channel'])

In [482]:
df_sub_std = df_sub.drop(columns=['employee_id'])
df_sub_std = pd.DataFrame(scaler.transform(df_sub_std), columns=df_sub_std.columns)
y_pred_sub = classifier.predict(df_sub_std)

In [485]:
df_sub['is_promoted'] =  y_pred_sub
df_sub_final = df_sub[['employee_id','is_promoted']]
df_sub_final.to_csv('sub3.csv', index=False)