In [467]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [468]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [469]:
train = pd.read_csv('./train_LZdllcl.csv')
test = pd.read_csv('./test_2umaH9m.csv')

In [470]:
train['index'] = train.index
test['index'] = test.index #For later combining purposes
total = train.append(test, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


# Fill missing values

In [471]:
total.education = total.education.fillna('Unknown')
total.previous_year_rating.fillna(total.previous_year_rating.mean(), inplace=True)

# Preparing the data

In [472]:
total_prepared = pd.get_dummies(data=total, columns=['department','education', 'previous_year_rating'
                                                    ])

In [473]:
total = total_prepared.copy()

In [474]:
total['avg_training_score_grouped'] = pd.cut(total.avg_training_score, bins=[0,90,101], labels=['0', '1'])
total['length_of_service_grouped'] = pd.cut(total.length_of_service, bins=[0,20,200], labels=['0', '1'])

In [475]:
total.columns

Index(['KPIs_met >80%', 'age', 'avg_training_score', 'awards_won?',
       'employee_id', 'gender', 'index', 'is_promoted', 'length_of_service',
       'no_of_trainings', 'recruitment_channel', 'region',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'education_Bachelor's',
       'education_Below Secondary', 'education_Master's & above',
       'education_Unknown', 'previous_year_rating_1.0',
       'previous_year_rating_2.0', 'previous_year_rating_3.0',
       'previous_year_rating_3.3322185677565574', 'previous_year_rating_4.0',
       'previous_year_rating_5.0', 'avg_training_score_grouped',
       'length_of_service_grouped'],
      dtype='object')

In [487]:
MODEL_FEATURES = ['KPIs_met >80%', 'age', 'avg_training_score', 'awards_won?',
        'length_of_service',
       'department_Analytics', 'department_Finance','department_Operations', 'department_Procurement',
       'department_Technology',
        'previous_year_rating_1.0',
        'previous_year_rating_4.0',
       'previous_year_rating_5.0'
        ]

In [488]:
train_dat = total[total.index < 54808]
train_dat.shape
test_dat = total[total.index >= 54808]

(54808, 33)

In [489]:
train_dat[MODEL_FEATURES].dtypes
test_dat[MODEL_FEATURES].dtypes

KPIs_met >80%               int64
age                         int64
avg_training_score          int64
awards_won?                 int64
length_of_service           int64
department_Analytics        uint8
department_Finance          uint8
department_Operations       uint8
department_Procurement      uint8
department_Technology       uint8
previous_year_rating_1.0    uint8
previous_year_rating_4.0    uint8
previous_year_rating_5.0    uint8
dtype: object

KPIs_met >80%               int64
age                         int64
avg_training_score          int64
awards_won?                 int64
length_of_service           int64
department_Analytics        uint8
department_Finance          uint8
department_Operations       uint8
department_Procurement      uint8
department_Technology       uint8
previous_year_rating_1.0    uint8
previous_year_rating_4.0    uint8
previous_year_rating_5.0    uint8
dtype: object

In [490]:
strat_train = train_dat[train_dat.is_promoted == 1.0].append(train_dat[train_dat.is_promoted == 0.0].sample(5000))

In [491]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(strat_train[MODEL_FEATURES], strat_train['is_promoted'])

In [492]:
y_train.mean(), y_test.mean()

(0.4862777547924424, 0.4724865535788167)

In [493]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
f1_score(y_test, preds)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

0.8021764477263895

In [494]:
for x,y in zip(MODEL_FEATURES,clf.feature_importances_):
    print(x, y)

KPIs_met >80% 0.15984710153193002
age 0.03777331097662819
avg_training_score 0.4257844451223491
awards_won? 0.04152586492725684
length_of_service 0.02699827310296277
department_Analytics 0.03710869100881556
department_Finance 0.0277484758797568
department_Operations 0.05598457008897381
department_Procurement 0.04553896535164717
department_Technology 0.0369736851100298
previous_year_rating_1.0 0.02531588735462804
previous_year_rating_4.0 0.009298746562042447
previous_year_rating_5.0 0.07010198298297929


In [495]:
clf = RandomForestClassifier()
clf.fit(strat_train[MODEL_FEATURES], strat_train['is_promoted'])
preds = clf.predict(test_dat[MODEL_FEATURES])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [496]:
ans = pd.DataFrame({'employee_id':test_dat['employee_id'],
                   'is_promoted':preds})
ans.is_promoted = ans.is_promoted.astype(int)

In [497]:
ans.to_csv('predict5.csv', index=False)

In [498]:
ans.mean()

employee_id    39041.399149
is_promoted        0.272840
dtype: float64

In [297]:
total.is_promoted.mean() #HEAVILY SKEWED CLASS! DAMN.

0.08517004816815063