In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
from sklearn import preprocessing
from xgboost import plot_importance
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training set from train.csv (path is relative to the working directory).
df = pd.read_csv("train.csv")

In [3]:
# Show (rows, columns) of the raw training frame.
df.shape

(54808, 14)

In [4]:
# Summary statistics for previous_year_rating. In the saved output the count (50684)
# is lower than the 54808 rows reported by df.shape, i.e. the column has missing values.
df['previous_year_rating'].describe()

count    50684.000000
mean         3.329256
std          1.259993
min          1.000000
25%          3.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: previous_year_rating, dtype: float64

In [5]:
# Summary of the education column. In the saved output the count (52399) is lower than
# the 54808 rows reported by df.shape, and the most frequent value ("top") is Bachelor's.
df['education'].describe()

count          52399
unique             3
top       Bachelor's
freq           36669
Name: education, dtype: object

In [6]:
## Fill missing entries with the most frequently occurring value of each column.
# education: "Bachelor's" is the top value in the describe() output above.
# previous_year_rating: 3.0 is the median in the describe() output above
# (presumably also the most frequent value - TODO confirm with value_counts()).
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: the in-place form operates on an intermediate
# selection, emits a chained-assignment warning on pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['previous_year_rating'] = df['previous_year_rating'].fillna(3.0)
df['education'] = df['education'].fillna("Bachelor's")

In [7]:
## One-hot encode the categorical columns and join them to the pass-through columns.
# Columns copied over unchanged (is_promoted is split off as the target later on).
passthrough_cols = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
                    'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'is_promoted']
# Columns expanded into indicator columns; drop_first=True omits the first level of each.
categorical_cols = ['gender', 'education', 'recruitment_channel', 'department', 'region']

encoded_parts = [pd.get_dummies(df[col], drop_first=True) for col in categorical_cols]
df1 = pd.concat([df[passthrough_cols]] + encoded_parts, axis=1)

In [8]:
# Show (rows, columns) of the encoded frame.
df1.shape

(54808, 54)

In [9]:
### Generate all pairwise interaction features (one product column for every pair of columns),
## then drop every column that is zero for all rows.

from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

# Target: the promotion flag.
Y = df1.is_promoted
# Features: every other column. `axis` is passed by keyword because pandas 2.0
# no longer accepts it positionally (drop(labels, 1) raises TypeError there).
X = df1.drop(['is_promoted'], axis=1)
 


def add_interactions(df):
    """Append pairwise interaction columns to ``df`` and drop all-zero columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. Column labels must be strings, because interaction
        names are built with ``'_'.join``.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one ``"<a>_<b>"`` column per pair of
        input columns, minus every column whose values are all zero. The row
        index of ``df`` is preserved.
    """
    base_names = list(df.columns)
    # The naming below assumes PolynomialFeatures emits the input columns first
    # and then the pairwise products in itertools.combinations order.
    pair_names = ['_'.join(pair) for pair in combinations(base_names, 2)]
    colnames = base_names + pair_names

    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    # Keep the caller's index so the result stays aligned with the target series.
    expanded = pd.DataFrame(poly.fit_transform(df), columns=colnames, index=df.index)

    # Select by position with a boolean mask rather than dropping by label:
    # joined names can collide (e.g. 'a_b'+'c' and 'a'+'b_c'), and a label-based
    # drop would then also remove a same-named column that is not all zero.
    keep_mask = (expanded != 0).any(axis=0).values
    return expanded.loc[:, keep_mask]
# Replace X with its interaction-expanded version (the saved output shows 827 columns).
X = add_interactions(X)
# Show (rows, columns) of the expanded feature frame.
X.shape

(54808, 827)

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
seed, test_size = 2, 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=test_size,
)

In [None]:
### Tune the model with a 5-fold cross-validated grid search scored on F1.
## Strategy: first grid-search max_depth / min_child_weight (together with
## n_estimators), then fix those and search the remaining parameters the same way.
### scale_pos_weight is quite important for an imbalanced dataset.

from sklearn.model_selection import GridSearchCV

param_test2b = {
    'min_child_weight': [5, 6],
    'max_depth': range(3, 10, 2),
    'n_estimators': [150, 200, 300, 400],
    # Candidates for the later search stages (enable once the above are fixed):
    #'scale_pos_weight':[1,2,3,4],
    #'colsample_bytree':[0.7,0.8],
    #'subsample':[0.7,0.8],
    #'gamma':[0,0.2,0.4]
}

# n_jobs / random_state replace the deprecated XGBClassifier aliases nthread / seed.
# NOTE(review): the estimator and the grid search each request 4 workers; confirm
# this does not oversubscribe the machine.
gsearch2b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=150,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=4,
        scale_pos_weight=3,
        random_state=27,
    ),
    param_grid=param_test2b,
    scoring='f1',
    n_jobs=4,
    # `iid=False` was dropped: the parameter was removed in scikit-learn 0.24
    # (passing it raises TypeError) and the unweighted mean over folds that it
    # requested is what current versions compute.
    cv=5,
)
gsearch2b.fit(X_train, y_train)

In [14]:
# Dump the raw cross-validation results, then the winning parameters and score.
# NOTE(review): the saved output below lists a single candidate with params {} and
# best_params_ {}, which does not match the grid defined in the search cell -
# it appears to come from a different run; re-run both cells to refresh it.
print(gsearch2b.cv_results_)
for label, value in (
    ("gsearch2b.best_params_", gsearch2b.best_params_),
    ("gsearch2b.best_score_", gsearch2b.best_score_),
):
    print(label, value)

{'mean_fit_time': array([101.97508392]), 'std_fit_time': array([27.72521067]), 'mean_score_time': array([0.22712359]), 'std_score_time': array([0.06027765]), 'params': [{}], 'split0_test_score': array([0.53658537]), 'split1_test_score': array([0.5]), 'split2_test_score': array([0.51315789]), 'split3_test_score': array([0.55434783]), 'split4_test_score': array([0.51676206]), 'mean_test_score': array([0.52417063]), 'std_test_score': array([0.01910589]), 'rank_test_score': array([1])}
gsearch2b.best_params_ {}
gsearch2b.best_score_ 0.5241706294368814
