In [178]:
import numpy as np
import pandas as pd
import xgboost

In [179]:
df = pd.read_csv("train.csv")

In [180]:
from sklearn.preprocessing import LabelEncoder

In [181]:
df.corr()["Attrition"].sort_values()

YearsInCurrentRole        -0.269408
TotalWorkingYears         -0.262922
MonthlyIncome             -0.240479
YearsAtCompany            -0.230061
YearsWithCurrManager      -0.222752
Age                       -0.200596
StockOptionLevel          -0.175496
JobInvolvement            -0.159710
JobSatisfaction           -0.148012
EnvironmentSatisfaction   -0.111746
TrainingTimesLastYear     -0.105663
EmployeeNumber            -0.051512
YearsSinceLastPromotion   -0.050379
Education                 -0.041390
PercentSalaryHike         -0.012668
PerformanceRating          0.010202
NumCompaniesWorked         0.034763
CommunicationSkill         0.103379
DistanceFromHome           0.109224
Id                         0.694838
Attrition                  1.000000
Behaviour                       NaN
Name: Attrition, dtype: float64

In [182]:
labelencoder = LabelEncoder()

In [183]:
df['BusinessTravel'] = labelencoder.fit_transform(df['BusinessTravel'])
df['Department'] = labelencoder.fit_transform(df['Department'])
df['EducationField'] = labelencoder.fit_transform(df['EducationField'])
df['Gender'] = labelencoder.fit_transform(df['Gender'])
df['JobRole'] = labelencoder.fit_transform(df['JobRole'])
df['MaritalStatus'] = labelencoder.fit_transform(df['MaritalStatus'])
df['OverTime'] = labelencoder.fit_transform(df['OverTime'])


In [184]:
df = df.drop(['Id', 'Behaviour', 'EmployeeNumber', 'Gender', 'PercentSalaryHike', 'PerformanceRating'], axis = 1)

In [185]:
from sklearn.model_selection import train_test_split

In [186]:
df1, df2 = train_test_split(df, train_size = 0.8)

In [187]:
X_train = df1.drop('Attrition', axis = 1)
y_train = df1['Attrition']
X_val = df2.drop("Attrition", axis = 1)
y_val = df2['Attrition']


In [188]:
params = {
        'min_child_weight': [1, 2, 3, 5, 10],
        'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [6, 7, 8, 9 ,10]
        }


In [189]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=2000, objective='binary:logistic',
                    silent=True, nthread=1)

In [190]:
folds = 5
param_comb = 200

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )


In [191]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


In [192]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   49.6s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  5.5min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed:  9.3min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 14.2min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed: 18.0min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x00000233300E2348>,
                   error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.02, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=2000,
                                           n_jobs=1, nthread=1,
                                           objective='binary:l...
                                           seed=None, silent=True, subsample=1,
                                           verbosity=1),
                   iid='deprecated', n_iter=200, n_jobs=4,
                   param_distributions={'colsamp

In [193]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)



 All results:
{'mean_fit_time': array([11.10484958, 11.1713738 , 11.10584049,  2.90823269,  3.64565344,
        6.51561661,  3.75655699,  3.47590666,  5.51665187,  5.43866053,
        6.14357681,  6.51897001,  3.30935378,  4.79319386,  3.74359093,
        6.96218705,  2.61121907,  5.52642708,  2.7915452 ,  3.8038383 ,
        5.22622027,  3.09971266,  2.80730247,  3.87046075,  6.00893636,
        3.13362141,  4.20755177,  2.9920001 ,  6.75474215,  5.22702565,
        3.81501064,  5.5182487 ,  2.99918137,  4.14930763,  6.23672752,
        3.49186511,  3.56287389,  2.87831421,  4.35177741,  4.79657707,
        3.7212523 ,  4.13674097,  4.36413312,  5.56631908,  4.07370925,
        4.72876   ,  2.89107122,  4.21652732,  5.53021622,  3.15616088,
        5.57070751,  3.59718356,  3.45596833,  4.42157946,  3.1958488 ,
        3.53715429,  3.21400714,  5.54018884,  4.41619411,  3.01374249,
        8.12234459,  4.94066315,  5.72272611,  3.41114783,  4.18880095,
        3.54592028,  2.7189311 

In [194]:
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.1,
              learning_rate=0.02, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=2000, n_jobs=1,
              nthread=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1.0, verbosity=1)

In [195]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0.1,
              learning_rate=0.02, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=2000, n_jobs=1,
              nthread=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1.0, verbosity=1)

In [196]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [197]:
accuracy_score(y_val, model.predict(X_val))

0.9662576687116564

In [198]:
roc_auc_score(y_val, model.predict(X_val))

0.9660493827160495

In [199]:
df_test = pd.read_csv("test.csv")

In [200]:
df_test['BusinessTravel'] = labelencoder.fit_transform(df_test['BusinessTravel'])
df_test['Department'] = labelencoder.fit_transform(df_test['Department'])
df_test['EducationField'] = labelencoder.fit_transform(df_test['EducationField'])
df_test['Gender'] = labelencoder.fit_transform(df_test['Gender'])
df_test['JobRole'] = labelencoder.fit_transform(df_test['JobRole'])
df_test['MaritalStatus'] = labelencoder.fit_transform(df_test['MaritalStatus'])
df_test['OverTime'] = labelencoder.fit_transform(df_test['OverTime'])


In [201]:
df_test = df_test.drop(['Id', 'Gender', 'EmployeeNumber', 'Behaviour', 'PercentSalaryHike', 'PerformanceRating'], axis = 1)

In [202]:
pred = pd.Series(model.predict_proba(df_test)[:,1])

In [203]:
pred.to_csv("sub24.csv")