In [2]:
import os
import datetime
import numpy as np
import pandas as pd

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['SimHei']
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [5]:
data = pd.read_csv('data/train_modified.csv')

target = 'Disbursed'
index = 'ID'
data.set_index(index, inplace=True)

x_train, x_test, y_train, y_test = train_test_split(data.drop(columns=[target]), data[target],
                                                    test_size=0.3, random_state=0)

In [6]:
from xgboost import XGBClassifier

In [7]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
roc_auc_score(y_test, xgb.predict_proba(x_test)[:, 1])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

0.8358823994527216

In [9]:
params = {'n_estimators': range(10, 400, 50),
         'learning_rate': [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]}

gsearch = GridSearchCV(estimator=XGBClassifier(max_depth=6, 
                                               min_child_weight=1,
                                               gamma=0,
                                               subsample=0.8,
                                               colsample_bytree=0.8,
                                               objective='binary:logistic',
                                               seed=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1, silent=None,
       subsample=0.8, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': range(10, 400, 50), 'learning_rate': [0.0001, 0.0005, 0.001, 0.0015, 0.002, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'learning_rate': 0.05, 'n_estimators': 160}

In [10]:
params = {'max_depth': range(3, 10, 1),
         'min_child_weight': range(1, 6)}

gsearch = GridSearchCV(estimator=XGBClassifier(n_estimators=160,
                                               learning_rate=0.05,
                                               gamma=0,
                                               subsample=0.8,
                                               colsample_bytree=0.8,
                                               objective='binary:logistic',
                                               seed=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=160, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1, silent=None,
       subsample=0.8, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(3, 10), 'min_child_weight': range(1, 6)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'max_depth': 6, 'min_child_weight': 2}

In [12]:
params = {'gamma': [i/10 for i in range(0, 10)]}

gsearch = GridSearchCV(estimator=XGBClassifier(n_estimators=160,
                                               learning_rate=0.05,
                                               max_depth=6,
                                               min_child_weight=2,
                                               subsample=0.8,
                                               colsample_bytree=0.8,
                                               objective='binary:logistic',
                                               seed=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=160, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1, silent=None,
       subsample=0.8, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'gamma': 0.0}

In [14]:
params = {'subsample': [i/100 for i in range(70, 91)], 
         'colsample_bytree': [i/100 for i in range(70, 91)]}

gsearch = GridSearchCV(estimator=XGBClassifier(n_estimators=160,
                                               learning_rate=0.05,
                                               max_depth=6,
                                               min_child_weight=2,
                                               gamma=0,
                                               objective='binary:logistic',
                                               seed=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.05,
       max_delta_step=0, max_depth=6, min_child_weight=2, missing=None,
       n_estimators=160, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=1, silent=None, subsample=1,
       verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'subsample': [0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9], 'colsample_bytree': [0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'colsample_bytree': 0.86, 'subsample': 0.76}

In [17]:
params = {'reg_lambda': [1e-08, 1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-2]}

gsearch = GridSearchCV(estimator=XGBClassifier(n_estimators=160,
                                               learning_rate=0.05,
                                               subsample=0.76,
                                               colsample_bytree=0.86,
                                               max_depth=6,
                                               min_child_weight=2,
                                               gamma=0,
                                               reg_alpha=0,
                                               objective='binary:logistic',
                                               seed=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.86, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=160, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1, silent=None,
       subsample=0.76, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'reg_lambda': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'reg_lambda': 0.0001}

In [18]:
params = {'learning_rate': [0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]}

gsearch = GridSearchCV(estimator=XGBClassifier(n_estimators=160,
                                               subsample=0.76,
                                               colsample_bytree=0.86,
                                               max_depth=6,
                                               min_child_weight=2,
                                               gamma=0,
                                               reg_alpha=0,
                                               reg_lambda=0.0001,
                                               objective='binary:logistic',
                                               seed=1),
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       n_jobs=-1)
gsearch.fit(x_train, y_train)
gsearch.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.86, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=160, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=0.0001, scale_pos_weight=1, seed=1,
       silent=None, subsample=0.76, verbosity=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

{'learning_rate': 0.05}

In [19]:
xgb_best = XGBClassifier(n_estimators=160,
                         learning_rate=0.05,
                         subsample=0.76,
                         colsample_bytree=0.86,
                         max_depth=6,
                         min_child_weight=2,
                         gamma=0,
                         reg_alpha=0,
                         reg_lambda=0.0001,
                         objective='binary:logistic',
                         seed=1)
xgb_best.fit(x_train, y_train)
roc_auc_score(y_test, xgb_best.predict_proba(x_test)[:, 1])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.86, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=2, missing=None, n_estimators=160, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=0.0001, scale_pos_weight=1, seed=1,
       silent=None, subsample=0.76, verbosity=1)

0.8442342920450229