# GridSearchCV - XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

# RF
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

import xgboost as xgb


In [3]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : {}.4g ".format(metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)))
    print("AUC Score (Train): {}".format(metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [2]:
training = pd.read_csv('test/training-person-epa.csv')

In [3]:
RANDOM_SEED = 12
TEST_SIZE_PERCENT = 0.2

In [4]:
y = training['label']
X = training.drop(axis=1, labels=['label'])

## Hiper-Parameters GS CV

In [None]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(
                estimator = xgb.XGBClassifier(learning_rate =0.1, n_estimators=100, max_depth=5,
                min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=RANDOM_SEED), 
                param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch1.fit(X, y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

### min_child_weight

In [5]:
param_test2b = {
 'min_child_weight':[3,2,4,5]
}
gsearch2b = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.01, n_estimators=100, max_depth=3,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=RANDOM_SEED), 
 param_grid = param_test2b, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch2b.fit(X, y)
gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_



([mean: 0.86202, std: 0.01178, params: {'min_child_weight': 3},
  mean: 0.86165, std: 0.01170, params: {'min_child_weight': 2},
  mean: 0.86204, std: 0.01208, params: {'min_child_weight': 4},
  mean: 0.86205, std: 0.01182, params: {'min_child_weight': 5}],
 {'min_child_weight': 5},
 0.8620457722913285)

### gamma

In [12]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate =0.05, n_estimators=100, max_depth=3,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=RANDOM_SEED), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(X, y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: 0.87097, std: 0.00894, params: {'gamma': 0.0},
  mean: 0.87097, std: 0.00894, params: {'gamma': 0.1},
  mean: 0.87105, std: 0.00882, params: {'gamma': 0.2},
  mean: 0.87106, std: 0.00882, params: {'gamma': 0.3},
  mean: 0.87093, std: 0.00863, params: {'gamma': 0.4}],
 {'gamma': 0.3},
 0.8710552040578913)

In [13]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate =0.05, n_estimators=177, max_depth=3,
 min_child_weight=5, gamma=0.3, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=RANDOM_SEED), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch4.fit(X, y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.86979, std: 0.00981, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.87133, std: 0.00960, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.87122, std: 0.01046, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.87054, std: 0.01029, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.86980, std: 0.00924, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.87114, std: 0.00951, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.87087, std: 0.00891, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.87101, std: 0.00959, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.87031, std: 0.00963, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.87060, std: 0.00924, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.87091, std: 0.00921, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.87122, std: 0.00893, params: {'colsample_bytree': 0.8, 'subsample'

In [15]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate=0.05, n_estimators=177, max_depth=3,
 min_child_weight=5, gamma=0.3, subsample=0.7, colsample_bytree=0.6, reg_alpha = 1e-05,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=RANDOM_SEED), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(X, y)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: 0.87152, std: 0.00988, params: {'reg_alpha': 1e-05},
  mean: 0.87144, std: 0.00992, params: {'reg_alpha': 0.01},
  mean: 0.87154, std: 0.00989, params: {'reg_alpha': 0.1},
  mean: 0.87115, std: 0.00929, params: {'reg_alpha': 1},
  mean: 0.85978, std: 0.00814, params: {'reg_alpha': 100}],
 {'reg_alpha': 0.1},
 0.8715442074036162)

## N-Estimators

In [6]:
param_test_n = {
 'n_estimators': [x for x in range(100, 1000, 100)]
}

gsearch7 = GridSearchCV(estimator = xgb.XGBClassifier(learning_rate =0.05, max_depth=3,
 min_child_weight=5, gamma=0.3, subsample=0.7, colsample_bytree=0.6, reg_alpha = 0.1,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=RANDOM_SEED), 
 param_grid = param_test_n, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch7.fit(X, y)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_



([mean: 0.86953, std: 0.01109, params: {'n_estimators': 100},
  mean: 0.86987, std: 0.01123, params: {'n_estimators': 200},
  mean: 0.86722, std: 0.01229, params: {'n_estimators': 300},
  mean: 0.86490, std: 0.01212, params: {'n_estimators': 400},
  mean: 0.86246, std: 0.01263, params: {'n_estimators': 500},
  mean: 0.85983, std: 0.01378, params: {'n_estimators': 600},
  mean: 0.85736, std: 0.01434, params: {'n_estimators': 700},
  mean: 0.85489, std: 0.01446, params: {'n_estimators': 800},
  mean: 0.85280, std: 0.01569, params: {'n_estimators': 900}],
 {'n_estimators': 200},
 0.8698694499916506)