## Imports

In [1]:
from utils import get_unsplit_data, get_test_data
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import numpy as np
import xgboost
import time
import pickle
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Load the data through the project helpers imported from utils: the first call's
# result is unpacked into exactly two values (features, labels); the second
# supplies the test set.
X_train, y_train = get_unsplit_data()
X_test = get_test_data()

## Crossvalidation Function

In [2]:
def crossvalidate_XGBoost(X_t, y_t, params, K_folds, scoring='accuracy', n_jobs=4, verbose=2):
    """Grid-search an XGBClassifier with K-fold cross-validation and print a report.

    Parameters
    ----------
    X_t, y_t
        Training features and labels, passed unchanged to ``GridSearchCV.fit``.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    K_folds
        Forwarded as the ``cv`` argument of ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Forwarded as the ``scoring`` argument of ``GridSearchCV``.
    n_jobs : int, default 4
        Forwarded as the ``n_jobs`` argument of ``GridSearchCV``.
    verbose : int, default 2
        Forwarded as the ``verbose`` argument of ``GridSearchCV``.

    Returns
    -------
    None
        Results are only printed: the elapsed time in seconds, one line per
        candidate with its mean test score and twice the standard deviation of
        that score, and finally the best parameter set.
    """
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments during a long search.
    start = time.perf_counter()
    # Perform cross validation
    search = GridSearchCV(xgboost.XGBClassifier(), params, cv=K_folds,
                          scoring=scoring, n_jobs=n_jobs, verbose=verbose)
    search.fit(X_t, y_t)
    elapsed = time.perf_counter() - start

    print("Cross-validation Training Time = ", elapsed)
    print()

    print("Grid scores:")
    print()
    results = search.cv_results_
    # The loop variable is called `candidate` so that the `params` argument is
    # not shadowed inside the function.
    for mean, std, candidate in zip(results['mean_test_score'],
                                    results['std_test_score'],
                                    results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
    print()

    print("Best parameter set:")
    print(search.best_params_)
    print()

    return

## Hand-tuning

Initial hand-tuning indicates that max_depth = 5 works well with the otherwise default parameters. Early stopping indicated that overfitting begins at around 350 estimators. Start with these parameters.  

First try to determine colsample_bytree.

In [5]:
# Sweep colsample_bytree only; every other hyperparameter is pinned to one value.
K_folds = 3
params = [{
    'max_depth': [5],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0],
    'subsample': [1],
    'colsample_bytree': [.7, .75, .8, .85, .9, .95, 1.0],
    'reg_lambda': [1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.7, subsample=1, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.7, subsample=1, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.7, subsample=1, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.75, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.7, subsample=1, learning_rate=0.1, gamma=0, total= 1.3min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.75, subsample=1, learning_rate=0.1, gamma=0 


  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.7, subsample=1, learning_rate=0.1, gamma=0, total= 1.4min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.75, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.7, subsample=1, learning_rate=0.1, gamma=0, total= 1.4min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.8, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.75, subsample=1, learning_rate=0.1, gamma=0, total= 1.4min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.8, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.75, subsample=1, learning_rate=0.1, gamma=0, total= 1.4min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.8, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.75, subsample=1, learning_rate=0.1, gamma=0, total= 1.5min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.8, subsample=1, learning_rate=0.1, gamma=0, total= 1.5min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.8, subsample=1, learning_rate=0.1, gamma=0, total= 1.6min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.8, subsample=1, learning_rate=0.1, gamma=0, total= 1.5min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.9, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=1, learning_rate=0.1, gamma=0, total= 1.7min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.9, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=1, learning_rate=0.1, gamma=0, total= 1.6min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.9, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=1, learning_rate=0.1, gamma=0, total= 1.6min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.95, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.9, subsample=1, learning_rate=0.1, gamma=0, total= 1.6min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.95, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.9, subsample=1, learning_rate=0.1, gamma=0, total= 1.8min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.95, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.9, subsample=1, learning_rate=0.1, gamma=0, total= 1.7min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=1.0, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.95, subsample=1, learning_rate=0.1, gamma=0, total= 1.8min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=1.0, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.95, subsample=1, learning_rate=0.1, gamma=0, total= 1.7min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=1.0, subsample=1, learning_rate=0.1, gamma=0 


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.95, subsample=1, learning_rate=0.1, gamma=0, total= 1.8min


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=1.0, subsample=1, learning_rate=0.1, gamma=0, total= 1.9min


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=1.0, subsample=1, learning_rate=0.1, gamma=0, total= 1.9min


  if diff:
  if diff:


[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=1.0, subsample=1, learning_rate=0.1, gamma=0, total= 1.7min


[Parallel(n_jobs=4)]: Done  21 out of  21 | elapsed:  9.6min finished


Cross-validation Training Time =  713.4897327423096

Grid scores:

0.840 (+/-0.004) for {'subsample': 1, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.7, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.839 (+/-0.008) for {'subsample': 1, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.75, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.839 (+/-0.004) for {'subsample': 1, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.8, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.005) for {'subsample': 1, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.839 (+/-0.005) for {'subsample': 1, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.9, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.840 (+/-0.006) for {'subsample': 1, 'max_depth': 5, 'reg_lambda': 1, 

Find colsample_bytree = .85 works best.  

Now try subsample.

In [8]:
# Sweep subsample only, with colsample_bytree held at .85.
K_folds = 3
params = [{
    'max_depth': [5],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0],
    'subsample': [.7, .75, .8, .85, .9, .95, 1.0],
    'colsample_bytree': [.85],
    'reg_lambda': [1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.7, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.7, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.7, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.75, learning_rate=0.1, gamma=0 
[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.75, learning_rate=0.1, gamma=0, total= 1.9min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.75, learning_rate=0.1, gamma=0 
[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.7, learning_rate=0.1, gamma=

[Parallel(n_jobs=4)]: Done  21 out of  21 | elapsed: 10.6min finished


Cross-validation Training Time =  784.4040081501007

Grid scores:

0.842 (+/-0.011) for {'subsample': 0.7, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.009) for {'subsample': 0.75, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.010) for {'subsample': 0.8, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.841 (+/-0.009) for {'subsample': 0.85, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.843 (+/-0.007) for {'subsample': 0.9, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.840 (+/-0.007) for {'subsample': 0.95, 'max_depth': 5,

Find subsample = .9

Now try gamma.

In [9]:
# Sweep gamma only, with subsample = .9 and colsample_bytree = .85 held fixed.
K_folds = 3
params = [{
    'max_depth': [5],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0, .2, .4, .6, .8, 1.0, 1.2, 1.4],
    'subsample': [.9],
    'colsample_bytree': [.85],
    'reg_lambda': [1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma=0 
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma=0.2 
[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma=0, total= 1.7min
[CV] booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma=0.2 
[CV]  booster=gbtree, max_depth=5, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, learning_rate=0.1, gamma

[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed: 10.7min finished


Cross-validation Training Time =  795.1574280261993

Grid scores:

0.843 (+/-0.007) for {'subsample': 0.9, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.005) for {'subsample': 0.9, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0.2, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.004) for {'subsample': 0.9, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0.4, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.841 (+/-0.008) for {'subsample': 0.9, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0.6, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.005) for {'subsample': 0.9, 'max_depth': 5, 'reg_lambda': 1, 'gamma': 0.8, 'colsample_bytree': 0.85, 'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 350}
0.842 (+/-0.008) for {'subsample': 0.9, 'max_depth

Find gamma = 0

Now return to tuning of max_depth and min_child_weight. These are the most important parameters, so they will be evaluated together.

In [11]:
# Joint 4 x 4 grid over max_depth and min_child_weight; the rest stays fixed.
K_folds = 3
params = [{
    'max_depth': [4, 5, 6, 7],
    'min_child_weight': [1, 3, 5, 7],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0],
    'subsample': [.9],
    'colsample_bytree': [.85],
    'reg_lambda': [1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0 
[CV] max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0 
[CV] max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0 
[CV] max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=3, learning_rate=0.1, gamma=0 
[CV]  max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0, total= 1.4min
[CV] max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=3, learning_rate=0.1, gamma=0 
[CV]  max_depth=4, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1,

[CV] max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=3, learning_rate=0.1, gamma=0, total= 2.1min
[CV] max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=3, learning_rate=0.1, gamma=0, total= 2.0min
[CV] max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0, total= 2.0min
[CV] max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=7, learning_rate=0.1, gamma=0 
[CV

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 15.8min


[CV]  max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=7, learning_rate=0.1, gamma=0, total= 2.1min
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0 
[CV]  max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=7, learning_rate=0.1, gamma=0, total= 2.1min
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0 
[CV]  max_depth=6, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=7, learning_rate=0.1, gamma=0, total= 2.1min
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=3, learning_rate=0.1, gamma=0 
[CV]  max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=1, learning_rate=0.1, gamma=0, to

[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed: 23.1min finished


Cross-validation Training Time =  1592.945865869522

Grid scores:

0.840 (+/-0.004) for {'max_depth': 4, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 1, 'learning_rate': 0.1, 'n_estimators': 350}
0.836 (+/-0.007) for {'max_depth': 4, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 3, 'learning_rate': 0.1, 'n_estimators': 350}
0.835 (+/-0.008) for {'max_depth': 4, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 5, 'learning_rate': 0.1, 'n_estimators': 350}
0.838 (+/-0.007) for {'max_depth': 4, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 7, 'learning_rate': 0.1, 'n_estimators': 350}
0.843 (+/-0.007) for {'max_depth': 5, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 1, 'learning_rate': 0.1, 'n_estimators': 350}
0.839 (+/-0.005) for {'max_depth': 5, 'reg_lambd

Find max_depth = 7 and min_child_weight = 5. Ideal max_depth was at top of search range. Expand search again.

In [12]:
# Extend the max_depth range upward (7-10) and narrow min_child_weight to 4-6.
K_folds = 3
params = [{
    'max_depth': [7, 8, 9, 10],
    'min_child_weight': [4, 5, 6],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0],
    'subsample': [.9],
    'colsample_bytree': [.85],
    'reg_lambda': [1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0 
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0 
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0 
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0, total= 2.4min
[CV] max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=7, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4,

[CV] max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0, total= 3.4min
[CV] max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0, total= 3.4min
[CV] max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=5, learning_rate=0.1, gamma=0 
[CV]  max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=4, learning_rate=0.1, gamma=0, total= 3.4min
[CV] max_depth=10, reg_lambda=1, n_estimators=350, colsample_bytree=0.85, subsample=0.9, min_child_weight=6, learning_rate=0.1, gamma

[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed: 26.8min finished


Cross-validation Training Time =  1896.5005655288696

Grid scores:

0.844 (+/-0.005) for {'max_depth': 7, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 4, 'learning_rate': 0.1, 'n_estimators': 350}
0.845 (+/-0.008) for {'max_depth': 7, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 5, 'learning_rate': 0.1, 'n_estimators': 350}
0.843 (+/-0.004) for {'max_depth': 7, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 6, 'learning_rate': 0.1, 'n_estimators': 350}
0.844 (+/-0.008) for {'max_depth': 8, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 4, 'learning_rate': 0.1, 'n_estimators': 350}
0.845 (+/-0.006) for {'max_depth': 8, 'reg_lambda': 1, 'gamma': 0, 'colsample_bytree': 0.85, 'subsample': 0.9, 'min_child_weight': 5, 'learning_rate': 0.1, 'n_estimators': 350}
0.846 (+/-0.006) for {'max_depth': 8, 'reg_lamb

Take a second pass over the other parameters, keeping max_depth = 10 and min_child_weight = 6 fixed (the values used in the cells below). Search the parameter space more narrowly and validate two parameters at a time.

In [3]:
# Second pass: 5 x 5 grid over subsample and colsample_bytree, with
# max_depth = 10 and min_child_weight = 6 held fixed.
K_folds = 3
params = [{
    'max_depth': [10],
    'min_child_weight': [6],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0],
    'subsample': [.85, .88, .9, .92, .95],
    'colsample_bytree': [.80, .83, .85, .87, .90],
    'reg_lambda': [1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1 
[CV] colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1 
[CV] colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1 
[CV] colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1 
[CV]  colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.4min
[CV] colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1 
[CV]  colsample_bytree=0.8, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate

[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.95, reg_lambda=1, total= 3.3min
[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.95, reg_lambda=1, total= 3.2min
[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.95, reg_lambda=1, total= 3.2min
[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 31.2min


[CV]  colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.6min
[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.9, reg_lambda=1 
[CV]  colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.6min
[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.9, reg_lambda=1 
[CV]  colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.6min
[CV] colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.92, reg_lambda=1 
[CV]  colsample_bytree=0.85, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.9, reg_l

[CV] colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1 
[CV]  colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.8min
[CV] colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.9, reg_lambda=1 
[CV]  colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.85, reg_lambda=1, total= 4.0min
[CV] colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.9, reg_lambda=1 
[CV]  colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.8min
[CV] colsample_bytree=0.9, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.9, reg_lambda=1 

[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed: 67.9min finished


Cross-validation Training Time =  4361.710444927216

Grid scores:

0.845 (+/-0.003) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'gamma': 0, 'subsample': 0.85, 'reg_lambda': 1}
0.847 (+/-0.006) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'gamma': 0, 'subsample': 0.88, 'reg_lambda': 1}
0.847 (+/-0.005) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'gamma': 0, 'subsample': 0.9, 'reg_lambda': 1}
0.847 (+/-0.004) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'gamma': 0, 'subsample': 0.92, 'reg_lambda': 1}
0.848 (+/-0.003) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'gamma': 0, 'subsample': 0.95, 'reg_lambda': 1}
0.846 (+/-0.008) for {'n_estimators': 350, '

In [4]:
# Second pass: 5 x 5 grid over gamma and reg_lambda, with subsample = .88 and
# colsample_bytree = .83 held fixed.
K_folds = 3
params = [{
    'max_depth': [10],
    'min_child_weight': [6],
    'learning_rate': [.1],
    'n_estimators': [350],
    'gamma': [0, .05, .10, .15, .20],
    'subsample': [.88],
    'colsample_bytree': [.83],
    'reg_lambda': [.01, .05, .1, .5, 1],
}]
crossvalidate_XGBoost(X_train, y_train, params, K_folds)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01 
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01 
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01 
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0, n_esti

[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.05, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.05, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.5, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.05, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.05, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=1, total= 3.3min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rat

[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 30.5min


[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05, total= 3.3min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05, total= 3.3min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.5 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.1, n_estimators=350, learning

[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.01, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.1 
[CV]  colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learning_rate=0.1, subsample=0.88, reg_lambda=0.05, total= 3.4min
[CV] colsample_bytree=0.83, max_depth=10, min_child_weight=6, gamma=0.2, n_estimators=350, learnin

[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed: 64.4min finished


Cross-validation Training Time =  4151.135474443436

Grid scores:

0.849 (+/-0.005) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.83, 'gamma': 0, 'subsample': 0.88, 'reg_lambda': 0.01}
0.847 (+/-0.003) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.83, 'gamma': 0, 'subsample': 0.88, 'reg_lambda': 0.05}
0.848 (+/-0.005) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.83, 'gamma': 0, 'subsample': 0.88, 'reg_lambda': 0.1}
0.848 (+/-0.006) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.83, 'gamma': 0, 'subsample': 0.88, 'reg_lambda': 0.5}
0.849 (+/-0.006) for {'n_estimators': 350, 'max_depth': 10, 'min_child_weight': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.83, 'gamma': 0, 'subsample': 0.88, 'reg_lambda': 1}
0.848 (+/-0.003) for {'n_est

Tuned values: gamma = 0, subsample = .88, colsample_bytree = .83, reg_lambda = 0.01.

## Final Model

Now determine optimal learning rate and number of estimators. Fit final model to the dataset and predict test labels.

In [73]:
def crossvalidateLearningRate(X_train, y_train, maxEstimators, otherparams, cv_folds, early_stopping_rounds):
    """Run XGBoost's built-in k-fold cross-validation on the given data.

    Parameters
    ----------
    X_train, y_train
        Features and labels; wrapped together in an ``xgboost.DMatrix``.
    maxEstimators
        Upper bound on boosting rounds (passed as ``num_boost_round``).
    otherparams
        Booster parameter dict handed straight to ``xgboost.cv``.
    cv_folds
        Number of cross-validation folds (passed as ``nfold``).
    early_stopping_rounds
        Forwarded unchanged to ``xgboost.cv``.

    Returns
    -------
    The object returned by ``xgboost.cv`` when called with ``as_pandas=False``.
    """
    training_matrix = xgboost.DMatrix(X_train, label=y_train)
    return xgboost.cv(
        otherparams,
        training_matrix,
        num_boost_round=maxEstimators,
        nfold=cv_folds,
        early_stopping_rounds=early_stopping_rounds,
        as_pandas=False,
    )

# Find optimal learning rate and number of estimators.
#
# Hyperparameters fixed by the earlier tuning steps. They are defined once so
# that the cross-validation runs and the final model are guaranteed to use the
# same values.
tuned_params = dict(max_depth = 10,
                    min_child_weight = 6,
                    gamma = 0,
                    subsample = .88,
                    colsample_bytree = .83,
                    reg_lambda = .01,
                    n_jobs = 4)

rates = [.3, .1, .03, .01]
errorArray = []  # last mean CV test error reported for each rate
nEstArray = []   # number of boosting rounds reported for each rate
for rate in rates:
    t1 = time.time()
    clf = xgboost.XGBClassifier(learning_rate = rate, **tuned_params)
    params = clf.get_xgb_params()
    # Request the classification-error metric explicitly: the 'test-error-*'
    # keys read below only exist when that metric is evaluated, and the default
    # metric for binary:logistic differs between XGBoost releases.
    params['eval_metric'] = 'error'
    # Up to 10000 rounds, 3 folds, early stopping patience of 50 rounds.
    cvOUT = crossvalidateLearningRate(X_train, y_train, 10000, params, 3, 50)
    testErrors = cvOUT['test-error-mean']
    errorSTDs = cvOUT['test-error-std']
    # The length of the returned history is used as the number of estimators.
    numIterations = len(testErrors)
    finalError = testErrors[-1]
    finalSTD = errorSTDs[-1]
    errorArray.append(finalError)
    nEstArray.append(numIterations)
    t2 = time.time()
    print("Learning rate = %s" % rate)
    print("Cross-validation time = %s" % (t2-t1))
    print("Number of iterations = %s" % numIterations)
    print("Test Error = %s +/- %s" % (finalError, finalSTD))
    print()

# Set optimal rate and number of estimators (lowest mean CV test error wins)
ind = np.argmin(errorArray)
optimal_rate = rates[ind]
optimal_n = nEstArray[ind]

# Train final model with the selected rate / round count and the tuned params
finalModel = xgboost.XGBClassifier(learning_rate = optimal_rate,
                                   n_estimators = optimal_n,
                                   **tuned_params)
finalModel.fit(X_train, y_train)

# Predict test labels and save as a two-column file: 1-based Id, Prediction
y_label = finalModel.predict(X_test)
y_label = y_label.reshape((len(y_label),1))
result_col_1 = np.arange(1, len(y_label) + 1).reshape((len(y_label),1))
result = np.concatenate((result_col_1,y_label), axis = 1)
# comments='' stops np.savetxt from prefixing the header with '# ', so the
# first line of the file is exactly 'Id,Prediction'.
np.savetxt("XGBoost_pred_labels.txt", result, fmt="%d", delimiter=',', header='Id,Prediction', comments='')

Learning rate = 0.3
Cross-validation time = 78.36275434494019
Number of iterations = 122
Test Error = 0.15399966666666667 +/- 0.0017764808533227178

Learning rate = 0.1
Cross-validation time = 142.0630099773407
Number of iterations = 260
Test Error = 0.153 +/- 0.0024515970305088865

Learning rate = 0.03
Cross-validation time = 455.487779378891
Number of iterations = 951
Test Error = 0.15155 +/- 0.0027175931753422255

Learning rate = 0.01
Cross-validation time = 810.1910254955292
Number of iterations = 1741
Test Error = 0.15714999999999998 +/- 0.0009138533799248164



NameError: name 'y_label' is not defined

In [74]:
# Show finalModel's repr to check the learning rate and n_estimators it was built with.
finalModel

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.83, gamma=0, learning_rate=0.03,
       max_delta_step=0, max_depth=10, min_child_weight=6, missing=None,
       n_estimators=951, n_jobs=4, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=0.01, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.88)

In [75]:
# Predict test labels and write them as a two-column file: 1-based Id, Prediction.
y_label = finalModel.predict(X_test)
y_label = y_label.reshape((len(y_label),1))
result_col_1 = np.arange(1, len(y_label) + 1).reshape((len(y_label),1))
result = np.concatenate((result_col_1,y_label), axis = 1)
# comments='' stops np.savetxt from prefixing the header with '# ', so the
# first line of the file is exactly 'Id,Prediction'.
np.savetxt("XGBoost_pred_labels.txt", result, fmt="%d", delimiter=',', header='Id,Prediction', comments='')