# Gradient Boosting - Parameter Tuning

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV  #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline




In [3]:
training = pd.read_csv('../test/training-person2.csv').fillna(0)
test = pd.read_csv('../test/test-person2.csv').fillna(0)
sumbit = test['person'].to_frame()

In [4]:
RANDOM_SEED = 12
TEST_SIZE_PERCENT = 0.2

In [5]:
y = training['label']
X = training.drop(axis=1, labels=['label'])

In [15]:
#Choose all predictors except target & IDcols
predictors = [x for x in training.columns if x not in ['person', 'label']]

param_test1 = {'n_estimators': [x for x in range(20,81,10)]}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=RANDOM_SEED), 
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X, y)

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.85141, std: 0.01167, params: {'n_estimators': 20},
  mean: 0.85290, std: 0.01169, params: {'n_estimators': 30},
  mean: 0.85308, std: 0.01198, params: {'n_estimators': 40},
  mean: 0.85377, std: 0.01207, params: {'n_estimators': 50},
  mean: 0.85361, std: 0.01224, params: {'n_estimators': 60},
  mean: 0.85398, std: 0.01230, params: {'n_estimators': 70},
  mean: 0.85372, std: 0.01190, params: {'n_estimators': 80}],
 {'n_estimators': 70},
 0.8539784113113071)

In [6]:
param_test2 = {'max_depth':[x for x in range(5,16,2)], 'min_samples_split':[x for x in range(200,1001,200)]}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=70, max_features='sqrt', subsample=0.8, random_state=RANDOM_SEED), 
param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(X, y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_



([mean: 0.85437, std: 0.01242, params: {'max_depth': 5, 'min_samples_split': 200},
  mean: 0.85447, std: 0.01123, params: {'max_depth': 5, 'min_samples_split': 400},
  mean: 0.85499, std: 0.01123, params: {'max_depth': 5, 'min_samples_split': 600},
  mean: 0.85533, std: 0.01162, params: {'max_depth': 5, 'min_samples_split': 800},
  mean: 0.85304, std: 0.01255, params: {'max_depth': 5, 'min_samples_split': 1000},
  mean: 0.85398, std: 0.01444, params: {'max_depth': 7, 'min_samples_split': 200},
  mean: 0.85236, std: 0.01361, params: {'max_depth': 7, 'min_samples_split': 400},
  mean: 0.85454, std: 0.01253, params: {'max_depth': 7, 'min_samples_split': 600},
  mean: 0.85363, std: 0.01281, params: {'max_depth': 7, 'min_samples_split': 800},
  mean: 0.85427, std: 0.01127, params: {'max_depth': 7, 'min_samples_split': 1000},
  mean: 0.85072, std: 0.01158, params: {'max_depth': 9, 'min_samples_split': 200},
  mean: 0.85172, std: 0.01099, params: {'max_depth': 9, 'min_samples_split': 400},
  

In [7]:
param_test3 = {'min_samples_split':[x for x in range(700,900,50)], 'min_samples_leaf':[x for x in range(30,71,10)]}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=70,max_depth=5,max_features='sqrt', subsample=0.8, random_state=RANDOM_SEED), 
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(X, y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: 0.85531, std: 0.01164, params: {'min_samples_leaf': 30, 'min_samples_split': 700},
  mean: 0.85556, std: 0.01200, params: {'min_samples_leaf': 30, 'min_samples_split': 750},
  mean: 0.85526, std: 0.01269, params: {'min_samples_leaf': 30, 'min_samples_split': 800},
  mean: 0.85558, std: 0.01297, params: {'min_samples_leaf': 30, 'min_samples_split': 850},
  mean: 0.85514, std: 0.01170, params: {'min_samples_leaf': 40, 'min_samples_split': 700},
  mean: 0.85491, std: 0.01337, params: {'min_samples_leaf': 40, 'min_samples_split': 750},
  mean: 0.85500, std: 0.01309, params: {'min_samples_leaf': 40, 'min_samples_split': 800},
  mean: 0.85612, std: 0.01196, params: {'min_samples_leaf': 40, 'min_samples_split': 850},
  mean: 0.85553, std: 0.01245, params: {'min_samples_leaf': 50, 'min_samples_split': 700},
  mean: 0.85517, std: 0.01321, params: {'min_samples_leaf': 50, 'min_samples_split': 750},
  mean: 0.85492, std: 0.01338, params: {'min_samples_leaf': 50, 'min_samples_split': 800},

In [9]:
param_test4 = {'max_features':[x for x in range(3,7,1)]}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=70,max_depth=5, min_samples_split=750, min_samples_leaf=70, subsample=0.8, random_state=RANDOM_SEED),
param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(X, y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.85262, std: 0.01300, params: {'max_features': 3},
  mean: 0.85470, std: 0.01336, params: {'max_features': 4},
  mean: 0.85684, std: 0.01325, params: {'max_features': 5},
  mean: 0.85561, std: 0.01284, params: {'max_features': 6}],
 {'max_features': 5},
 0.856836282134603)

In [11]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.05, n_estimators=70,max_depth=5,min_samples_split=750, min_samples_leaf=70, subsample=0.8, random_state=RANDOM_SEED,max_features=5),
param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch5.fit(X, y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_



([mean: 0.85524, std: 0.01180, params: {'subsample': 0.6},
  mean: 0.85467, std: 0.01254, params: {'subsample': 0.7},
  mean: 0.85427, std: 0.01358, params: {'subsample': 0.75},
  mean: 0.85684, std: 0.01325, params: {'subsample': 0.8},
  mean: 0.85444, std: 0.01355, params: {'subsample': 0.85},
  mean: 0.85604, std: 0.01177, params: {'subsample': 0.9}],
 {'subsample': 0.8},
 0.856836282134603)

In [None]:
# Predict value kaggle set
ret = gsearch4.predict_proba(test.drop(axis=1, labels=['person']))

In [None]:
# Get proba
sumbit['label'] = pd.DataFrame(ret)[1]