# Baseline Algorithms for Forest Cover Type Predictions


Data source: https://www.kaggle.com/c/forest-cover-type-prediction

In [1]:
import numpy as np
import pandas as pd
import pickle

import seaborn as sns
import xgboost as xgb
import feature_eng_function as f_eng
from datetime import datetime
from matplotlib import pyplot as plt
from PCA_function import pca_data100
from IPython.core.display import display, HTML
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))



## Load Data and create base data set

In [2]:
forest = pd.read_csv("data/train.csv", index_col=0)
forest.head()

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
2,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
3,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
4,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
5,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [3]:
data = forest.values
X_kaggle = data[:,0:(-1)]
y_train = data[:,-1]
X_kaggle.shape, y_train.shape

((15120, 54), (15120,))

## Feature Engineering

In [4]:
forest_eng = pd.read_csv("data/train_eng.csv", index_col=0)

In [5]:
forest_eng.head()

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
2,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
3,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
4,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
5,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [6]:
data = forest_eng.values
X_base = data[:,0:(-1)]
y_train = data[:,-1]
X_base.shape, y_train.shape

((15120, 95), (15120,))

## Import the top 100 feature set data for testing

In [7]:
forest_100 = pd.read_csv("data/train_100.csv", index_col=0)
forest_100.head()

Unnamed: 0_level_0,Elevation,Horizontal_Distance_To_Hydrology_Hillshade_3pm,Horizontal_Distance_To_Roadways_Horizontal_Distance_To_Fire_Points,Horizontal_Distance_To_Roadways_Aspect2,Hillshade_9am_Hillshade_Noon,Elevation_Elevation_3100_8000,Elevation_3100_8000_Elevation,Horizontal_Distance_To_Fire_Points_Horizontal_Distance_To_Roadways,Slope_Horizontal_Distance_To_Roadways,Slope_Horizontal_Distance_To_Fire_Points,...,Soil_Type39_Hillshade_9am,Wilderness_Area4_Elevation,Very Stony_Elevation,Wilderness_Area4_Hillshade_Noon,Outcrop_Horizontal_Distance_To_Roadways,Aspect2_Vanet,Rock_Elevation,Soil_Type32_Horizontal_Distance_To_Roadways,Horizontal_Distance_To_Roadways_Wilderness_Area4,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,38184,3202290,26010,51272,0,0,3202290,1530,18837,...,0,0,0.0,0,510.0,0.0,2596.0,0,0,5
2,2590,32012,2427750,21840,51700,0,0,2427750,780,12450,...,0,0,0.0,0,390.0,0.0,2590.0,0,0,5
3,2804,36180,19464780,442020,55692,0,0,19464780,28620,55089,...,0,0,0.0,0,0.0,0.0,0.0,0,0,2
4,2785,29524,19191990,478950,56644,0,0,19191990,55620,111798,...,0,0,0.0,0,0.0,0.0,0.0,0,0,2
5,2595,22950,2413252,17595,51480,0,0,2413252,782,12344,...,0,0,0.0,0,391.0,0.0,2595.0,0,0,5


In [8]:
data = forest_100.values
X_100 = data[:,0:(-1)]
y_train = data[:,-1]
X_100.shape, y_train.shape

((15120, 100), (15120,))

## Run XGBoost in Grid Search across a broad range of parameters for each data set

In [9]:
models = []

In [10]:
X_kaggle_train, X_kaggle_test, y_kaggle_train, y_kaggle_test = train_test_split(X_kaggle, y_train)
optimized_kaggle_default_GBM = xgb.XGBClassifier(seed=0,objective='multi:softmax')
optimized_kaggle_default_GBM.fit(X_kaggle_train, y_kaggle_train, early_stopping_rounds=50, eval_metric="merror",
    eval_set=[(X_kaggle_test, y_kaggle_test)])

[0]	validation_0-merror:0.385185
Will train until validation_0-merror hasn't improved in 50 rounds.
[1]	validation_0-merror:0.34418
[2]	validation_0-merror:0.344974
[3]	validation_0-merror:0.340212
[4]	validation_0-merror:0.340741
[5]	validation_0-merror:0.337831
[6]	validation_0-merror:0.337037
[7]	validation_0-merror:0.336243
[8]	validation_0-merror:0.337302
[9]	validation_0-merror:0.330423
[10]	validation_0-merror:0.328836
[11]	validation_0-merror:0.324603
[12]	validation_0-merror:0.32328
[13]	validation_0-merror:0.318519
[14]	validation_0-merror:0.313492
[15]	validation_0-merror:0.315608
[16]	validation_0-merror:0.314286
[17]	validation_0-merror:0.310847
[18]	validation_0-merror:0.311111
[19]	validation_0-merror:0.310582
[20]	validation_0-merror:0.308201
[21]	validation_0-merror:0.303175
[22]	validation_0-merror:0.302116
[23]	validation_0-merror:0.301852
[24]	validation_0-merror:0.301852
[25]	validation_0-merror:0.299471
[26]	validation_0-merror:0.298148
[27]	validation_0-merror:0.

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [11]:
optimized_kaggle_default_GBM_pred = optimized_kaggle_default_GBM.predict(X_kaggle_test)
optimized_kaggle_default_GBM_accuracy = accuracy_score(y_kaggle_test, optimized_kaggle_default_GBM_pred);
print ('accuracy:%0.2f%%'%(optimized_kaggle_default_GBM_accuracy*100))

accuracy:75.53%


In [61]:
models.append(('optimized_kaggle_default_GBM',optimized_kaggle_default_GBM))
models

[('optimized_kaggle_default_GBM',
  XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
         gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
         min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
         objective='multi:softprob', reg_alpha=0, reg_lambda=1,
         scale_pos_weight=1, seed=0, silent=True, subsample=1))]

In [16]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_kaggle_cv5_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5)
optimized_kaggle_cv5_GBM.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 5317 sec


In [62]:
models.append(('optimized_kaggle_cv5_GBM',optimized_kaggle_cv5_GBM))

In [18]:
optimized_kaggle_cv5_GBM.grid_scores_



[mean: 0.69061, std: 0.03408, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.69061, std: 0.03355, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.68909, std: 0.03482, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.75099, std: 0.03659, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.75218, std: 0.03708, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.75073, std: 0.03496, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.77864, std: 0.03479, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.77447, std: 0.03964, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.77282, std: 0.03644, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.67870, std: 0.03251, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [19]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_kaggle_cv10_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 10, n_jobs = -1)
optimized_kaggle_cv10_GBM.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 5043 sec


In [63]:
models.append(('optimized_kaggle_cv10_GBM',optimized_kaggle_cv10_GBM))

In [21]:
optimized_kaggle_cv10_GBM.grid_scores_



[mean: 0.69319, std: 0.03904, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.69272, std: 0.03909, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.69153, std: 0.04162, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.75198, std: 0.04160, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.75040, std: 0.04303, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.74570, std: 0.04296, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.77778, std: 0.03824, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.77374, std: 0.04127, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.76971, std: 0.04470, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.68624, std: 0.03856, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [22]:
X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_base, y_train)
optimized_base_default_GBM = xgb.XGBClassifier(seed=0,objective='multi:softmax')
optimized_base_default_GBM.fit(X_base_train, y_base_train, eval_metric="merror",
    eval_set=[(X_base_test, y_base_test)])

[0]	validation_0-merror:0.370106
[1]	validation_0-merror:0.32963
[2]	validation_0-merror:0.325661
[3]	validation_0-merror:0.330423
[4]	validation_0-merror:0.323016
[5]	validation_0-merror:0.318519
[6]	validation_0-merror:0.320635
[7]	validation_0-merror:0.314021
[8]	validation_0-merror:0.312169
[9]	validation_0-merror:0.311376
[10]	validation_0-merror:0.306085
[11]	validation_0-merror:0.305556
[12]	validation_0-merror:0.303704
[13]	validation_0-merror:0.302646
[14]	validation_0-merror:0.298413
[15]	validation_0-merror:0.302646
[16]	validation_0-merror:0.300794
[17]	validation_0-merror:0.299735
[18]	validation_0-merror:0.29709
[19]	validation_0-merror:0.297354
[20]	validation_0-merror:0.296825
[21]	validation_0-merror:0.295767
[22]	validation_0-merror:0.293651
[23]	validation_0-merror:0.292593
[24]	validation_0-merror:0.294709
[25]	validation_0-merror:0.292064
[26]	validation_0-merror:0.29127
[27]	validation_0-merror:0.291005
[28]	validation_0-merror:0.289418
[29]	validation_0-merror:0.

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [24]:
optimized_base_default_GBM_pred = optimized_base_default_GBM.predict(X_base_test)
optimized_base_default_GBM_accuracy = accuracy_score(y_base_test, optimized_base_default_GBM_pred);
print ('accuracy:%0.2f%%'%(optimized_base_default_GBM_accuracy*100))

accuracy:75.69%


In [64]:
models.append(('optimized_base_default_GBM',optimized_base_default_GBM))

In [26]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_base_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_base_GBM.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 3761 sec


In [65]:
models.append(('optimized_base_cv5_GBM',optimized_base_GBM))

In [27]:
optimized_base_GBM.grid_scores_



[mean: 0.69034, std: 0.03449, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.69180, std: 0.03370, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.69220, std: 0.03422, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.75516, std: 0.03795, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.75344, std: 0.03834, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.74934, std: 0.03857, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.78095, std: 0.03316, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.77573, std: 0.03491, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.77698, std: 0.03563, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.68439, std: 0.03199, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [28]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_base_cv10_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 10, n_jobs = -1)
optimized_base_cv10_GBM.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 8646 sec


In [66]:
models.append(('optimized_base_cv10_GBM',optimized_base_cv10_GBM))

In [30]:
optimized_base_cv10_GBM.grid_scores_



[mean: 0.69729, std: 0.03657, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.69835, std: 0.03644, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.69683, std: 0.04253, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.75357, std: 0.04245, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.75357, std: 0.04104, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.75298, std: 0.04336, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.77844, std: 0.04083, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.77606, std: 0.03856, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.77507, std: 0.03946, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.68909, std: 0.03420, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [31]:
X_100_train, X_100_test, y_100_train, y_100_test = train_test_split(X_100, y_train)
optimized_100_default_GBM = xgb.XGBClassifier(seed=0,objective='multi:softmax')
optimized_100_default_GBM.fit(X_100_train, y_100_train, early_stopping_rounds=50, eval_metric="merror",
    eval_set=[(X_100_test, y_100_test)])

optimized_100_default_GBM_pred = optimized_100_default_GBM.predict(X_100_test)
optimized_100_default_GBM_accuracy = accuracy_score(y_100_test, optimized_100_default_GBM_pred);
print ('accuracy:%0.2f%%'%(optimized_100_default_GBM_accuracy*100))

[0]	validation_0-merror:0.320106
Will train until validation_0-merror hasn't improved in 50 rounds.
[1]	validation_0-merror:0.317196
[2]	validation_0-merror:0.321958
[3]	validation_0-merror:0.322487
[4]	validation_0-merror:0.313492
[5]	validation_0-merror:0.306085
[6]	validation_0-merror:0.305556
[7]	validation_0-merror:0.297619
[8]	validation_0-merror:0.300529
[9]	validation_0-merror:0.297354
[10]	validation_0-merror:0.293651
[11]	validation_0-merror:0.292593
[12]	validation_0-merror:0.29127
[13]	validation_0-merror:0.291005
[14]	validation_0-merror:0.288889
[15]	validation_0-merror:0.286243
[16]	validation_0-merror:0.281746
[17]	validation_0-merror:0.281217
[18]	validation_0-merror:0.279101
[19]	validation_0-merror:0.275926
[20]	validation_0-merror:0.275397
[21]	validation_0-merror:0.278042
[22]	validation_0-merror:0.27328
[23]	validation_0-merror:0.274074
[24]	validation_0-merror:0.272222
[25]	validation_0-merror:0.270899
[26]	validation_0-merror:0.270106
[27]	validation_0-merror:0.

In [67]:
models.append(('optimized_100_default_GBM',optimized_100_default_GBM))

In [33]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_100_cv5_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_100_cv5_GBM.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 5546 sec


In [68]:
models.append(('optimized_100_cv5_GBM',optimized_100_cv5_GBM))

In [35]:
optimized_100_cv5_GBM.grid_scores_



[mean: 0.70060, std: 0.03620, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.70132, std: 0.03721, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.70033, std: 0.03655, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.77500, std: 0.03620, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.77295, std: 0.03565, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.76951, std: 0.03670, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.79788, std: 0.03663, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.79239, std: 0.03677, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.79187, std: 0.03674, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.69008, std: 0.03703, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [36]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_100_cv10_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 10, n_jobs = -1)
optimized_100_cv10_GBM.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 10858 sec


In [69]:
models.append(('optimized_100_cv10_GBM',optimized_100_cv10_GBM))

In [39]:
optimized_100_cv10_GBM.grid_scores_



[mean: 0.70403, std: 0.04440, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.70489, std: 0.04445, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.70403, std: 0.04522, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.77269, std: 0.04235, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.76766, std: 0.04451, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.76739, std: 0.04289, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.79735, std: 0.03831, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.79220, std: 0.04193, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.78829, std: 0.04127, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.69319, std: 0.04769, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

## Fine Tune the best hyperparameters for the best model in the broad search

In [40]:
############ NEEDS TO BE MODIFIED ONCE WE FIGURE OUT THE BEST ONE ####################
'''t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_kaggle_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5)#, n_jobs = -1)
optimized_kaggle_GBM.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)'''

't1 = datetime.now()\ncv_params = {\'max_depth\': [1,3,5], \'min_child_weight\': [1,3,5], \'learning_rate\': [0.15, 0.1, 0.075, 0.05]}\nind_params = {\'n_estimators\': 300, \'seed\':0, \'subsample\': 0.8, \'colsample_bytree\': 0.8, \n             \'objective\': \'multi:softmax\'}\noptimized_kaggle_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = \'accuracy\', cv = 5)#, n_jobs = -1)\noptimized_kaggle_GBM.fit(X_kaggle, y_train)\nt2 = datetime.now()\nprint ("Total time: %i sec" %(t2-t1).seconds)'

## Random Forest Ensembles in Grid Search

In [41]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_kaggle_cv5_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=5, n_jobs=-1)
optimized_kaggle_cv5_RF.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 720 sec


In [70]:
models.append(('optimized_kaggle_cv5_RF',optimized_kaggle_cv5_RF))

In [43]:
optimized_kaggle_cv5_RF.grid_scores_



[mean: 0.45231, std: 0.01502, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.45549, std: 0.01143, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.47910, std: 0.01698, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.46852, std: 0.03014, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.46786, std: 0.03025, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.48776, std: 0.02424, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.48942, std: 0.02423, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 800},
 mean: 0.48823, std: 0.02366, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 1000},
 mean: 0.47454, std: 0.0

In [44]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_kaggle_cv10_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=10, n_jobs=-1)
optimized_kaggle_cv10_RF.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 1476 sec


In [71]:
models.append(('optimized_kaggle_cv10_RF',optimized_kaggle_cv10_RF))

In [46]:
optimized_kaggle_cv10_RF.grid_scores_



[mean: 0.44974, std: 0.04753, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.46561, std: 0.05157, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.48426, std: 0.04744, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.48228, std: 0.05189, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.48307, std: 0.05616, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.48406, std: 0.05590, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.48697, std: 0.05471, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 800},
 mean: 0.48724, std: 0.05301, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 1000},
 mean: 0.47930, std: 0.0

In [47]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_base_cv5_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=5, n_jobs=-1)
optimized_base_cv5_RF.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 752 sec


In [72]:
models.append(('optimized_base_cv5_RF',optimized_base_cv5_RF))

In [50]:
optimized_base_cv5_RF.grid_scores_



[mean: 0.49914, std: 0.02765, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.47791, std: 0.02382, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.47394, std: 0.02268, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.47235, std: 0.02107, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.47884, std: 0.02503, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.48247, std: 0.02761, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.48889, std: 0.03671, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 800},
 mean: 0.49253, std: 0.03487, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 1000},
 mean: 0.51938, std: 0.0

In [53]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_base_cv10_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=10, n_jobs=-1)
optimized_base_cv10_RF.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 2005 sec


In [73]:
models.append(('optimized_base_cv10_RF',optimized_base_cv10_RF))

In [55]:
optimized_base_cv10_RF.grid_scores_



[mean: 0.47903, std: 0.04816, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.45615, std: 0.03975, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.45417, std: 0.04226, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.45622, std: 0.04208, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.46541, std: 0.04609, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.46329, std: 0.04603, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.47784, std: 0.05587, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 800},
 mean: 0.47745, std: 0.04982, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 1000},
 mean: 0.53188, std: 0.0

In [56]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_100_cv5_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=5, n_jobs=-1)
optimized_100_cv5_RF.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 1761 sec


In [74]:
models.append(('optimized_100_cv5_RF',optimized_100_cv5_RF))

In [58]:
optimized_100_cv5_RF.grid_scores_



[mean: 0.44028, std: 0.01134, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.43836, std: 0.01526, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.44306, std: 0.01158, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.44292, std: 0.01184, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.44597, std: 0.01461, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.44980, std: 0.01564, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.44332, std: 0.01112, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 800},
 mean: 0.44306, std: 0.01108, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 1000},
 mean: 0.48505, std: 0.0

In [10]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_100_cv10_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=10, n_jobs=-1)
optimized_100_cv10_RF.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 3541 sec


In [11]:
models.append(('optimized_100_cv10_RF',optimized_100_cv10_RF))

In [None]:
optimized_100_cv10_RF.grid_scores_



[mean: 0.42778, std: 0.01130, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.42976, std: 0.01405, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.43419, std: 0.01498, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.43439, std: 0.01484, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.43611, std: 0.01576, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.43591, std: 0.01511, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.43552, std: 0.01488, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 800},
 mean: 0.43565, std: 0.01494, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 1000},
 mean: 0.46356, std: 0.0

## Extra Trees Classifier
##### NOTE: These models were run, but I accidentally cleared the output, so you cannot see them here. You can see the results in the Testing Trees notebook.

In [None]:
t1 = datetime.now()
hyperparameters = {'min_samples_split': [2,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700],
    'max_features': ['sqrt', 'log2']
}
optimized_kaggle_cv10_ET = GridSearchCV(ExtraTreesClassifier(random_state=0), hyperparameters, cv=10, n_jobs=-1)
optimized_kaggle_cv10_ET.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

In [None]:
models.append(('optimized_kaggle_cv10_ET',optimized_kaggle_cv10_ET))

In [None]:
optimized_kaggle_cv10_ET.grid_scores_

In [None]:
t1 = datetime.now()
hyperparameters = {'min_samples_split': [2,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700],
    'max_features': ['sqrt', 'log2']
}
optimized_base_cv10_ET = GridSearchCV(ExtraTreesClassifier(random_state=0), hyperparameters, cv=10, n_jobs=-1)
optimized_base_cv10_ET.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

In [None]:
models.append(('optimized_base_cv10_ET',optimized_base_cv10_ET))

In [None]:
optimized_base_cv10_ET.grid_scores_

In [None]:
t1 = datetime.now()
hyperparameters = {'min_samples_split': [2,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700],
    'max_features': ['sqrt', 'log2']
}
optimized_100_cv10_ET = GridSearchCV(ExtraTreesClassifier(random_state=0), hyperparameters, cv=10, n_jobs=-1)
optimized_100_cv10_ET.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

In [None]:
models.append(('optimized_100_cv10_ET',optimized_100_cv10_ET))

In [None]:
optimized_100_cv10_ET.grid_scores_

## Save model fits to pickle so I can stop using all of my computer's memory and do some work later

In [None]:
for i in range(len(models)):
    name = str('pickles/' + models[i][0] + '.p')
    with open(name, 'wb') as f:
        pickle.dump(models[i][1], f)
    f.close()
    