# Grid search for Random Forest
Based on the scikit-learn example on randomized and grid search for hyperparameter estimation: http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html

In [21]:
print(__doc__)


import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn import cross_validation

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

Automatically created module for IPython interactive environment


In [22]:
## Get data
# Read the table from disk, then pull out the ten 'SP1'..'SP10' columns as the
# feature matrix X and the 'Group' column as the label vector y.
train_gut = pd.read_csv('train_valid.csv')
feature_columns = ['SP{}'.format(k) for k in range(1, 11)]
X = train_gut[feature_columns].values
y = train_gut['Group'].values

In [23]:
## Build classifier
# Random forest used as the estimator passed to RandomizedSearchCV below.
rf = RandomForestClassifier(
    max_depth=None,
    n_estimators=50,
    # "sqrt" is what "auto" meant for classifiers; "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3.
    max_features="sqrt",
    min_samples_split=4,
    min_samples_leaf=4,
    # Fixed seed so repeated runs build the same forests.
    random_state=42,
)

In [24]:
## Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    For every rank from 1 to ``n_top`` (inclusive), each candidate holding
    that rank is printed as four lines: its rank, its mean and standard
    deviation of the validation score, its parameters, and a blank line.
    Ties are all printed; ranks held by no candidate print nothing.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate
        position. 'rank_test_score' is compared element-wise with an
        integer, so it is expected to be a NumPy array (as in the
        ``cv_results_`` attribute this function is called with).
    n_top : int, default 3
        Highest rank to report.
    """
    rank_line = "Model with rank: {0}"
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    params_line = "Parameters: {0}"

    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds exactly this rank.
        matches = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in matches:
            print(rank_line.format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print(params_line.format(results['params'][idx]))
            print("")

# specify parameters and distributions to sample from
# sp_randint(low, high) draws integers from the half-open range [low, high).
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              # Lower bound is 2, not 1: RandomForestClassifier rejects an
              # integer min_samples_split below 2.
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [33]:
# Search space for the randomized search: plain lists enumerate the allowed
# values, sp_randint(low, high) is an integer distribution over [low, high).
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(1, 11),
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

In [34]:
# run randomized search
# n_iter_search candidate settings are drawn from param_dist and each is
# cross-validated on (X, y); the elapsed wall-clock time and the top-ranked
# candidates are then printed.
n_iter_search = 20
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   # Fixed seed so the same candidate
                                   # settings are sampled on every run.
                                   random_state=42)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 13.96 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.816 (std: 0.040)
Parameters: {'bootstrap': False, 'min_samples_leaf': 4, 'min_samples_split': 6, 'criterion': 'gini', 'max_features': 1, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.789 (std: 0.041)
Parameters: {'bootstrap': False, 'min_samples_leaf': 2, 'min_samples_split': 7, 'criterion': 'entropy', 'max_features': 1, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.772 (std: 0.030)
Parameters: {'bootstrap': True, 'min_samples_leaf': 5, 'min_samples_split': 6, 'criterion': 'entropy', 'max_features': 10, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.772 (std: 0.036)
Parameters: {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 6, 'criterion': 'entropy', 'max_features': 9, 'max_depth': 3}

Model with rank: 3
Mean validation score: 0.772 (std: 0.009)
Parameters: {'bootstrap': True, 'min_samples_leaf': 4,