In [1]:
%matplotlib inline

In [2]:
import numpy as np
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, truncnorm, randint

In [3]:
# get data
digits = load_digits()  # Source : http://archive.ics.uci.edu/ml/datasets/Pen-Based+Recognition+of+Handwritten+Digits
X, y = digits.data, digits.target

In [4]:
#digits   # if you wish to check how data looks

In [5]:
# build a classifier
clf = RandomForestClassifier(n_estimators=50)

In [17]:
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3,4,5,6],
              "max_features": [1, 3,4,5,6,7,8,9,10,11,12,13,14,15],
              "min_samples_split": [2,3,4,5,6,7,8,9,10],
              "min_samples_leaf": [2,3,4,5,6,7,8,9,10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}



In [18]:
# run randomized search
samples = 10  # number of random samples 
randomCV = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=samples) #default cv = 5

In [20]:
randomCV.fit(X, y)

 
print(randomCV.best_params_)

{'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 3, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}


In [21]:
results = randomCV.cv_results_['mean_test_score']

In [22]:
print("Avg score %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Avg score 87.887% (1.963%)


In [None]:
# Randomsearch CV may give us a combination in the locality of the best combination. It is not guraranteed to give the best.
# Employ Gridsearch in the locality of the returned combination

In [23]:
# use a full grid over all parameters
param_grid = {"max_depth": [5,6,7],
              "max_features": [1, 3, 10],
              "min_samples_split": [2,3],
              "min_samples_leaf": [8,9,10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [24]:
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)

grid_search.fit(X, y)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=None,
                                              oob_score=False,
                                              ran

In [25]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 3,
 'min_samples_leaf': 9,
 'min_samples_split': 2}

In [26]:
results = grid_search.cv_results_['mean_test_score']

In [27]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=7, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=9, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [28]:
print("Avg score %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

Avg score 89.041% (1.702%)
