# Hyperparameter Optimization with RandomizedSearchCV

## Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import ensemble
from sklearn import model_selection

## Data read

In [2]:
df = pd.read_csv('../data/raw/train.csv')

In [3]:
X = df.drop(columns = ['price_range']).values.copy()
y = df.price_range.values

## Random forest classifier

In [4]:
classifier = ensemble.RandomForestClassifier(n_jobs = -1)
parameters_dist = {
    'n_estimators' : np.arange(100, 1500, 100),
    'max_depth' : np.arange(1, 20),
    'criterion' : ['gini', 'entropy']
}

In [5]:
model = model_selection.RandomizedSearchCV(
    estimator = classifier,
    param_distributions = parameters_dist,
    n_iter = 10,
    scoring = 'accuracy',
    verbose = 10,
    n_jobs = 1,
    cv = 5,
)

model.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=8, n_estimators=1300.............
[CV 1/5; 1/10] END criterion=gini, max_depth=8, n_estimators=1300;, score=0.873 total time=   5.3s
[CV 2/5; 1/10] START criterion=gini, max_depth=8, n_estimators=1300.............
[CV 2/5; 1/10] END criterion=gini, max_depth=8, n_estimators=1300;, score=0.880 total time=   0.8s
[CV 3/5; 1/10] START criterion=gini, max_depth=8, n_estimators=1300.............
[CV 3/5; 1/10] END criterion=gini, max_depth=8, n_estimators=1300;, score=0.892 total time=   0.8s
[CV 4/5; 1/10] START criterion=gini, max_depth=8, n_estimators=1300.............
[CV 4/5; 1/10] END criterion=gini, max_depth=8, n_estimators=1300;, score=0.860 total time=   0.8s
[CV 5/5; 1/10] START criterion=gini, max_depth=8, n_estimators=1300.............
[CV 5/5; 1/10] END criterion=gini, max_depth=8, n_estimators=1300;, score=0.858 total time=   0.8s
[CV 1/5; 2/10] START criterion=entropy,

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'n_estimators': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
       1200, 1300, 1400])},
                   scoring='accuracy', verbose=10)

In [6]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.8879999999999999
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 11, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 600, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
