# Hyperparameter Optimization with RandomizedSearchCV and Pipeline

## Setup

In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import ensemble
from sklearn import pipeline
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import model_selection

## Data read

In [2]:
df = pd.read_csv('../data/raw/train.csv')

In [3]:
X = df.drop(columns = ['price_range']).values.copy()
y = df.price_range.values

## Random forest classifier

In [4]:
scl = preprocessing.StandardScaler()
pca = decomposition.PCA()
rf = ensemble.RandomForestClassifier(n_jobs = -1)

classifier = pipeline.Pipeline(
    [
        ('scaling', scl),
        ('pca', pca),
        ('rf', rf)
    ]
)

parameters_dist = {
    'pca__n_components': np.arange(5, 10),
    'rf__n_estimators' : np.arange(100, 1500, 100),
    'rf__max_depth' : np.arange(1, 20),
    'rf__criterion' : ['gini', 'entropy']
}

In [5]:
model = model_selection.RandomizedSearchCV(
    estimator = classifier,
    param_distributions = parameters_dist,
    n_iter = 10,
    scoring = 'accuracy',
    verbose = 10,
    n_jobs = 1,
    cv = 5,
)

model.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100
[CV 1/5; 1/10] END pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100;, score=0.405 total time=   6.3s
[CV 2/5; 1/10] START pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100
[CV 2/5; 1/10] END pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100;, score=0.422 total time=   0.8s
[CV 3/5; 1/10] START pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100
[CV 3/5; 1/10] END pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100;, score=0.403 total time=   0.8s
[CV 4/5; 1/10] START pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100
[CV 4/5; 1/10] END pca__n_components=7, rf__criterion=entropy, rf__max_depth=4, rf__n_estimators=1100;, score=0.405 to

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('scaling', StandardScaler()),
                                             ('pca', PCA()),
                                             ('rf',
                                              RandomForestClassifier(n_jobs=-1))]),
                   n_jobs=1,
                   param_distributions={'pca__n_components': array([5, 6, 7, 8, 9]),
                                        'rf__criterion': ['gini', 'entropy'],
                                        'rf__max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'rf__n_estimators': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
       1200, 1300, 1400])},
                   scoring='accuracy', verbose=10)

In [6]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.45649999999999996
{'memory': None, 'steps': [('scaling', StandardScaler()), ('pca', PCA(n_components=9)), ('rf', RandomForestClassifier(max_depth=10, n_estimators=1000, n_jobs=-1))], 'verbose': False, 'scaling': StandardScaler(), 'pca': PCA(n_components=9), 'rf': RandomForestClassifier(max_depth=10, n_estimators=1000, n_jobs=-1), 'scaling__copy': True, 'scaling__with_mean': True, 'scaling__with_std': True, 'pca__copy': True, 'pca__iterated_power': 'auto', 'pca__n_components': 9, 'pca__random_state': None, 'pca__svd_solver': 'auto', 'pca__tol': 0.0, 'pca__whiten': False, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'gini', 'rf__max_depth': 10, 'rf__max_features': 'auto', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__min_weight_fraction_leaf': 0.0, 'rf__n_estimators': 1000, 'rf__n_jobs': -1, 'rf__oob_score': False, 'rf__random_state': None, 'rf