In [12]:
from sklearn import datasets

n = 5
x_train = [0]*n
x_test =[0]*n
y_train = [0]*n
y_test = [0]*n

for i in range(0,n):
    data = datasets.make_friedman1(n_samples = 100, n_features = 5, noise = 1)
    x_train[i] = data[0][0:50]
    x_test[i] = data[0][50:data[0].shape[0]]
    y_train[i] = data[1][0:50]
    y_test[i] = data[1][50:data[1].shape[0]]
    
for i in range(0,n):
    print(y_test[i])
    print('------------------------')

[ 7.45443878 18.20545887 14.29400891  9.65617145 22.26487029 11.47963127
 14.50143885 19.58439488 20.30382531 14.94661972 19.84599292  9.27833302
 19.96675609 14.63633634 16.17152365 20.02751325 14.70198061  8.88761479
 12.73228913 15.76980523 13.14219048 11.88601634  5.23995084 24.21183636
 13.26483603  8.43047168 17.35215613 15.26868895 18.88254206 18.33654641
 14.11605463 14.75640954 17.37602207 18.12867095 21.62500225 10.94069684
 13.87958255 17.14271621  7.21356191 15.28969153 18.38754072  9.40632669
 21.91981466 20.65502941 18.54783029 14.21785228 12.55765436 13.15362201
 23.10724182 19.31400173]
------------------------
[11.37693192 19.48713996  6.07536094 15.34019917 15.40141604 16.33636479
 13.03746092  6.1614286  15.85295718 17.1846833  17.76957281  6.33278556
 13.36600689 13.95482638 23.2925214  15.01873548 10.59640591 15.18732858
 11.73482133 19.21731511 15.7512567  10.30065695 16.51926427  5.88677761
 10.28390155 11.78253899  4.66173275 17.17681508 14.04245463 27.83254581


In [10]:
#compare grid and random search. search over all combinations. random forest

'''
note that, for random forest:
n_estimators = num.trees
min_samples_leaf = min.node.size
max_features = mtry
max_samples = sample.fraction
'''

import math
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import time

par_grid = {'n_estimators':[1,2,3,4,5], 
        'min_samples_leaf':np.linspace(0.1,0.5,5), 
        'max_features':np.linspace(0.1,0.5,5), 
        'max_samples':np.linspace(0.1,0.5,5)}

results_rs = pd.DataFrame(np.zeros((n,4)), columns = ['out-score', 'in-score', 'time', 'params'])
results_gs = pd.DataFrame(np.zeros((n,4)), columns = ['out-score', 'in-score', 'time', 'params'])

for i in range(0,n):
    gs = model_selection.GridSearchCV(estimator = RandomForestRegressor(), 
                                      param_grid = par_grid, 
                                      scoring = 'neg_root_mean_squared_error', 
                                      cv = 3)
    
    start = time.thread_time()
    gs.fit(x_train[i], y_train[i])
    stop = time.thread_time()
    
    results_gs.loc[i, 'params'] = str(gs.best_params_)
    results_gs.loc[i, 'in-score'] = gs.best_score_
    gs_pred_i = gs.best_estimator_.predict(x_test[i])
    results_gs.loc[i, 'out-score'] = math.sqrt(mean_squared_error(y_true = y_test, y_pred = gs_pred_i))
    results_gs.loc[i, 'time'] = stop - start

    rs = model_selection.RandomizedSearchCV(estimator = RandomForestRegressor(), 
                                        param_distributions = par_grid,
                                        n_iter = 5**4,
                                        scoring = 'neg_root_mean_squared_error',
                                        cv = 3)
    
    start = time.thread.time()
    rs.fit(x_train[i], y_train[i])
    stop = time.thread.time()
    
    results_rs.loc[i, 'params'] = str(rs.best_params_)
    results_rs.loc[i, 'in-score'] = rs.best_score_
    rs_pred_i = rs.best_estimator_.predict(x_test[i])
    results_rs.loc[i, 'out-score'] = math.sqrt(mean_squared_error(y_true = y_test, y_pred = rs_pred_i))
    results_rs.loc[i, 'time'] = stop - start
    
print(results_gs)
print(results_rs)

ValueError: Found input variables with inconsistent numbers of samples: [5, 50]

In [18]:
#compare grid and random search. search along only 1 parameter at a time. random forest.
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
import time

x = datasets.make_friedman1(n_samples = 50, n_features = 5, noise = 0)

#I decided to define the default parameters as somewhere in the middle of the entire space
def_pars = {'n_estimators':3, 'min_samples_leaf':0.3, 'max_samples':0.3, 'max_features':0.3}
best_pars_gs = def_pars
best_pars_rs = def_pars

param_grid = {'n_estimators':[1,2,3,4,5], 
        'min_samples_leaf':np.linspace(0.1,0.5,5), 
        'max_features':np.linspace(0.1,0.5,5), 
        'max_samples':np.linspace(0.1,0.5,5)}
space_sizes = {'n_estimators':5, 'min_samples_leaf':5, 'max_features':5, 'max_samples':5}

for par in param_grid:
    rf = RandomForestRegressor(n_estimators = def_pars['n_estimators'],
                                min_samples_leaf = def_pars['min_samples_leaf'],
                                max_samples = def_pars['max_samples'],
                                max_features = def_pars['max_features']
                               )
    one_par = {par:param_grid[par]}
    gs = model_selection.GridSearchCV(estimator = rf,
                                      param_grid = one_par,
                                      cv = 3,
                                      scoring = 'neg_mean_squared_error'
                                     )
    gs.fit(x[0], x[1])
    best_pars_gs[par] = gs.best_params_[par]
    rs = model_selection.RandomizedSearchCV(estimator = rf,
                                           param_distributions = one_par,
                                           cv = 3,
                                            n_iter = space_sizes[par],
                                           scoring = 'neg_mean_squared_error')
    rs.fit(x[0], x[1])
    best_pars_rs[par] = rs.best_params_[par]

print(best_pars_gs)
print(best_pars_rs)
    

{'n_estimators': 3, 'min_samples_leaf': 0.4, 'max_samples': 0.5, 'max_features': 0.30000000000000004}
{'n_estimators': 3, 'min_samples_leaf': 0.4, 'max_samples': 0.5, 'max_features': 0.30000000000000004}
