# VI Fine-Tune your model

This notebook assumes, that notebook 4_prepare_California_housing.ipynb (Version from 9th March 6:39pm) has been successfully executed.

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

In [2]:
import os
result_path = '../results'
get_path = lambda fn: os.path.join(result_path, fn)
get_path('X_train.csv')

'../results/X_train.csv'

In [3]:
X_train = pd.read_csv(get_path('X_train.csv'), index_col=0)
y_train = pd.read_csv(get_path('y_train.csv'), index_col=0)
X_train.shape, y_train.shape

((16512, 16), (16512, 1))

## VI.1 Fine-tune the hyperparameters

$f (x, y) = g(x) + h(y) \approx g(x)$
![Image](https://cdn-images-1.medium.com/max/800/1*ZTlQm_WRcrNqL-nLnx6GJA.png)
http://jmlr.csail.mit.edu/papers/volume13/bergstra12a/bergstra12a.pdf


### Grid Search

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
             ]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=10, scoring='neg_mean_squared_error')

grid_search.fit(X_train, np.ravel(y_train))

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [5]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [6]:
np.sqrt(-grid_search.best_score_)

49315.571583673074

In [7]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63322.7233402 {'max_features': 2, 'n_estimators': 3}
54558.2694604 {'max_features': 2, 'n_estimators': 10}
52035.863399 {'max_features': 2, 'n_estimators': 30}
59656.7575646 {'max_features': 4, 'n_estimators': 3}
51833.8923192 {'max_features': 4, 'n_estimators': 10}
49887.6231709 {'max_features': 4, 'n_estimators': 30}
58584.3210863 {'max_features': 6, 'n_estimators': 3}
51719.9867783 {'max_features': 6, 'n_estimators': 10}
49315.5715837 {'max_features': 6, 'n_estimators': 30}
57452.0674091 {'max_features': 8, 'n_estimators': 3}
51162.4695606 {'max_features': 8, 'n_estimators': 10}
49387.0753722 {'max_features': 8, 'n_estimators': 30}
61956.5010195 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53629.9910617 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
58866.2477254 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52001.5140996 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
58373.3745911 {'bootstrap': False, 'max_features': 4, 'n_estima

In [8]:
importance = pd.DataFrame({'importance': grid_search.best_estimator_.feature_importances_},
                          index = X_train.columns)
importance.sort_values(ascending=False, by='importance')

Unnamed: 0,importance
num_pipeline__median_income,0.35517
cat_pipeline__INLAND,0.166597
num_pipeline__population_per_household,0.108421
num_pipeline__longitude,0.067511
num_pipeline__latitude,0.062512
num_pipeline__bedrooms_per_room,0.061955
num_pipeline__rooms_per_household,0.050914
num_pipeline__housing_median_age,0.043908
num_pipeline__total_rooms,0.017801
num_pipeline__population,0.017092


### Randomized Search

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=16),
    }

forest_reg = RandomForestRegressor()
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=100, cv=10, scoring='neg_mean_squared_error',
                               n_jobs=4)
rnd_search.fit(X_train, np.ravel(y_train))

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=4,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a0b738940>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a0b7387f0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [10]:
rnd_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=174, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [11]:
np.sqrt(-rnd_search.best_score_)

48443.53845389765

In [12]:
importance = pd.DataFrame({'importance': rnd_search.best_estimator_.feature_importances_},
                          index = X_train.columns)
importance.sort_values(ascending=False, by='importance')

Unnamed: 0,importance
num_pipeline__median_income,0.334844
cat_pipeline__INLAND,0.154042
num_pipeline__population_per_household,0.106235
num_pipeline__bedrooms_per_room,0.079933
num_pipeline__longitude,0.072441
num_pipeline__latitude,0.066764
num_pipeline__rooms_per_household,0.056723
num_pipeline__housing_median_age,0.043968
num_pipeline__population,0.017527
num_pipeline__total_rooms,0.017509


In [13]:
from sklearn.externals import joblib
final_model = rnd_search.best_estimator_
joblib.dump(final_model, get_path("my_random_forest_regressor.pkl"))

['../results/my_random_forest_regressor.pkl']

## VI.6 Measure the performance on the test set

In [14]:
X_test = pd.read_csv(get_path('X_test.csv'), index_col=0)
y_test = pd.read_csv(get_path('y_test.csv'), index_col=0)

In [15]:
from sklearn.metrics import mean_squared_error
y_pred = final_model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))

48804.902267
