# Fine-Tune a Model
We now have three different models with similar performance. Next, we need to fine-tune them to get the most out of each one.

In [1]:
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from scipy import stats

In [2]:
# Restore the three previously saved regressors (decision tree, linear, random forest).
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.
tree_model, linear_model, forest_model = map(
    joblib.load,
    (
        "models/tree_reg_housing.pkl",
        "models/lin_reg_housing.pkl",
        "models/forest_reg_housing.pkl",
    ),
)

In [3]:
# Path of the CSV holding the already-prepared (scaled / one-hot encoded) training features
data_prep_parts = ("..", "datasets", "housing", "data_prep", "housing_prep.csv")
data_prep = os.path.join(*data_prep_parts)

housing_prepared = pd.read_csv(filepath_or_buffer=data_prep)
housing_prepared

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.245020,0.504782,0.725490,0.039731,0.056218,0.019816,0.062920,0.152019,0.024826,0.001128,0.067348,1.0,0.0,0.0,0.0,0.0
1,0.241036,0.479277,0.254902,0.017119,0.017075,0.008492,0.020724,0.408374,0.034653,0.001622,0.043996,1.0,0.0,0.0,0.0,0.0
2,0.712151,0.024442,0.588235,0.049499,0.075548,0.026150,0.085885,0.162908,0.021983,0.001073,0.073633,0.0,0.0,0.0,0.0,1.0
3,0.472112,0.400638,0.470588,0.046828,0.059439,0.040836,0.065534,0.095447,0.029137,0.002771,0.059064,0.0,1.0,0.0,0.0,0.0
4,0.573705,0.179596,0.313725,0.167523,0.245329,0.124891,0.272778,0.174811,0.023976,0.001896,0.070047,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.619522,0.176408,0.882353,0.032177,0.037693,0.015976,0.038835,0.305603,0.034962,0.001639,0.053591,0.0,1.0,0.0,0.0,0.0
16508,0.676295,0.142402,0.764706,0.030269,0.047036,0.029401,0.047797,0.108157,0.024899,0.002724,0.075265,0.0,1.0,0.0,0.0,0.0
16509,0.791833,0.164718,0.156863,0.123340,0.140142,0.058718,0.142457,0.191197,0.037051,0.001650,0.051403,0.0,1.0,0.0,0.0,0.0
16510,0.631474,0.136026,0.588235,0.049702,0.060889,0.037921,0.066094,0.245693,0.031078,0.002508,0.056546,1.0,0.0,0.0,0.0,0.0


In [4]:
trainset_path = os.path.join("..", "datasets", "housing", "train", "housing_strat_train.csv")
strat_train_set = pd.read_csv(trainset_path)

# Labels: the target column the models are trained to predict
housing_labels = strat_train_set["median_house_value"]

# Predictors: everything except the target and the row identifier.
# A single non-inplace drop leaves strat_train_set untouched.
housing = strat_train_set.drop(columns=["median_house_value", "id"])

# Numeric columns DF (all predictors except the categorical one)
numeric_housing = housing.drop(columns="ocean_proximity")

# Categorical columns DF (double brackets keep it a DataFrame)
housing_cat = housing[["ocean_proximity"]]

### Grid Search
GridSearch provides an automatic way of searching for the best hyperparameter combination instead of searching manually.

In [5]:
# Two independent sub-grids are explored:
#   * 3 x 4 = 12 combinations using the estimator's current bootstrap setting
#   * 2 x 3 = 6 combinations with bootstrapping explicitly disabled
default_bootstrap_grid = {'n_estimators' : [3, 10, 30], 'max_features' : [2, 4, 6, 8]}
no_bootstrap_grid = {'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [2, 3, 4]}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

# Each of the 18 candidates is scored with 5-fold cross-validation on negated MSE
# (scikit-learn maximises scores, hence the negation).
grid_search = GridSearchCV(
    estimator=forest_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],
    

In [6]:
# Hyperparameter combination that achieved the best mean cross-validated score
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

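The outcome {'max_features': 6, 'n_estimators': 30} uses the largest `n_estimators` value in the grid (30), so the score may keep improving with more trees. We should try again with higher values of `n_estimators`. Note that `max_features` = 6 is not at the edge of the grid (8 was also evaluated and scored slightly worse).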

In [7]:
# The estimator configured with the winning hyperparameters
# (GridSearchCV refits it on the full training data when refit=True, its default)
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=6, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [8]:
cvres = grid_search.cv_results_

# The scorer is negated MSE, so flip the sign and take the square root to
# report an RMSE for every evaluated hyperparameter combination.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, candidate in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, candidate)

64314.080334784085 {'max_features': 2, 'n_estimators': 3}
55456.194504309664 {'max_features': 2, 'n_estimators': 10}
53293.48096681795 {'max_features': 2, 'n_estimators': 30}
60915.90797390935 {'max_features': 4, 'n_estimators': 3}
53185.6638787699 {'max_features': 4, 'n_estimators': 10}
50797.80164583807 {'max_features': 4, 'n_estimators': 30}
58707.73086039366 {'max_features': 6, 'n_estimators': 3}
52466.86335565673 {'max_features': 6, 'n_estimators': 10}
50005.71764250188 {'max_features': 6, 'n_estimators': 30}
59145.66559085611 {'max_features': 8, 'n_estimators': 3}
52243.74165460452 {'max_features': 8, 'n_estimators': 10}
50108.37191660722 {'max_features': 8, 'n_estimators': 30}
63111.74552108382 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54180.664922832504 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59922.30211318795 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52924.653796960614 {'bootstrap': False, 'max_features': 3, 'n_estimators':

## Randomized Search
RandomizedSearchCV explores the hyperparameter space by evaluating random combinations, up to a given maximum number of iterations. It is better suited to large hyperparameter search spaces, and it is useful when you have a fixed computing budget, since you can set the number of iterations to match that budget.

## Ensemble Methods
Another way to fine-tune a system is to combine the models that perform best, since a group of models (an ensemble) can perform better than the best individual model.

## Analyze the Best Models and their Errors

In [9]:
# From previous notebooks
# Positional indices of the raw columns used to derive the ratio features
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns of a 2-D array.

    Always adds rooms-per-household and population-per-household; optionally
    adds bedrooms-per-room. Columns are looked up by the module-level
    ``*_ix`` positional indices, so ``X`` must support ``X[:, i]`` indexing.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Hyperparameter: whether to append the bedrooms/rooms ratio as well
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        derived_columns = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            derived_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room

        return np.column_stack([X, *derived_columns])
        
# Provided transformers (stand-alone instances; the pipelines below build their own)
imputer = SimpleImputer(strategy="median")
cat_enconder = OneHotEncoder()
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)

# Column groups routed to each branch of the ColumnTransformer
num_attribs = numeric_housing.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric branch: fill missing values, append ratio features, rescale with MinMaxScaler
std_num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('attrib_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
        ('std_scaler', MinMaxScaler()),
    ]
)

# Full preprocessing: numeric branch plus one-hot encoding of the categorical column
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', std_num_pipeline, num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
    ]
)

# Fit on the training predictors only; the test set is later passed through transform()
full_pipeline.fit(housing)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('attrib_adder',
                                                  CombinedAttributesAdder(add_bedrooms_per_room=True)),
                                                 ('std_scaler',
          

In [10]:
# Relative importance of each input column according to the tuned forest
best_forest = grid_search.best_estimator_
feature_importance = best_forest.feature_importances_
feature_importance

array([7.35405121e-02, 6.90548273e-02, 4.24809514e-02, 1.77738833e-02,
       1.74578773e-02, 1.78167005e-02, 1.64125163e-02, 3.27521552e-01,
       4.52725958e-02, 1.07833428e-01, 8.64516523e-02, 1.06799969e-02,
       1.59662676e-01, 8.82337873e-05, 3.82448402e-03, 4.12811321e-03])

In [11]:
# Importance scores vs attribute names
# The names are rebuilt by hand in the same order the pipeline emits columns:
# original numeric columns, then the derived ratios, then the one-hot categories.
extra_attribs = ["rooms_perhhold", "pop_per_hhold", "bedrooms_per_room"]
cat_enconder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_enconder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

# zip() silently truncates to the shorter input, which would mislabel or drop
# features if the hand-built name list ever drifts from the model's inputs.
assert len(attributes) == len(feature_importance), (
    f"{len(attributes)} attribute names for {len(feature_importance)} importances"
)

# Most important features first
sorted(zip(feature_importance, attributes), reverse=True)

[(0.32752155205086614, 'median_income'),
 (0.15966267615382082, 'INLAND'),
 (0.1078334275556137, 'pop_per_hhold'),
 (0.08645165230606836, 'bedrooms_per_room'),
 (0.0735405120543594, 'longitude'),
 (0.06905482733336016, 'latitude'),
 (0.04527259576487489, 'rooms_perhhold'),
 (0.04248095138185572, 'housing_median_age'),
 (0.017816700483959873, 'population'),
 (0.017773883326142024, 'total_rooms'),
 (0.01745787733002535, 'total_bedrooms'),
 (0.016412516292657775, 'households'),
 (0.010679996944556399, '<1H OCEAN'),
 (0.004128113210300525, 'NEAR OCEAN'),
 (0.003824484024227125, 'NEAR BAY'),
 (8.823378731174312e-05, 'ISLAND')]

This list helps us understand which attributes are the most useful features. We could drop the least important ones.

## Evaluating your system on the Test Set
Steps:
1- Get the predictors and labels from the Test Set.<br/>
2- Run the full_pipeline.<br/>
3- Call transform() NOT fit_transform(). We don't want to fit the test set.<br/>
4- Evaluate the results.<br/>

In [12]:
# Load test set, discarding the row identifier (not a feature)
test_path = os.path.join("..", "datasets", "housing", "test", "housing_strat_test.csv")
test_set = pd.read_csv(test_path).drop(columns="id")

# Labels
y_test = test_set["median_house_value"].copy()

# Predictors: run through the pipeline fitted on the training data.
# transform() only -- the pipeline must not be re-fitted on the test set.
housing_pred = test_set.drop(columns="median_house_value")
x_test_prep = full_pipeline.transform(housing_pred)

In [13]:
# Evaluate the tuned model once on the held-out test set
final_model = grid_search.best_estimator_
final_pred = final_model.predict(X=x_test_prep)

# RMSE is reported in the same units as the label
final_mse = mean_squared_error(y_true=y_test, y_pred=final_pred)
final_rmse = np.sqrt(final_mse)
final_rmse

47751.74888828847

To check how precise this estimate is, we compute a 95% confidence interval for the generalisation error.

In [14]:
confidence = 0.95
squared_errors = (final_pred - y_test) ** 2

# t-interval around the mean squared error (n - 1 degrees of freedom, scaled by
# the standard error of the mean); the square root converts both bounds to RMSE.
n_errors = len(squared_errors)
mse_interval = stats.t.interval(
    confidence,
    n_errors - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

array([45773.57201926, 49651.17469283])