## Chapter 2 - Exercises

1. Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

In [35]:
import pandas as pd

housing = pd.read_csv('datasets/housing/housing.csv')

In [36]:
# create a column for 5 income groups
import numpy as np

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [37]:
# Create stratified sampling based on the income category
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) 
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
housing = strat_train_set.drop("median_house_value", axis=1) # predictions
housing_labels = strat_train_set["median_house_value"].copy() # labels

In [38]:
# Now that we've separated the train and test sets, 
# drop column created to turn the data back to its original state
for set_ in (strat_train_set, strat_test_set): 
    set_.drop("income_cat", axis=1, inplace=True)

In [39]:
# preparing a tranformer class:

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room 
    
    def fit(self, X, y=None):
        return self # nothing else to do
    
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix] 
        population_per_household = X[:, population_ix] / X[:, households_ix] 
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")), 
                         ('attribs_adder', CombinedAttributesAdder()), 
                         ('std_scaler', StandardScaler())])

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

housing_num = housing.drop('ocean_proximity', axis=1)
num_attribs = list(housing_num)

cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([('num', num_pipeline, num_attribs),
                                   ('cat', OneHotEncoder(), cat_attribs)])

housing_prepared = full_pipeline.fit_transform(housing)

In [55]:
from sklearn.svm import SVR

svr_reg = SVR()
svr_reg.fit(housing_prepared, housing_labels)

SVR(kernel='linear')

In [56]:
from sklearn.metrics import mean_squared_error

housing_prediction = svr_reg.predict(housing_prepared)
svr_mse = mean_squared_error(housing_labels, housing_prediction)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

106874.50415237377

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

svr_reg = SVR()

param_grid = [
  {
      'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0], 'kernel': ['linear']
  },
  {
      'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0], 
      'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0], 
      'kernel': ['rbf']}
      ]

grid = GridSearchCV(svr_reg, param_grid=param_grid, cv=5, n_jobs=4, 
                    scoring='neg_mean_squared_error', verbose=2)

grid.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 15.3min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 23.3min finished


GridSearchCV(cv=5, estimator=SVR(), n_jobs=4,
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error', verbose=2)

In [75]:
negative_mse = grid.best_score_
rmse = np.sqrt(-negative_mse)
rmse

70338.36756805143

In [67]:
grid.best_params_

{'C': 1000.0, 'kernel': 'linear'}

In [68]:
# evaluation scores
cvres = grid.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

81649.4933206491 {'C': 10.0, 'kernel': 'linear'}
74344.25372740348 {'C': 30.0, 'kernel': 'linear'}
71235.18700401639 {'C': 100.0, 'kernel': 'linear'}
70485.58977342138 {'C': 300.0, 'kernel': 'linear'}
70338.36756805143 {'C': 1000.0, 'kernel': 'linear'}
70346.3175689201 {'C': 3000.0, 'kernel': 'linear'}
70366.12439508669 {'C': 10000.0, 'kernel': 'linear'}
70361.65894739094 {'C': 30000.0, 'kernel': 'linear'}
118743.74833054295 {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
118581.24055475516 {'C': 1.0, 'gamma': 0.03, 'kernel': 'rbf'}
118556.82411885096 {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}
118764.37910427665 {'C': 1.0, 'gamma': 0.3, 'kernel': 'rbf'}
118894.65726808914 {'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'}
118918.06616121177 {'C': 1.0, 'gamma': 3.0, 'kernel': 'rbf'}
118398.06973905365 {'C': 3.0, 'gamma': 0.01, 'kernel': 'rbf'}
117911.51997815768 {'C': 3.0, 'gamma': 0.03, 'kernel': 'rbf'}
117816.57261693868 {'C': 3.0, 'gamma': 0.1, 'kernel': 'rbf'}
118441.34978932199 {'C': 3.0, 'gamma'

Best perfomance: 70338.36756805143 {'C': 1000.0, 'kernel': 'linear'}

2. Try replacingGridSearchCV with RandomizedSearchCV.

In [82]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# After running my own version, and getting not so good results,
# I took a look at the book's answers to see what parameters the author used.

param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

svm_reg = SVR()

grid_rand = RandomizedSearchCV(svm_reg, param_distributions=param_distribs, cv=5, 
                               n_jobs=-1, n_iter=50, scoring='neg_mean_squared_error', 
                               verbose=2, random_state=42)

grid_rand.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 31.8min finished


RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x120abfe80>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x120abf128>,
                                        'kernel': ['linear', 'rbf']},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [83]:
negative_mse = grid_rand.best_score_
rmse = np.sqrt(-negative_mse)
rmse

55215.341404932245

In [84]:
grid_rand.best_params_

{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

3. Try adding a transformer in the preparation pipeline to select only the most important attributes.

...will continue tomorrow