In [1]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

In [2]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


fetch_housing_data()

In [4]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


housing = load_housing_data()

In [7]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

housing = strat_train_set.drop("median_house_value", axis=1)  # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

housing_num = housing.drop("ocean_proximity", axis=1)

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = [
    {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
     'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..............................C=10.0, kernel=linear; total time=   6.3s
[CV] END ..............................C=10.0, kernel=linear; total time=   6.3s
[CV] END ..............................C=10.0, kernel=linear; total time=   7.1s
[CV] END ..............................C=10.0, kernel=linear; total time=   6.3s
[CV] END ..............................C=10.0, kernel=linear; total time=   6.1s
[CV] END ..............................C=30.0, kernel=linear; total time=   6.4s
[CV] END ..............................C=30.0, kernel=linear; total time=   6.5s
[CV] END ..............................C=30.0, kernel=linear; total time=   6.2s
[CV] END ..............................C=30.0, kernel=linear; total time=   6.0s
[CV] END ..............................C=30.0, kernel=linear; total time=   6.0s
[CV] END .............................C=100.0, kernel=linear; total time=   6.0s
[CV] END .............................C=100.0, 

[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  11.9s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  11.8s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  11.8s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  11.9s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  12.0s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  11.7s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  11.7s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  11.6s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  11.7s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  11.4s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  11.5s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  11.5s
[CV] END ...................

[CV] END .....................C=300.0, gamma=0.1, kernel=rbf; total time=  13.2s
[CV] END .....................C=300.0, gamma=0.1, kernel=rbf; total time=  13.4s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  12.8s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  12.7s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  12.9s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  12.8s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  12.6s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  12.4s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  12.5s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  12.4s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  12.4s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  12.4s
[CV] END ...................

GridSearchCV(cv=5, estimator=SVR(),
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error', verbose=2)

In [14]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

70338.36757904004

In [15]:
grid_search.best_params_

{'C': 1000.0, 'kernel': 'linear'}