In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
housing_data = pd.read_csv("./datasets/housing/housing.csv")

In [21]:
housing_data["income_cat"] = pd.cut(housing_data["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing_data, housing_data["income_cat"]):
    strat_train_set = housing_data.loc[train_index]
    strat_test_set = housing_data.loc[test_index]
    
housing = strat_train_set.drop("median_house_value", axis=1).copy()
housing_labels = strat_train_set["median_house_value"].copy()

housing_test = strat_test_set.drop("median_house_value", axis=1).copy()
housing_test_labels = strat_test_set["median_house_value"].copy()

In [4]:
# numpy.c_ = <numpy.lib.index_tricks.CClass object>
#     from docs: Translates slice objects to concatenation along the second axis.
# np.c_[np.array([1,2,3]), np.array([4,5,6])]
# array([[1, 4],
#       [2, 5],
#       [3, 6]])

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# Remove the text attribute
housing_num = housing.drop("ocean_proximity", axis=1)

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num) # ignore ocean_proximity
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs), # do the numbers stuff
        ("cat", OneHotEncoder(), cat_attribs), # convert the text to one hot encoding
    ])


housing_prepared = full_pipeline.fit_transform(housing)

Excercise 1 mentions that:
kernel = "linear" or kernel="rbf" with various C and gamma

from docs (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html):
sklearn.svm.SVR(*, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=- 1)

In [26]:
# fit without cross validating
from sklearn.svm import SVR
SVM = SVR(C=1.0, kernel='rbf')

SVM.fit(housing_prepared, housing_labels)

In [8]:
housing_test_prepared = full_pipeline.fit_transform(housing_test)
test_predicted_labels = SVM.predict(housing_test_prepared)

In [25]:
len(housing_test_prepared)

16512

In [23]:
len(test_predicted_labels)

16512

In [57]:
len()

-0.04668996097547229

In [65]:
# we can also put the whole thing in 1 pipeline...

sp = ColumnTransformer([
        ("num", num_pipeline, num_attribs), # do the numbers stuff
        ("cat", OneHotEncoder(), cat_attribs), # convert the text to one hot encoding
    ])

est = Pipeline([("pre",sp),("svm", SVR(C=1.0, kernel='linear'))]).fit(housing, housing_labels)