In [42]:
import pandas as pd
import numpy as np

# Load the raw housing data; the path is relative to the notebook's directory.
# (A bare `housing.head()` used to follow here, but only the last expression
# of a cell is rendered, so it displayed nothing and has been dropped.)
housing = pd.read_csv('../datasets/housing/housing.csv')

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder

# stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit

# Bucket median income into categories: ceil(income / 1.5), with every
# category of 5 or more collapsed into 5.0.  The result is assigned back
# explicitly instead of using `inplace=True` on a column selection, which is
# deprecated and does not reliably write through to the DataFrame.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

# A single 80/20 split stratified on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional indices, hence .iloc.  The helper column is
# dropped by creating new frames, which avoids mutating a slice in place
# (SettingWithCopyWarning) and avoids shadowing the builtin `set`.
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

# From here on `housing` holds the training predictors only; the target is
# kept separately in `housing_labels`.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Column positions read by CombinedAttributesAdder from the 2-D array it receives.
# NOTE(review): presumably the rooms, bedrooms, population and households
# columns of the numeric housing attributes -- confirm against the CSV layout.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: rooms / households and
    population / households.  When ``add_bedrooms_per_room`` is true a third
    column, bedrooms / rooms, is appended as well.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]

        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        # np.c_ turns each 1-D ratio into a column and concatenates it after X.
        return np.c_[tuple([X] + extra_columns)]

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks columns out of a DataFrame and returns their values.

    ``attribute_names`` is used to index the input with ``X[attribute_names]``;
    the ``.values`` of that selection is what ``transform`` returns.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the underlying values of the selected columns."""
        chosen = X[self.attribute_names]
        return chosen.values

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `SimpleImputer` is its replacement and is available in the
# version this notebook was run with.  Imported here so the cell is
# self-contained.
from sklearn.impute import SimpleImputer

# Every column except the categorical one is treated as numeric.
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select columns -> fill missing values with the column
# median -> append the ratio features -> standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the column -> dense one-hot encoding.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output`; kept as-is to match the version used for the outputs below.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('one_hot_encoder', OneHotEncoder(sparse=False)),
])

# Run both branches on the same input and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared.shape



(16512, 16)

In [43]:
# 1. Try SVM with various hyperparameters
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy
    array).  Nothing is returned.
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Baseline: linear-kernel SVR.  `gamma` is not passed because it is a kernel
# coefficient for the rbf/poly/sigmoid kernels only and has no effect on a
# linear kernel, so the results are unchanged.
svm_reg = SVR(kernel="linear", C=200)
svm_reg.fit(housing_prepared, housing_labels)

# Training-set RMSE, kept for reference only (an optimistic estimate):
# housing_predictions = svm_reg.predict(housing_prepared)
# svm_mse = mean_squared_error(housing_labels, housing_predictions)
# svm_rmse = np.sqrt(svm_mse)
# svm_rmse

# 10-fold cross-validation.  The scorer returns negated MSE, so the sign is
# flipped before taking the square root to obtain per-fold RMSE.
svm_scores = cross_val_score(svm_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
svm_rmse_scores = np.sqrt(-svm_scores)
display_scores(svm_rmse_scores)

Scores: [67935.62044974 68671.47730845 72168.24911247 73993.81945908
 69710.8493005  73959.49915997 66941.18368742 70430.61309063
 73455.94451581 70583.16248608]
Mean: 70785.0418570158
Standard deviation: 2410.4415354154876


In [44]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 linear candidates (C only) and 3 x 3 rbf candidates
# (C crossed with gamma), i.e. 12 candidates in total.
c_values = [100, 200, 300]
param_grid = [
    dict(kernel=["linear"], C=c_values),
    dict(kernel=["rbf"], C=c_values, gamma=[0.1, 0.3, 0.5]),
]

# `svr` is reused by the randomized search further down.
svr = SVR()
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [100, 200, 300], 'kernel': ['linear']},
                         {'C': [100, 200, 300], 'gamma': [0.1, 0.3, 0.5],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [45]:
# Parameter combination the grid search selected as best under the
# neg-mean-squared-error scoring it was configured with.
grid_search.best_params_

{'C': 300, 'kernel': 'linear'}

In [46]:
# Estimator configured with the best parameters; the search repr above shows
# refit=True, which is what makes this attribute available.
grid_search.best_estimator_

SVR(C=300, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [47]:
# Report the cross-validated RMSE of every grid candidate.  Mean test scores
# are negated MSE values, so flip the sign before taking the square root.
cvres = grid_search.cv_results_
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse_value, params in zip(candidate_rmse, cvres["params"]):
    print(rmse_value, params)

71603.12196479437 {'C': 100, 'kernel': 'linear'}
70914.93688195093 {'C': 200, 'kernel': 'linear'}
70703.95891598675 {'C': 300, 'kernel': 'linear'}
98578.87492167123 {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
106517.78628279532 {'C': 100, 'gamma': 0.3, 'kernel': 'rbf'}
111147.05176152407 {'C': 100, 'gamma': 0.5, 'kernel': 'rbf'}
89553.1282311912 {'C': 200, 'gamma': 0.1, 'kernel': 'rbf'}
99533.19490870919 {'C': 200, 'gamma': 0.3, 'kernel': 'rbf'}
105953.51175746799 {'C': 200, 'gamma': 0.5, 'kernel': 'rbf'}
84514.70619517785 {'C': 300, 'gamma': 0.1, 'kernel': 'rbf'}
95252.79103138024 {'C': 300, 'gamma': 0.3, 'kernel': 'rbf'}
102114.3034190811 {'C': 300, 'gamma': 0.5, 'kernel': 'rbf'}


In [54]:
# 2. Try RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Search space: kernel is drawn from a list, C from a reciprocal
# (log-uniform) distribution over [20, 200000] and gamma from an
# exponential distribution with scale 1.0.
param_distribs = dict(
    kernel=['linear', 'rbf'],
    C=reciprocal(20, 200000),
    gamma=expon(scale=1.0),
)

# 50 sampled candidates x 5 folds = 250 fits on 4 workers; the captured log
# below shows this took several hours.  `svr` comes from the grid-search cell.
rand_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_distribs,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=4,
    verbose=2,
    random_state=42,
)
rand_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 15.1min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 339.2min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='auto_deprecated',
                                 kernel='rbf', max_iter=-1, shrinking=True,
                                 tol=0.001, verbose=False),
                   iid='warn', n_iter=50, n_jobs=4,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000239998865C0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000239998869E8>,
                                        'kernel': ['linear', 'rbf']},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='neg_mean_squared_error',
                   verbose=2)

In [52]:
# best_score_ is a negated MSE (the search used neg_mean_squared_error),
# so negate it back before taking the square root to get the RMSE.
negative_mse = rand_search.best_score_
rmse = np.sqrt(np.negative(negative_mse))
rmse

63973.37905067421

In [53]:
# Sampled hyperparameters of the candidate the randomized search selected as best.
rand_search.best_params_

{'C': 4366.15629113969, 'gamma': 0.07484460991679197, 'kernel': 'rbf'}