#  Exercise 1

In [67]:
import pandas as pd
import os
import tarfile
from six.moves import urllib

import numpy as np
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor

from scipy.stats import randint as sp_randint

%matplotlib inline

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [102]:
def load_housing_data(path=HOUSING_PATH):
    csv_path = os.path.join(path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
print "Loaded Data into Program"

Loaded Data into Program


In [103]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [104]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [105]:
class CustomBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lb = LabelBinarizer()
    def fit(self, X, y=None,**fit_params):
        return self.lb.fit(X)
    def transform(self, X):
        return self.lb.transform(X)

In [106]:
class PickNBestFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, n):
        self.n = n
        self.names = None
    def fit(self, X, y=None):
        corr_matrix = X.corr()
        corr_matrix = np.abs(corr_matrix)
        ordered_features = corr_matrix["median_house_value"].sort_values(ascending=False)
        self.names = ordered_features[1:self.n+1].axes
        return self
    def transform(self, X):
        return X[self.names]

In [107]:
class DropLabel(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return self.drop("median_house_value", axis=1)

In [108]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
housing = strat_train_set.copy()
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

In [109]:
# housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('pick_best_feets', PickNBestFeatures(8)),
    ('drop_label', DropLabel()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_bin', CustomBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

AttributeError: 'numpy.ndarray' object has no attribute 'corr'

In [28]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

In [29]:
svm_reg = SVR(kernel="rbf", degree=4, C=500, epsilon=1)
svm_reg.fit(housing_prepared, housing_labels)
svm_housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, svm_housing_predictions)
svm_rmse = np.sqrt(svm_mse)
print svm_rmse

svm_scores = cross_val_score(svm_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
svm_rmse_scores = np.sqrt(-svm_scores)

74920.7231392


# Exercise 2

In [34]:
param_grid = { 
    'kernel':['rbf', 'poly', 'linear'],
    'gamma': [0.01, 0.1, 1, 10], 
    'C': [0.01, 0.1, 1, 10, 100], 
    'epsilon': [0.01, 0.1, 1, 10],
    'degree': [1, 2, 3, 4],
}
svm_reg = SVR()
rand_search = RandomizedSearchCV(estimator=svm_reg, 
                                 param_distributions=param_grid, 
                                 cv=5, 
                                 scoring="neg_mean_squared_error",
                                 n_iter=20)
rand_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'kernel': ['rbf', 'poly', 'linear'], 'C': [0.01, 0.1, 1, 10, 100], 'degree': [1, 2, 3, 4], 'gamma': [0.01, 0.1, 1, 10], 'epsilon': [0.01, 0.1, 1, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

# Exercise 4

In [44]:
best_model = rand_search.best_estimator_
print best_model
final_predictor = Pipeline([
    ('transformation', full_pipeline),
    ('predictor', best_model)
])

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

final_predictions = final_predictor.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print final_rmse

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.01, gamma=10,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
69502.8234331
