#  Exercise 1

In [112]:
import pandas as pd
import os
import tarfile
from six.moves import urllib

import numpy as np
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor

from scipy.stats import randint as sp_randint

%matplotlib inline

import warnings
warnings.filterwarnings(action="once") # To see warning only once

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [113]:
def load_housing_data(path=HOUSING_PATH):
    csv_path = os.path.join(path, "housing.csv")
    return pd.read_csv(csv_path)
housing = pd.read_csv("../../datasets/housing/housing.csv")
print "Loaded Data into Program"
housing.info()

Loaded Data into Program
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [114]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [115]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [116]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        print "Combined Attributes Adder"
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [117]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print "DataFrame Selector"
        return X[self.attribute_names].values

In [118]:
class CustomBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lb = LabelBinarizer()
    def fit(self, X, y=None,**fit_params):
        return self.lb.fit(X)
    def transform(self, X):
        print "Custom Binarizer"
        return self.lb.transform(X)

In [119]:
class PickNBestFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, n, col):
        self.n = n
        self.col = col
        self.names = None
    def fit(self, X, y=None):
        corr_matrix = pd.DataFrame(X).corr()
        corr_matrix = np.abs(corr_matrix)
        ordered_features = corr_matrix[self.col.index("median_house_value")].sort_values(ascending=False)
        self.names = ordered_features[1:self.n+1].axes
        return self
    def transform(self, X):
        print "Pick N Best Features"
        return X[self.names]

In [120]:
class DropLabel(BaseEstimator, TransformerMixin):
    def __init__(self, col, to_drop):
        self.col = col
        self.to_drop = to_drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        print "Drop Label"
        return np.delete(X, self.col.index(self.to_drop), axis=1)

In [121]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
housing = strat_train_set.copy()
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]
housing = housing.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [122]:
# housing = strat_train_set.drop("median_house_value", axis=1)
# housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('pick_best_feets', PickNBestFeatures(8, num_attribs)),
    ('drop_label', DropLabel(num_attribs, "median_house_value")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_bin', CustomBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

DataFrame Selector
Combined Attributes Adder
Pick N Best Features
Drop Label
DataFrame Selector
[['<1H OCEAN']
 ['<1H OCEAN']
 ['NEAR OCEAN']
 ['INLAND']
 ['<1H OCEAN']]


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [92]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())

In [None]:
svm_reg = SVR(kernel="rbf", degree=4, C=500, epsilon=1)
svm_reg.fit(housing_prepared, housing_labels)
svm_housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, svm_housing_predictions)
svm_rmse = np.sqrt(svm_mse)
print svm_rmse

svm_scores = cross_val_score(svm_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
svm_rmse_scores = np.sqrt(-svm_scores)

# Exercise 2

In [None]:
param_grid = { 
    'kernel':['rbf', 'poly', 'linear'],
    'gamma': [0.01, 0.1, 1, 10], 
    'C': [0.01, 0.1, 1, 10, 100], 
    'epsilon': [0.01, 0.1, 1, 10],
    'degree': [1, 2, 3, 4],
}
svm_reg = SVR()
rand_search = RandomizedSearchCV(estimator=svm_reg, 
                                 param_distributions=param_grid, 
                                 cv=5, 
                                 scoring="neg_mean_squared_error",
                                 n_iter=20)
rand_search.fit(housing_prepared, housing_labels)

# Exercise 4

In [None]:
best_model = rand_search.best_estimator_
print best_model
final_predictor = Pipeline([
    ('transformation', full_pipeline),
    ('predictor', best_model)
])

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

final_predictions = final_predictor.predict(X_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print final_rmse