In [19]:
# Import packages
import os
import tarfile
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [20]:
# Where the dataset lives: the repository root it is downloaded from, the
# dataset directory (used both as the remote sub-path and as the local folder
# name), and the full URL of the archive built from the two.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = f"{DOWNLOAD_ROOT}{HOUSING_PATH}/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it on disk.

    The archive is saved as ``housing.tgz`` inside ``'../' + housing_path``
    and extracted into that same directory. Every call downloads the archive
    again and overwrites the previous copy.

    Parameters
    ----------
    housing_url : str
        URL of the tar archive to download.
    housing_path : str
        Directory (prefixed with ``'../'``) that receives the archive and its
        contents. It is created when it does not exist yet.

    Raises
    ------
    urllib.error.URLError
        If the download fails.
    tarfile.TarError
        If the downloaded file cannot be read or extracted as a tar archive.
    """
    target_dir = '../' + housing_path
    # exist_ok=True replaces the separate isdir() check and removes the gap
    # between checking for the directory and creating it.
    os.makedirs(target_dir, exist_ok=True)
    tgz_path = os.path.join(target_dir, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter rejects
            # members that would be written outside target_dir.
            housing_tgz.extractall(path=target_dir, filter="data")
        else:
            # Interpreters without extraction filters: unfiltered extraction,
            # same as before. Only use with archives from a trusted source.
            housing_tgz.extractall(path=target_dir)


def load_housing_data(housing_path=HOUSING_PATH):
    """Read the extracted ``housing.csv`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Dataset directory; the file is looked up in ``'../' + housing_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``housing.csv``.
    """
    data_dir = '../' + housing_path
    csv_file = os.path.join(data_dir, "housing.csv")
    housing_frame = pd.read_csv(csv_file)
    return housing_frame


# Download and unpack the archive (network access required), then read the
# extracted CSV. At this point `housing` is the DataFrame returned by
# load_housing_data(), i.e. the data before any train/test split.
fetch_housing_data()
housing = load_housing_data()

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical from run to run.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Separate the target from the predictors of the training portion.
# NOTE: `housing` is rebound here from the loaded frame to the training
# predictors only. Re-running this cell without first re-running the load
# cell fails, because the target column is no longer present in `housing`.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append up to three ratio columns to a 2-D array of numeric features.

    Each constructor flag switches one derived column on or off, so the
    choice of extra features can be tuned like any other hyper-parameter.

    Parameters
    ----------
    add_rooms_per_household : bool, default=True
        Append column 3 divided by column 6.
    add_population_per_household : bool, default=True
        Append column 5 divided by column 6.
    add_bedrooms_per_room : bool, default=True
        Append column 4 divided by column 3.
    """

    def __init__(self, add_rooms_per_household=True, add_population_per_household=True, add_bedrooms_per_room=True):
        self.add_rooms_per_household = add_rooms_per_household
        self.add_population_per_household = add_population_per_household
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the enabled ratio columns appended on the right.

        ``X`` is indexed positionally (``X[:, i]``), so it must be a 2-D array
        rather than a DataFrame. When every flag is off, ``X`` is returned
        unchanged.
        """
        # Positions of rooms, bedrooms, population and households.
        # NOTE(review): these fixed indices presume the column order of the
        # frame fed into the numeric pipeline - confirm against the CSV.
        rooms, bedrooms, population, households = (X[:, ix] for ix in (3, 4, 5, 6))

        # Every ratio is computed up front; the flags only decide which ones
        # get appended, always in this order.
        candidates = (
            (self.add_rooms_per_household, rooms / households),
            (self.add_population_per_household, population / households),
            (self.add_bedrooms_per_room, bedrooms / rooms),
        )
        for enabled, ratio in candidates:
            if enabled:
                X = np.c_[X, ratio]
        return X

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

# Numeric branch: fill missing values with the column median, append the ratio
# features, then standardize every column.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
# The former LabelEncoder step is removed: LabelEncoder is meant for the
# target y, and its fit_transform(y) accepts a single argument, so Pipeline's
# fit_transform(X, y) call raised a TypeError. OneHotEncoder encodes string
# categories on its own.
# handle_unknown="ignore" encodes a category that was absent from the fitted
# data as all zeros instead of raising, which matters when a cross-validation
# training fold misses a rare category.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('cat_encoder', OneHotEncoder(handle_unknown="ignore"))
])

# Route every column except "ocean_proximity" through the numeric branch and
# "ocean_proximity" through the categorical branch.
preprocessor = ColumnTransformer([
    ("num_transformer", num_transformer, list(housing.drop("ocean_proximity", axis=1).columns)),
    ("cat_transformer", cat_transformer, ["ocean_proximity"])
])

In [24]:
# End-to-end pipeline: preprocessing followed by a regressor. The "model"
# step is an empty placeholder here; the searches below fill it in through
# the 'model' entry of their parameter sets.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", None),
])

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# First search: which imputation strategy, which ratio features and which
# model family work best together. The full grid has 3 * 2 * 2 * 2 * 4 = 96
# combinations; RandomizedSearchCV samples only n_iter of them (10 by
# default), so this is a randomized search despite the `grid_search` name,
# which is kept because later cells refer to it.
param_grid = [
    {'preprocessor__num_transformer__imputer__strategy': ['mean', 'median', 'most_frequent'],
     'preprocessor__num_transformer__attribs_adder__add_rooms_per_household': [True, False],
     'preprocessor__num_transformer__attribs_adder__add_population_per_household': [True, False],
     'preprocessor__num_transformer__attribs_adder__add_bedrooms_per_room': [True, False],
     # The forest is seeded so its score does not change between runs.
     'model': [LinearRegression(), RandomForestRegressor(random_state=42), SVR(), KNeighborsRegressor()]
     }
]

# random_state fixes which candidates are sampled, making the reported best
# parameters reproducible after a kernel restart (42 matches the split seed).
grid_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)
grid_search.fit(housing, housing_labels)
print(grid_search.best_params_)

{'preprocessor__num_transformer__imputer__strategy': 'mean', 'preprocessor__num_transformer__attribs_adder__add_rooms_per_household': False, 'preprocessor__num_transformer__attribs_adder__add_population_per_household': False, 'preprocessor__num_transformer__attribs_adder__add_bedrooms_per_room': False, 'model': RandomForestRegressor()}


In [32]:
# Second search: the preprocessing options are pinned to single values, so
# only the random forest's size and feature sub-sampling vary. That leaves
# 6 * 5 = 30 combinations, of which RandomizedSearchCV samples n_iter
# (10 by default).
param_grid = [
    {'preprocessor__num_transformer__imputer__strategy': ['mean'],
     'preprocessor__num_transformer__attribs_adder__add_rooms_per_household': [True],
     'preprocessor__num_transformer__attribs_adder__add_population_per_household': [True],
     'preprocessor__num_transformer__attribs_adder__add_bedrooms_per_room': [True],
     # Seeded so that differences between candidates come from the
     # hyper-parameters rather than from the forest's own randomness.
     'model': [RandomForestRegressor(random_state=42)],
     'model__n_estimators': [3, 10, 30, 100, 300, 1000],
     'model__max_features': [2, 4, 6, 8, 10]
     }
]

# random_state fixes which candidates are sampled, so the winning parameters
# are the same on every run (42 matches the split seed).
grid_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)
grid_search.fit(housing, housing_labels)
print(grid_search.best_params_)



{'preprocessor__num_transformer__imputer__strategy': 'mean', 'preprocessor__num_transformer__attribs_adder__add_rooms_per_household': True, 'preprocessor__num_transformer__attribs_adder__add_population_per_household': True, 'preprocessor__num_transformer__attribs_adder__add_bedrooms_per_room': True, 'model__n_estimators': 300, 'model__max_features': 6, 'model': RandomForestRegressor()}


In [33]:
# The search refits the best candidate on the whole training set when it
# finishes (refit=True is the default), so best_estimator_ is ready to use.
# The extra model.fit(...) call that used to follow only repeated that
# training and is dropped.
model = grid_search.best_estimator_

X_test = test_set.drop("median_house_value", axis=1)
y_test = test_set["median_house_value"]

# For a regressor, score() reports R^2 on the held-out test set. This is a
# different metric from the negative MSE used to pick the model above.
print(model.score(X_test, y_test))

0.8213021914758261
