    GETTING, EXPLORING AND CLEANING THE DATA

In [1]:
import os
import tarfile
from six.moves import urllib

In [6]:
# function to fetch the data
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded archive and its
        extracted contents; it is created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [7]:
fetch_housing_data()
# when fetch_housing_data() is called, it creates a datasets/housing 
# directory in your workspace and downloads the housing.tgz file

URLError: <urlopen error [Errno -2] Name or service not known>

In [None]:
# function to load the data
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
housing = load_housing_data()
housing.head()

In [None]:
housing.info()

In [None]:
# the describe() method shows a summary of the numerical attributes
housing.describe()

In [None]:
housing['ocean_proximity'].value_counts()

In [None]:
# a histogram for each numerical attribute
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins = 50, figsize = (20, 15))
plt.show()

In [None]:
# For illustration only. Sklearn has train_test_split()

import numpy as np

def split_train_test(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; they are selected by position.
    test_ratio : float
        Fraction of the rows placed in the test set (the count is truncated
        to an integer).
    random_state : int or None, optional
        Seed for a dedicated random generator, making the split reproducible.
        When ``None`` (the default) NumPy's global random state is used.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    n_rows = len(data)
    if random_state is None:
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # A local generator gives the same split for the same seed without
        # touching the global NumPy random state.
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)
    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")

In [None]:
# splitting using train_test_split from sklearn

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
print(len(train_set), "train +", len(test_set), "test")

In [None]:
# There should not be too many strata, and each stratum should be large enough.
# Build an income category attribute by dividing the median income by 1.5 (to
# limit the number of categories), rounding up with ceil (to get discrete
# categories), and then merging every category greater than 5 into category 5.

housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
# Assign the result back rather than calling `where(..., inplace=True)` on the
# selected column: that is chained assignment, which does not update `housing`
# when pandas Copy-on-Write is active.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5.0)

In [None]:
#histogram of income categories

housing['income_cat'].hist()

In [None]:
# stratified sampling based on the income category

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional row numbers, so select by position with .iloc;
# label-based .loc only matches while `housing` has its default 0..n-1 index.
# .copy() makes both sets independent frames that are safe to modify later.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
# checking the stratification in the income category proportions in the full housing dataset:

housing["income_cat"].value_counts() / len(housing)

In [None]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# function for sampling bias 

def income_cat_proportions(data):
    """Return, per income category, its count divided by the number of rows in ``data``."""
    category_counts = data["income_cat"].value_counts()
    total_rows = len(data)
    return category_counts / total_rows

# Purely random split, used as the baseline for the comparison below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# One column of income-category proportions per sample, indexed by category.
proportions_by_sample = {
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions_by_sample).sort_index()

# Relative deviation (in percent) of each test set from the full dataset.
for error_col, sample_col in (("Rand. %error", "Random"),
                              ("Strat. %error", "Stratified")):
    compare_props[error_col] = 100 * compare_props[sample_col] / compare_props["Overall"] - 100

In [None]:
# sampling bias comparison: stratified versus purely random sampling

compare_props

In [None]:
# we remove the income_cat attribute so the data is back to its original state

# Rebind instead of dropping in place, and tolerate an already-removed column,
# so this cell can be re-run without raising KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

DISCOVER AND VISUALIZE THE DATA TO GAIN INSIGHTS

In [None]:
# creating a copy so we can work with it without harming the training set

housing = strat_train_set.copy()
housing.head()

VISUALIZING GEOGRAPHICAL DATA

In [None]:
# geographical scatterplot of the data

housing.plot(kind="scatter", x="longitude", y="latitude")

In [None]:
# better visualization, highlighting high-density areas

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)

In [None]:
# california housing prices
#1. radius of each circle represents the district's population 
#2. color represents the price
#3. we use a predefined color map, called jet, which ranges from blue (low values) to red (high values)

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
plt.legend()

LOOKING FOR CORRELATIONS

In [None]:
# calculating the standard correlation coefficient between every pair of attributes using the corr() method

# Keep only numeric columns: `ocean_proximity` holds strings, and from
# pandas 2.0 on corr() raises on non-numeric columns instead of skipping them.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# checking how much each attribute correlates with the median house value

corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# scatter matrix 

# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# median income vs median house value
# the most promising attribute to predict the median house value is the median income

housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])

In [None]:
# creating some new useful attributes

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
# checking the correlation matrix again

# Keep only numeric columns: `ocean_proximity` holds strings, and from
# pandas 2.0 on corr() raises on non-numeric columns instead of skipping them.
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix['median_house_value'].sort_values(ascending = False)

    PREPARE THE DATA FOR MACHINE LEARNING ALGORITHMS

In [None]:
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# call the simpleImputer to fill in the missing values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

In [None]:
# since the median can only be computed on numerical attributes, we need to create a copy of the data without the text attribute ocean_proximity 

housing_num = housing.drop('ocean_proximity', axis = 1)

In [None]:
# fitting the imputer instance to the training data set using fit()

imputer.fit(housing_num)

In [None]:
# manually checking the imputer statistics is the same as the median of each attribute

imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
# use the "trained" imputer to transform the training set

X = imputer.transform(housing_num)

In [None]:
# putting it back into a pandas dataframe

# Reuse the original row index: the imputer returns a bare NumPy array, and
# without `index=` the new frame would get a fresh 0..n-1 index that no longer
# lines up with `housing_num`.
housing_tr = pd.DataFrame(X, columns = housing_num.columns,
                          index = housing_num.index)

    HANDLING TEXT AND CATEGORICAL ATTRIBUTES

In [None]:
# we convert the text labels to numbers

housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]
ordinal_encoder.categories_

In [None]:
# we convert integer categorical values into one-hot vectors

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
# By default, the OneHotEncoder class returns a sparse array, but we can convert it to a dense array if needed by calling the toarray() method:

housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_

In [None]:
housing.columns

CUSTOM TRANSFORMERS

In [None]:
# we write our own specific transformer for custom cleanup operations or combining specific attributes
# in this example, the transformer has one hyperparameter, add_bedrooms_per_room, set to True by default

from sklearn.base import BaseEstimator, TransformerMixin

# get the right column indices: safer than hard-coding indices 3, 4, 5, 6
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns to a 2-D array.

    Always adds rooms-per-household and population-per-household; also adds
    bedrooms-per-room when ``add_bedrooms_per_room`` is true. Column positions
    come from the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Same as np.c_[X, col1, col2, ...]: stack the new columns to the right of X.
        return np.c_[tuple([X] + extra_columns)]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [None]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

In [None]:
# there are many data transformation steps that need to be executed in the right order

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr

In [None]:
# custom transformer to automatically extract the numerical columns into a NumPy array. 

from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the given columns of a DataFrame and return their ``.values``."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: selection needs no fitting.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values

In [None]:
# we join all these components into a big pipeline that will preprocess both the numerical and the categorical features

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back to
# the old one on earlier releases. Either way the encoder returns a dense array.
try:
    dense_cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_cat_encoder = OneHotEncoder(sparse=False)

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', dense_cat_encoder),
    ])

In [None]:
# we join the two pipelines into a single pipeline by using sklearn's FeatureUnion class

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [None]:
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

SELECT AND TRAIN A MODEL

In [None]:
# training with a linear regression model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# let's try the full preprocessing pipeline on a few training instances

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

In [None]:
# measure this model's RMSE on the whole training set 

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# measure this model's mean absolute error on the whole training set 

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

In [None]:
# as the results are not too satisfying, we choose a more powerful model (decision tree regressor)

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# evaluate the trained model on the training set

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

BETTER EVALUATION USING CROSS-VALIDATION

In [None]:
# as our model is overfitting we cross validate our findings

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error', cv =10)
tree_rsme_scores = np.sqrt(-scores)

In [None]:
# simple function to display the scores

def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must expose ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Standard Deviation: ', scores.std())
    
display_scores(tree_rsme_scores)

In [None]:
# computing the scores for linear regression
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
# using random forest regressor

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# computing the scores for random forest regressor

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# giving a general idea of the entire scores

scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

In [None]:
# trying support vector regression

from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

FINE TUNING OUR MODEL

In [None]:
# to find a great combination of hyperparameter values

from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]
forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# to get the best combination of parameters

grid_search.best_params_

In [None]:
# to get the best estimator directly

grid_search.best_estimator_

In [None]:
# evaluation scores

cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)