In [None]:
import pathlib

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import hands_on_machine_learning as homl

# Locations of the housing dataset, relative to the notebook's working directory.
datasets_dir = pathlib.Path('datasets')
housing_dir = datasets_dir.joinpath('housing')
housing_csv = housing_dir.joinpath('housing.csv')

# Only call the fetch helper when the CSV is not already on disk.
# NOTE(review): presumably the helper places housing.csv inside housing_dir — confirm.
if not housing_csv.is_file():
    homl.datasets.fetch.housing(housing_dir)

# Load the raw dataset into a DataFrame.
housing_data = pd.read_csv(housing_csv)

### Take a quick look at the data structure

The goal of the project is to try to predict the sale value of homes in regions where the existing data on home values is sparse.  So, we will try to analyze home-buyers' behavior in terms of other data such as income, population density, and various facts about the houses themselves.

In [None]:
# Peek at the first five rows to see the columns and typical values.
housing_data.head(n=5)

In [None]:
# Column dtypes and non-null counts; useful for spotting columns with missing values.
housing_data.info()

In [None]:
# How many rows fall into each ocean_proximity category.
housing_data.loc[:, 'ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing_data.describe()

In [None]:
# Draw one 50-bin histogram per numeric column on a large figure.
housing_data.hist(figsize=(20, 15), bins=50)
plt.show()

### Create a Test Set

First create a naive test set by simply reserving 20% of the data, chosen randomly.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a purely random 20% of the rows; the fixed seed keeps the split
# reproducible across runs.
train_set, test_set = train_test_split(
    housing_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Row counts of the random train/test split.
train_set.shape[0], test_set.shape[0]

In [None]:
# Bucket median_income into five labelled brackets (1-5); the last bracket is
# open-ended so arbitrarily high incomes still receive a category.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing_data['income_cat'] = pd.cut(
    housing_data['median_income'],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

# Show how many rows land in each income bracket.
housing_data['income_cat'].hist()

However, since income levels probably have a significant effect on people's home-buying behavior, the test set will be more representative if we use a *stratified* sample, making sure to take a good number of samples from each income bracket. The `income_cat` column created above bins `median_income` into five brackets for exactly this purpose.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 shuffle split keyed on the income bracket, so both
# sets preserve the income_cat proportions of the full dataset. The fixed seed
# keeps the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing_data, housing_data['income_cat']))

# split() yields *positional* indices, so select rows with iloc; loc would only
# be correct while housing_data carries the default 0..n-1 index.
# income_cat was only needed for stratification. drop() returns new frames, so
# housing_data itself is left untouched and no in-place mutation of a slice
# (SettingWithCopyWarning) takes place.
strat_train_set = housing_data.iloc[train_index].drop(columns='income_cat')
strat_test_set = housing_data.iloc[test_index].drop(columns='income_cat')

## Discover and visualize the data to gain insights

We will create various plots to help us understand what we might want to look for.

In [None]:
# Explore a copy of the stratified training set so the original stays untouched.
housing_data = strat_train_set.copy()

# Plain geographic scatter: one point per row.
housing_data.plot.scatter(x='longitude', y='latitude')

In [None]:
# Same scatter with translucent points so dense areas stand out.
housing_data.plot.scatter(x='longitude', y='latitude', alpha=0.1)

The next plot shows that house value depends strongly on location, with expensive homes concentrated in the Bay Area and around LA/San Diego. Each point's size encodes population and its color encodes median house value.

In [None]:
# Geographic scatter where marker size tracks population (scaled down by 100)
# and marker colour tracks median_house_value on the 'jet' colormap.
housing_data.plot.scatter(
    x='longitude',
    y='latitude',
    s=housing_data['population']/100,
    c='median_house_value',
    alpha=0.4,
    label='population',
    figsize=(10,7),
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)
plt.legend()

### Looking for correlations

We will see if any of the attributes in the data set appear to be related to one another.

In [None]:
# Pairwise correlations between the numeric attributes.
# housing_data still contains the string column 'ocean_proximity'; since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns silently and
# raises instead, so restrict the frame to numeric columns first.
corr_matrix = housing_data.select_dtypes(include='number').corr()

In [None]:
# Correlation of every attribute with the target, strongest positive first.
corr_matrix.loc[:, 'median_house_value'].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for the target and three selected attributes.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
pd.plotting.scatter_matrix(housing_data.loc[:, attributes], figsize=(12, 8));

### Experimenting with attribute combinations

It may be useful to define some derived attributes that have better properties for machine learning, or that relate more directly to what we expect to affect home prices.

In [None]:
# Derived ratios: per-household and per-room figures instead of raw totals.
housing_data['rooms_per_household'] = housing_data['total_rooms'].div(housing_data['households'])
housing_data['bedrooms_per_room'] = housing_data['total_bedrooms'].div(housing_data['total_rooms'])
housing_data['population_per_household'] = housing_data['population'].div(housing_data['households'])

In [None]:
# Recompute the correlations now that the derived ratio columns exist.
# Restrict to numeric columns: housing_data still holds the string column
# 'ocean_proximity', and DataFrame.corr() raises on non-numeric data since
# pandas 2.0 instead of dropping it silently.
corr_matrix = housing_data.select_dtypes(include='number').corr()

# Correlation of every attribute with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

## Prepare the data for machine learning algorithms

First split the labels from the feature data. We will apply various transformations to the feature data to clean it.

In [None]:
# Target values, copied so they are independent of strat_train_set.
housing_labels = strat_train_set['median_house_value'].copy()

# Everything except the target column becomes the feature frame.
housing_features = strat_train_set.drop(columns='median_house_value')

### Data cleaning

In [None]:
from sklearn.impute import SimpleImputer

# Imputer configured to replace missing values with the column median.
imputer = SimpleImputer(strategy='median')

In [None]:
# Leave out the ocean_proximity column so only the remaining (presumably all
# numeric — confirm) feature columns are passed to the imputer.
housing_features_numerical = housing_features.drop(columns='ocean_proximity')

In [None]:
# Fit the imputer on the numeric training features; the trailing ';' hides the
# estimator repr in the cell output.
imputer.fit(housing_features_numerical);

In [None]:
# Per-column fill values learned by fit().
imputer.statistics_

In [None]:
# Fill the missing values, then wrap the transformed values back into a
# DataFrame that carries the original column names and row index.
X = imputer.transform(housing_features_numerical)
housing_features_imputed = pd.DataFrame(
    data=X,
    index=housing_features_numerical.index,
    columns=housing_features_numerical.columns,
)

### Handling text and categorical attributes