In [24]:
import os
import tarfile
import requests 

# Base URL (raw GitHub content) that the dataset archive is downloaded from.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory, relative to the working directory, used for the archive and its contents.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing dataset.
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH, timeout=30):
    """Download the housing archive and extract it into ``housing_path``.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Directory that receives ``housing.tgz`` and its
            extracted contents; created if it does not exist.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        tarfile.TarError: If the downloaded file is not a valid archive.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")

    response = requests.get(housing_url, timeout=timeout)
    # Without this check an error page would be saved as "housing.tgz" and
    # only fail later with a confusing tarfile error.
    response.raise_for_status()
    with open(tgz_path, "wb") as f:
        f.write(response.content)

    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older Python without extraction filters: keep the plain call.
            housing_tgz.extractall(path=housing_path)


In [25]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [31]:
# Only hit the network when the extracted CSV is not already on disk, so a
# "Restart & Run All" stays fast and keeps working offline after the first run.
if not os.path.exists(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

housing = load_housing_data()

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# first rows must be displayed explicitly or they are silently dropped.
display(housing.head())
housing.info()  # prints its summary itself and returns None
housing["ocean_proximity"].value_counts()

In [None]:
# Show pandas' descriptive statistics for the housing DataFrame.
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20, 15))
plt.show()

We can see some interesting details in these histograms.
1. The median income is not expressed in USD; it has been scaled down so that 1 unit corresponds to roughly 10,000 USD.
2. The median age and value of houses are capped. This removes outliers, but it may also affect your algorithm by restricting the range of data it trains on.
3. All the graphs have very different scales and it is not immediately clear how they relate to each other.
4. All of the histograms are heavily right-skewed, which affects how our algorithm learns. It is better to transform the data into more bell-shaped distributions.

Now we need to split our data into training and test sets. We can do this in a few ways.

In [1]:
import numpy as np
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier) & 0xffffffff < test_ratio * 2 ** 32)

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test rows based on an identifier column.

    Each value in ``data[id_column]`` is passed to ``test_set_check`` together
    with ``test_ratio``; rows for which it is truthy form the test set.

    Returns:
        tuple: ``(train_rows, test_rows)`` selected from ``data``.
    """
    test_mask = data[id_column].apply(lambda id_: test_set_check(id_, test_ratio))
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows

In [None]:
# reset_index() moves the default row index into a new column named "index",
# which is used as the stable identifier for the split.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")