In [None]:
import sys
import pathlib

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

# Helper scripts and datasets are looked up in sibling directories of the
# notebook's working directory.
scripts_dir = pathlib.Path.cwd().parent / 'scripts'
datasets_dir = pathlib.Path.cwd().parent / 'datasets'

housing_dir = datasets_dir / 'housing'
housing_csv = housing_dir / 'housing.csv'

# Only fetch the dataset when the CSV is not already on disk.
if not housing_csv.is_file():
    scripts_path = str(scripts_dir)
    # Temporarily expose the scripts directory so `fetch_datasets` is importable.
    sys.path.append(scripts_path)
    try:
        import fetch_datasets
        # NOTE(review): presumably populates housing_dir with housing.csv -
        # confirm against scripts/fetch_datasets.py.
        fetch_datasets.housing(housing_dir)
    finally:
        # Always undo the sys.path change, even if the import or the fetch
        # fails, and remove our own entry rather than whatever is last.
        sys.path.remove(scripts_path)

housing_data = pd.read_csv(housing_csv)

In [None]:
# Peek at the first five rows of the loaded data.
housing_data.head(n=5)

In [None]:
# Print a concise structural summary of the loaded DataFrame.
housing_data.info()

In [None]:
# How many rows fall into each distinct `ocean_proximity` value.
housing_data.ocean_proximity.value_counts()

In [None]:
# Show summary statistics for the loaded data.
housing_data.describe()

In [None]:
# One 50-bin histogram per column, drawn on a large canvas.
housing_data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed seed keeps it reproducible across runs.
train_set, test_set = train_test_split(
    housing_data,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Row counts of the random train / test split.
train_set.shape[0], test_set.shape[0]

In [None]:
# Bucket median income into five ordered categories (1-5); the last bucket is
# open-ended so every income above 6.0 lands in category 5.
housing_data['income_cat'] = pd.cut(
    x=housing_data.median_income,
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)
# Distribution of rows across the income categories.
housing_data.income_cat.hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split on the income category, with a fixed seed for
# reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(
    split.split(housing_data, housing_data['income_cat'])
)

# The splitter yields *positional* indices, so select rows with .iloc; .loc
# would only be equivalent while the frame keeps a default RangeIndex.
# `income_cat` was only needed for stratification, so drop it from both
# subsets. A non-mutating drop returns fresh frames and avoids in-place
# modification of objects derived from `housing_data`.
strat_train_set = housing_data.iloc[train_index].drop(columns='income_cat')
strat_test_set = housing_data.iloc[test_index].drop(columns='income_cat')

# Work on a copy during exploration so the training set stays untouched.
exploration_set = strat_train_set.copy()

In [None]:
# Scatter plot of longitude against latitude.
exploration_set.plot.scatter(x='longitude', y='latitude')

In [None]:
# Same scatter with mostly transparent markers, so overlapping points show up
# as darker regions.
exploration_set.plot.scatter(x='longitude', y='latitude', alpha=0.1)

In [None]:
# Longitude/latitude scatter where marker size encodes population (scaled
# down by 100) and marker colour encodes median house value.
exploration_set.plot.scatter(
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=exploration_set['population'] / 100,
    label='population',
    figsize=(10, 7),
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)
plt.legend()

In [None]:
# Pairwise correlations between the numeric columns.
# pandas >= 2.0 no longer drops non-numeric columns silently in corr() and
# raises instead, so restrict the frame to numeric/bool columns explicitly.
# (`ocean_proximity` appears to be a non-numeric column - see the
# value_counts cell.) This form also works on older pandas releases.
corr_matrix = exploration_set.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of every column with the median house value, strongest
# positive correlation first.
corr_matrix.loc[:, 'median_house_value'].sort_values(ascending=False)

In [None]:
# Columns to compare pairwise in the scatter matrix.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
# Trailing semicolon suppresses the array-of-Axes repr in the cell output.
pd.plotting.scatter_matrix(exploration_set.loc[:, attributes], figsize=(12, 8));