In [None]:
# Initial imports and set-up

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Relative directory that is expected to contain housing.csv
HOUSING_PATH = os.path.join('datasets', 'housing')

# Seed NumPy's global RNG so np.random-based steps repeat on a fresh run
np.random.seed(42)
%matplotlib inline

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from a directory into a DataFrame.

    Parameters
    ----------
    housing_path : str, optional
        Directory that contains ``housing.csv``. Defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    housing_csv = os.path.join(housing_path, 'housing.csv')
    housing_frame = pd.read_csv(housing_csv)
    return housing_frame



# Load and view data info

In [None]:
# Read the CSV into a DataFrame and preview its first rows
housing = load_housing_data()
housing.head()

In [None]:
# `info` is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the column / dtype / non-null summary.
housing.info()

In [None]:
# Number of rows for each distinct ocean_proximity value
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the columns
housing.describe()

In [None]:
# One 50-bin histogram per column on a 20x15 figure
housing.hist(bins=50, figsize=(20, 15))

## Create test set

In [None]:
from zlib import crc32

def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training and a test part.

    The rows are shuffled with NumPy's global random generator, so the
    result depends on the generator state when the function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Fraction of rows to put in the test part; the row count is
        truncated to an integer.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, selected by position from ``data``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # The first n_test shuffled positions become the test set, the rest train.
    test_rows, train_rows = order[:n_test], order[n_test:]
    return data.iloc[train_rows], data.iloc[test_rows]

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train/test parts based on a hash of an ID column.

    Each value of ``id_column`` is passed to ``test_set_check``; rows for
    which it returns ``True`` form the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Ratio forwarded to ``test_set_check``.
    id_column : str
        Name of the column holding the row identifiers.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    is_test_row = data[id_column].apply(
        lambda row_id: test_set_check(row_id, test_ratio))
    train_part = data.loc[~is_test_row]
    test_part = data.loc[is_test_row]
    return train_part, test_part

In [None]:
# Purely random 80/20 split; which rows land in the test set depends on the
# state of NumPy's global RNG at the time this cell runs
train_set, test_set = split_train_test(housing, 0.2)

## Prepare dataset

In [None]:
# reset_index() adds an 'index' column holding the current row labels,
# which is then used as the identifier for the hash-based split
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')

In [None]:
# Build an identifier from the coordinates (longitude * 1000 + latitude)
# and redo the hash-based split keyed on it.
# NOTE(review): rows with identical coordinates get the same id and therefore
# always end up in the same set.
housing_with_id['id'] = housing['longitude'] * 1000 + housing['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'id')

## Introduce Scikit-Learn

In [None]:
# Imports

from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split via scikit-learn, with random_state fixed to 42
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

## Prepare data continued

In [None]:
# Bucket median_income into five categories labelled 1-5 using the bin
# edges 0, 1.5, 3.0, 4.5, 6 and +inf; the stratified split below is keyed
# on this column
housing['income_cat'] = pd.cut(housing['median_income'],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])
# Show how many rows fall into each income category
housing['income_cat'].hist()

## Sampling

In [None]:
# Imports
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single stratified 80/20 split keyed on income_cat, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    # split() yields positional row numbers, so select with .iloc; the
    # label-based .loc only matches while the index happens to be 0..n-1.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Share of each income category within the stratified test set
strat_test_set['income_cat'].value_counts() / len(strat_test_set)

In [None]:
# Remove the helper column from both stratified sets.
# Reassigning instead of inplace=True avoids mutating frames that were
# selected out of `housing`, and errors='ignore' keeps the cell re-runnable
# (a second run would otherwise raise KeyError).
strat_train_set = strat_train_set.drop(columns='income_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='income_cat', errors='ignore')

# Visualize and explore data

In [None]:
# Rebind `housing` to a copy of the training set so that changes made while
# exploring do not affect strat_train_set
housing = strat_train_set.copy()

In [None]:
# Longitude/latitude scatter; with alpha=0.1 overlapping points build up
# into darker areas
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)

In [None]:
# Same scatter with marker size taken from population / 100 and marker
# colour from median_house_value (jet colormap, colorbar shown)
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
            s=housing['population']/100, label='population', figsize=(10, 7),
            c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()

## Finding Correlations

In [None]:
# Imports

from pandas.plotting import scatter_matrix

In [None]:
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises on them instead.
# NOTE(review): ocean_proximity is presumably a text column - confirm.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
# Correlation of each column with median_house_value, strongest positive first
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for the selected columns on a 12x8 figure
attributes = ['median_house_value', 'median_income', 'total_rooms',
             'housing_median_age']
scatter_matrix(housing[attributes], figsize=(12, 8))