In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit


# Project Goals
The goal is to build a model that predicts the median house value of a Californian district from a number of features describing that district.

# Get the data

In [None]:
# Path to the raw housing CSV, relative to the notebook's working directory
file_name = 'data/raw/housing.csv'
# Load the full dataset into a DataFrame
housing_df = pd.read_csv(file_name)
# Preview the first rows
housing_df.head()

In [None]:
# Column names, dtypes and non-null counts: a quick check for missing values
housing_df.info()

In [None]:
# Number of rows for each distinct ocean_proximity value
housing_df['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
housing_df.describe()

In [None]:
# One 50-bin histogram per numeric column; the trailing ';' suppresses the text repr of the returned axes
housing_df.hist(bins=50, figsize=(20,15));


## Split the data
Here we create the training and test splits (80% / 20%).

In [None]:
# Purely random split with 20% of the rows held out for testing; random_state fixes the shuffle so the split is reproducible
train_set, test_set = train_test_split(housing_df, test_size=0.2, random_state=8)

In [None]:
# Preview the first rows of the random training split
train_set.head()

In [None]:
# Preview the first rows of the random test split
test_set.head()

## Reduce the number of income categories

In [None]:
# Distribution of median_income, the column the income categories below are derived from
housing_df["median_income"].hist();

In [None]:
# Create the income category column.
# Dividing by 1.5 and rounding up limits the number of income categories.
housing_df["income_cat"] = np.ceil(housing_df["median_income"] / 1.5)
# Cap the category at 5: values below 5 are kept, everything else becomes 5.0.
# The result is assigned back to the column instead of calling
# `.where(..., inplace=True)` on `housing_df["income_cat"]`. That in-place call
# mutates a Series pulled out of the frame (chained assignment), which pandas
# warns about and which leaves housing_df unchanged under Copy-on-Write.
housing_df["income_cat"] = housing_df["income_cat"].where(housing_df["income_cat"] < 5, 5.0)

In [None]:
# Plot a histogram of the new income_cat column to see how the rows are distributed across categories
housing_df["income_cat"].hist();

In [2]:
# Number of rows in each income category
housing_df["income_cat"].value_counts()

NameError: name 'housing_df' is not defined

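The histogram above shows that the income categories are not evenly populated. We therefore stratify the split on income category, so that the training and test sets each represent the categories in the same proportions as the full population.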

## Stratify data

In [None]:
# Split the data again, this time stratified on income_cat so that both sets
# keep the income-category proportions of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair of index arrays directly.
train_index, test_index = next(split.split(housing_df, housing_df["income_cat"]))
# split() yields positional indices, so rows are selected with .iloc.
# .loc would treat them as index labels, which is only correct while
# housing_df still has its default 0..n-1 index.
strat_train_set = housing_df.iloc[train_index]
strat_test_set = housing_df.iloc[test_index]

In [None]:
# Share of the stratified test set that falls in each income category
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# Share of the full dataset in each income category, for comparison with the stratified test set
housing_df["income_cat"].value_counts() / len(housing_df)

In [None]:
# Now we can drop the income_cat column from both stratified sets.
# Each name is rebound to a new frame instead of being dropped in place, and a
# missing column is ignored, so re-running this cell is a no-op rather than a
# KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

## Save strata to pickles 

In [None]:
# Write each stratified set to data/interim/ as a pickle named after its variable
names = ['strat_train_set', 'strat_test_set']
strata = [strat_train_set, strat_test_set]
for name, df in zip(names, strata):
    df.to_pickle(f'data/interim/{name}.pkl')

In [8]:
# Reload the stratified training and test sets from their pickles in data/interim/
strat_train_set = pd.read_pickle('data/interim/strat_train_set.pkl')
strat_test_set = pd.read_pickle('data/interim/strat_test_set.pkl')

# Explore data (gain insights)

# Prepare data

# Explore models

# Fine tune model


# Present solution

# Launch, monitor, and maintain system