## Import libraries

In [14]:
import os

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

## Import data

In [2]:
# Directory holding the housing dataset, relative to the notebook's working directory
DATA_PATH = os.path.join(os.pardir, 'datasets', 'housing-kaggle')

In [3]:
def load_data(data_path=DATA_PATH):
    """Read ``housing.csv`` from ``data_path`` and return it as a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains ``housing.csv``. Defaults to ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` produces for that file.
    """
    csv_path = os.path.join(data_path, 'housing.csv')
    housing_df = pd.read_csv(csv_path)
    return housing_df

In [4]:
# Load the full dataset from the default DATA_PATH location
housing = load_data()
# Preview the first five rows (five is also head()'s default)
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Train-test split data

### Stratified sampling

In [6]:
## Create 5 bins and assign labels to them - income category
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

strata = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

## Perform stratified split
# split() yields *positional* row numbers, so rows must be selected with
# .iloc. Label-based .loc only happens to work while `housing` still has its
# default 0..n-1 index, and breaks (or silently picks the wrong rows) once the
# frame has been filtered or re-indexed.
# n_splits=1, so the generator produces exactly one (train, test) pair.
train_index, test_index = next(strata.split(housing, housing['income_cat']))

## Remove the income_cat attribute from train and test data
# drop() returns new frames, so nothing is mutated in place on a slice of
# `housing` (which can trigger SettingWithCopyWarning).
strata_train_set = housing.iloc[train_index].drop(columns='income_cat')
strata_test_set = housing.iloc[test_index].drop(columns='income_cat')

## Data Cleaning

In [11]:
# Features: every column of the training set except the target
housing = strata_train_set.drop(columns='median_house_value')

# Target: an independent copy of the median_house_value column
housing_labels = strata_train_set['median_house_value'].copy()

# Numerical features only: ocean_proximity is dropped because it is non-numerical
housing_num = housing.drop(columns='ocean_proximity')

## Data transformation

In [12]:
# Column groups routed to each sub-pipeline
num_attributes = housing_num.columns.tolist()
cat_attributes = ['ocean_proximity']

# Numerical columns: fill missing values with the column median, then standardise
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: one-hot encode
cat_pipeline = Pipeline(steps=[
    ('one_hot', OneHotEncoder()),
])

# Apply each sub-pipeline to its own column group
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

# Note: despite its name, housing_num_tr holds the output of the *full*
# pipeline, i.e. the numerical and the one-hot encoded categorical columns.
housing_num_tr = full_pipeline.fit_transform(housing)

## Model training

In [13]:
# Fit a Linear Regression model on the prepared training data
# (fit() returns the estimator itself, so the call can be chained)
lr = LinearRegression().fit(housing_num_tr, housing_labels)
lr

LinearRegression()

## Evaluate model

In [18]:
# Predict on the same rows the model was fitted on (training-set error)
housing_predictions = lr.predict(housing_num_tr)

# Mean squared error between the true labels and the predictions
lr_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)

# Root mean squared error, expressed in the same units as the target
lr_rmse = np.sqrt(lr_mse)
lr_rmse

69050.98178244587

The `median_house_value` values mostly range between 120,000 and 265,000, so a typical prediction error (RMSE) of about 69,051 is not satisfactory. This is an example of model **underfitting**: the error is large even on the data the model was trained on. It can be resolved by selecting a more powerful model or adding more features.