## Import libraries

In [37]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

## Import data

In [2]:
# Directory holding the housing dataset, relative to the notebook's working directory
DATA_PATH = os.path.join(os.pardir, 'datasets', 'housing-kaggle')

In [3]:
def load_data(data_path=DATA_PATH):
    """Read ``housing.csv`` from ``data_path`` and return it as a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains ``housing.csv``. Defaults to ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = os.path.join(data_path, 'housing.csv')
    housing_df = pd.read_csv(csv_path)
    return housing_df

In [4]:
# Read the raw CSV from the default DATA_PATH location
housing = load_data()
# Preview the first five rows
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Train-test split data

### Stratified sampling

In [5]:
## Create 5 bins and assign labels to them - income category
## (only used as the stratification key for the split)
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

strata = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

## Perform stratified split
## n_splits=1, so split() yields exactly one (train, test) pair
train_index, test_index = next(strata.split(housing, housing['income_cat']))

## split() returns positional indices, so select rows with iloc (loc would
## only be correct while the index is a default RangeIndex).
## Drop the income_cat helper column from the train and test data without
## mutating the selected frames in place.
strata_train_set = housing.iloc[train_index].drop(columns='income_cat')
strata_test_set = housing.iloc[test_index].drop(columns='income_cat')

## Data Cleaning

In [6]:
# Features: the stratified training set without the target column
housing = strata_train_set.drop(columns=['median_house_value'])

# Target: an independent copy of the median house values
housing_labels = strata_train_set['median_house_value'].copy()

### 1. Handle missing values

Missing values are generally handled in one of the following ways:

 - Drop the rows with missing values
 - Drop the columns with missing values
 - Set missing values with some value (0, mean, median or any custom value)

To decide, let's start by counting the missing values per column and computing their percentage.

In [8]:
# Boolean mask of missing cells, computed once and reused below
null_mask = housing.isnull()

# Number of missing values per column, largest first
total_missing = null_mask.sum().sort_values(ascending=False)

# Share of missing values per column, as a percentage of all rows
total_missing_percent = (
    null_mask.sum() / null_mask.count() * 100
).sort_values(ascending=False)

# Side-by-side summary table
missing_values = pd.concat(
    [total_missing, total_missing_percent],
    axis=1,
    keys=['Missing', 'Missing %'],
)
missing_values

Unnamed: 0,Missing,Missing %
total_bedrooms,158,0.95688
longitude,0,0.0
latitude,0,0.0
housing_median_age,0,0.0
total_rooms,0,0.0
population,0,0.0
households,0,0.0
median_income,0,0.0
ocean_proximity,0,0.0


#### Simple imputer to handle missing values

Impute missing values with a defined strategy. The `median` strategy can only be computed on **numerical** attributes. Hence, remove any column with a text attribute first.

In [18]:
# The median strategy only works on numerical attributes, so drop the
# non-numerical ocean_proximity column first
housing_num = housing.drop(columns=['ocean_proximity'])

imputer = SimpleImputer(strategy='median')
imputer.fit(housing_num)

# The values learned during fit are stored in statistics_
print(imputer.statistics_)

[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]


In [19]:
# Use the fitted imputer to fill the missing values of the training features
X = imputer.transform(housing_num)

# transform() returns a NumPy array of the transformed features, not a DataFrame
print(X)

[[-121.89     37.29     38.     ...  710.      339.        2.7042]
 [-121.93     37.05     14.     ...  306.      113.        6.4214]
 [-117.2      32.77     31.     ...  936.      462.        2.8621]
 ...
 [-116.4      34.09      9.     ... 2098.      765.        3.2723]
 [-118.01     33.82     31.     ... 1356.      356.        4.0625]
 [-122.45     37.77     52.     ... 1269.      639.        3.575 ]]


In [20]:
# Rebuild a DataFrame from the imputed array, reusing the column names
# and row index of the numerical training features
housing_tr = pd.DataFrame(
    data=X,
    index=housing_num.index,
    columns=housing_num.columns,
)

housing_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347


### 2. Handle text and categorical attributes

Most ML algorithms work with numbers, so categorical attributes need to be converted to numbers. Two common encoders are:
 - OrdinalEncoder
 - OneHotEncoder

In [24]:
# Extract the categorical attribute as a single-column DataFrame
housing_cat = housing.loc[:, ['ocean_proximity']]

#### Ordinal Encoder
This assigns a numeric value to every category of the attribute.

The problem with this approach is that ML algorithms will assume an ordering between the values, e.g. that a category encoded as 1 is closer to a category encoded as 2 than to one encoded as 4, even when the categories have no such relationship.

In [28]:
oe = OrdinalEncoder()

# Learn the categories, then replace each one with its numeric code
housing_cat_oe = oe.fit(housing_cat).transform(housing_cat)

# Categories found during fit
print(oe.categories_)

# First five encoded rows
housing_cat_oe[:5]

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)]


array([[0.],
       [0.],
       [4.],
       [1.],
       [0.]])

#### One Hot Encoder or dummy attributes

Creates one binary attribute per category. For each row, only one attribute equals 1 (hot); the rest are all 0 (cold).

Note that the output is a SciPy sparse matrix instead of a NumPy array; call `toarray()` to convert it to a dense array when needed.

In [31]:
ohe = OneHotEncoder()

# Learn the categories, then encode each row as one binary column per category
housing_cat_ohe = ohe.fit(housing_cat).transform(housing_cat)

# Convert the encoded result to a dense array for display
housing_cat_ohe.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

### 3. Feature scaling

A transformation applied so that all numerical attributes share a similar scale. Two common ways:

 - Min-max scaling (normalization) --> MinMaxScaler
 - Standardization                 --> StandardScaler

**Note:** Scaling the target is generally not required

#### Min-max scaling (normalization)

Values are shifted and rescaled such that they have the range 0 - 1. 

$ 
\frac{X - min} {(max - min)} 
$

**Note:** The `feature_range` hyperparameter of `MinMaxScaler` can be used to change the range if you do not want 0–1.

#### Standardization

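Subtracts the mean and divides by the standard deviation. It does not bound values to a specific range; however, the resulting distribution has zero mean and unit variance. Standardized values are **less affected by outliers** than min-max scaled ones.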

$
\frac {X - \mu} {\sigma}
$

**Note:** Standardization is usually the better choice when the values contain a few outliers.

## Transformation pipeline

In [39]:
# Column names routed to each sub-pipeline
num_attributes = housing_num.columns.tolist()
cat_attributes = ['ocean_proximity']

# Numerical attributes: impute missing values with the median, then standardize
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical attributes: one-hot encode
cat_pipeline = Pipeline(steps=[
    ('one_hot', OneHotEncoder()),
])

# Apply each sub-pipeline to its own set of columns
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

# Fit on the training features and transform them in one step
housing_num_tr = full_pipeline.fit_transform(housing)

In [40]:
# Display the output of full_pipeline; despite its name, it holds the
# results of both the 'num' and the 'cat' transformers
housing_num_tr

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])