# Preparing Data for ML Algorithms

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

First, let's separate the labels (median_house_value) and the predictors (the rest of the columns)

In [2]:
# Load stratified trainning set.
trainset_path = os.path.join("..","datasets","housing","train","housing_strat_train.csv")
strat_train_set = pd.read_csv(trainset_path)

# Predictors
housing = strat_train_set.drop("median_house_value", axis=1, inplace=False)
housing.drop("id", axis=1, inplace=True)


# Labels
housing_labels = strat_train_set["median_house_value"]

In [3]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16354.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,534.97389,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,412.699041,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,295.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,644.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


In [4]:
housing_labels

0        286600.0
1        340600.0
2        196900.0
3         46300.0
4        254500.0
           ...   
16507    240200.0
16508    113000.0
16509     97800.0
16510    225900.0
16511    500001.0
Name: median_house_value, Length: 16512, dtype: float64

## Data Cleaning

Most ML algorithms can't work with missing features. We need to implement functions to take care of them.<br/>
total_bedrooms attribute has some missing values, we need to do something with them.<br/>
Option1: get rid of the districts with missing values.<br/>
Option2: get rid of the whole attribute<br/>
Option3: Set missing values to some value.<br/>

In [5]:
# 1
option1 = housing.dropna(subset=["total_bedrooms"])
# 2
option2 = housing.drop("total_bedrooms", axis=1)
# 3
median = housing["total_bedrooms"].median()
option3 = housing.copy()
option3["total_bedrooms"].fillna(median, inplace=True)

In [6]:
option1.describe()
# Notice that the count of every attribute is set to 16354 like total_bedrooms.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0
mean,-119.575471,35.639354,28.641556,2624.246117,534.97389,1419.15886,496.999266,3.876957
std,2.001732,2.138251,12.58104,2141.933421,412.699041,1115.860053,375.485182,1.904516
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1445.0,295.0,784.0,279.0,2.567
50%,-118.51,34.26,29.0,2120.0,433.0,1164.0,408.0,3.5439
75%,-118.01,37.72,37.0,3139.75,644.0,1716.0,602.0,4.74715
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


In [7]:
option2.describe()
# Notice that total_bedrooms doesn't exist any more.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,35682.0,5358.0,15.0001


In [8]:
option3.describe()
# Notices that the count of total_bedrooms is increased to 16512 like the rest of the attributes.
# The median is kept untouched compared to housing.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,533.998123,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,410.839621,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,296.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,641.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


### Built in functions
Sckikit-learn provides a SimpleImputer to take care of missing values. You can choose which strategy you want to use. In this example we will use median strategy.<br/>
The imputer will compute the median value for each attribute and replace any missing value with it. <br/>

It is very important to store the median values used so that we can apply the same replacement to new training data and to the test data. The SimpleImputer stores this info in statistic_ instance variable.

First we need to train the imputer with the data using fit(), then we can apply the change to the dta with transform().

In [9]:
imputer = SimpleImputer(strategy="median")

In [10]:
# Train the imputer with fit().
# Note that to apply the imputer strategy we need to get rid of the non-numeric attributes: ocean_proximity.
numeric_housing = housing.drop("ocean_proximity", axis=1)
imputer.fit(numeric_housing)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [11]:
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [12]:
numeric_housing.median().values

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [13]:
# Transform the dataset with the trained imputer
x = imputer.transform(numeric_housing)
x

array([[-121.89  ,   37.29  ,   38.    , ...,  710.    ,  339.    ,
           2.7042],
       [-121.93  ,   37.05  ,   14.    , ...,  306.    ,  113.    ,
           6.4214],
       [-117.2   ,   32.77  ,   31.    , ...,  936.    ,  462.    ,
           2.8621],
       ...,
       [-116.4   ,   34.09  ,    9.    , ..., 2098.    ,  765.    ,
           3.2723],
       [-118.01  ,   33.82  ,   31.    , ..., 1356.    ,  356.    ,
           4.0625],
       [-122.45  ,   37.77  ,   52.    , ..., 1269.    ,  639.    ,
           3.575 ]])

In [14]:
# If we want to transform the array back to a DF, we can use pandas.
housing_tr = pd.DataFrame(x, columns=numeric_housing.columns, index=numeric_housing.index)
housing_tr.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,533.998123,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,410.839621,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,296.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,641.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


## Handling Text and Categorical Attributes
Now we have to deal with the ocean_proximity attribute. Let's start taking a look at its values.

In [15]:
# Note: DataFrame[] returns a pandas.core.series.Series Object whilst DataFrame[[]] returns a pandas.core.frame.DataFrame Object.
# The first looks for a specific key in the DataFrame columns. The second one subselects a list of columns form the main DataFrame.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
0,<1H OCEAN
1,<1H OCEAN
2,NEAR OCEAN
3,INLAND
4,<1H OCEAN
5,INLAND
6,<1H OCEAN
7,INLAND
8,<1H OCEAN
9,<1H OCEAN


We can see that this attribute represents a list of categories. Most of the ML algorithms prefer to work with numbers, so we have to translate each category to a number. <br/>
We can use scikit-learn's OrdinalEncoder class to do so. <br/>

In [16]:
ordinal_encoder = OrdinalEncoder()
# We can use fit() to train the enconder and transform() to transforms the dataset or fit_transform() to do both.
housing_cat_enconder = ordinal_encoder.fit_transform(housing_cat)
housing_cat_enconder[:10]

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

The OrdinalEnconder stores a list of the categories that were detected in categories_

In [17]:
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

One issue with this representation is that  ML algorithsm will assume that values close to each other are more similar to distant ones. <br/>
To solve this we can use specific binary attributes for each category. This is called <b><i>one-hot enconding</i></b> <br/>
scikit-learn provides a OneHotEnconder class to convert categorical values into one-hot vectors.

In [18]:
cat_enconder = OneHotEncoder()
housing_cat1hot = cat_enconder.fit_transform(housing_cat)
housing_cat1hot
# Note that the output is a SciPy sparse matrix instead of a NumPy array. This will store only the location of non-zero elements in the matrix 
# instead of all of the values. When we have thousands of categories it will reduce significatly the memory usage.
# To convert it back to a NumPy array is the toarray() method.

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [19]:
housing_cat1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

We can get tha list of categories from the categories_ variables.

In [20]:
cat_enconder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

## Custom transformers
scikit-learn relies on duck typing instead of inheritance. To create new custom transformers we need to create a class that implements fit(), transform() and fit_transform(). We cam get the last one by free by adding TransformerMixin as a base class. If we add BaseEstimator avoiding **args and ***kargs we get get_params() and set_params() too.

In [21]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        rooms_per_household = X[:,rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [22]:
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
housing_extra_attr = attr_adder.fit_transform(housing.values)
housing_extra_attr

array([[-121.89, 37.29, 38.0, ..., 4.625368731563422, 2.094395280235988,
        0.22385204081632654],
       [-121.93, 37.05, 14.0, ..., 6.008849557522124, 2.7079646017699117,
        0.15905743740795286],
       [-117.2, 32.77, 31.0, ..., 4.225108225108225, 2.0259740259740258,
        0.24129098360655737],
       ...,
       [-116.4, 34.09, 9.0, ..., 6.34640522875817, 2.742483660130719,
        0.1796086508753862],
       [-118.01, 33.82, 31.0, ..., 5.50561797752809, 3.808988764044944,
        0.19387755102040816],
       [-122.45, 37.77, 52.0, ..., 4.843505477308295, 1.9859154929577465,
        0.22035541195476574]], dtype=object)

In [23]:
attr_adder.get_params

<bound method BaseEstimator.get_params of CombinedAttributesAdder(add_bedrooms_per_room=True)>

In [24]:
attr_adder.set_params

<bound method BaseEstimator.set_params of CombinedAttributesAdder(add_bedrooms_per_room=True)>

## Feature Scaling
ML algorithms don't perform well when the input numerical attributes have very different sclaes. In this example the total number os rooms ranges from 6 to 39320, while the median income ranges ffrom 0 to 15.<br/>

We have two common ways to solve the problem: min-max scaling and standarisation.

#### Min-Max Scaling (Normalisation):
Values shifted and rescaled so that they end up ranging from 0 to 1.<br/>
We can acomplish this by subtracting the min value and dividing by max minus min (X-min / max - min).<br/>
Scikit-learn provides a transformer called MinMaxScaler that does this. It has a hyperparameter to change the 0,1 range if needed (feature_range).

#### Standarisation
First we subtract the mean value (so standarised values always have a zero mean), then it divides by the standard deviation (so that the resulting distribution has unit variance): x-mean / SD.<br/>
Unlike Min-Max Scaling, Standarisation does not bound values to a specific range. This can be a problem to some ML algorithms. On the other hand, Standarisation is much less afected by outliers.<br/>
Scikit-learn provides a transformer called StandardScaler that performs this action.

## NOTE:
As with all transformes, it is important to fit the scalers to the training data only (Not to the full set). Later on, it can be applied to the test set and/or any new set.

## Transfromation Pipelines
Scikit-learn provides a class to orchestrate a transformation pipeline: Pipeline.<br/>
This class takes a list of name/estimator pairs defining the sequence of steps. All but the last estimator must be transfromers.

In [25]:
std_num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attrib_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler', MinMaxScaler())
])

min_max_num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attrib_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler())
])


In [26]:
housing_num_tr_2 = min_max_num_pipeline.fit_transform(numeric_housing)
housing_num_tr_2

array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.31205452,
        -0.08649871,  0.15531753],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.21768338,
        -0.03353391, -0.83628902],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.46531516,
        -0.09240499,  0.4222004 ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.3469342 ,
        -0.03055414, -0.52177644],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.02499488,
         0.06150916, -0.30340741],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.22852947,
        -0.09586294,  0.10180567]])

In [27]:
housing_num_tr = std_num_pipeline.fit_transform(numeric_housing)
housing_num_tr

array([[0.24501992, 0.50478215, 0.7254902 , ..., 0.02482574, 0.00112831,
        0.06734832],
       [0.24103586, 0.47927736, 0.25490196, ..., 0.03465309, 0.00162207,
        0.04399646],
       [0.71215139, 0.02444208, 0.58823529, ..., 0.02198255, 0.00107325,
        0.07363329],
       ...,
       [0.79183267, 0.16471838, 0.15686275, ..., 0.03705086, 0.00164985,
        0.05140308],
       [0.6314741 , 0.1360255 , 0.58823529, ..., 0.03107846, 0.00250811,
        0.05654557],
       [0.18924303, 0.55579171, 1.        , ..., 0.02637524, 0.00104101,
        0.06608814]])

## Handling Numeric and Categorical columns
Up to now we have handled numeric and categorical caloumns separately applying the appropriate transformations to each other. Version 0.20 of scikit-learn provides the ColumnTransformer for this purpose.<br/>
ColumnTransformer receives a list of tuples with name, transformer and a list of columns.

In [28]:
num_attribs = list(numeric_housing)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ('num', std_num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [29]:
extra_columns = ("rooms_per_household", "population_per_household", "bedrooms_per_room")
cat_columns = cat_enconder.categories_[0]
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared = pd.DataFrame(housing_prepared, columns=[*numeric_housing.columns, *extra_columns, *cat_columns])
housing_prepared

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.245020,0.504782,0.725490,0.039731,0.056218,0.019816,0.062920,0.152019,0.024826,0.001128,0.067348,1.0,0.0,0.0,0.0,0.0
1,0.241036,0.479277,0.254902,0.017119,0.017075,0.008492,0.020724,0.408374,0.034653,0.001622,0.043996,1.0,0.0,0.0,0.0,0.0
2,0.712151,0.024442,0.588235,0.049499,0.075548,0.026150,0.085885,0.162908,0.021983,0.001073,0.073633,0.0,0.0,0.0,0.0,1.0
3,0.472112,0.400638,0.470588,0.046828,0.059439,0.040836,0.065534,0.095447,0.029137,0.002771,0.059064,0.0,1.0,0.0,0.0,0.0
4,0.573705,0.179596,0.313725,0.167523,0.245329,0.124891,0.272778,0.174811,0.023976,0.001896,0.070047,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.619522,0.176408,0.882353,0.032177,0.037693,0.015976,0.038835,0.305603,0.034962,0.001639,0.053591,0.0,1.0,0.0,0.0,0.0
16508,0.676295,0.142402,0.764706,0.030269,0.047036,0.029401,0.047797,0.108157,0.024899,0.002724,0.075265,0.0,1.0,0.0,0.0,0.0
16509,0.791833,0.164718,0.156863,0.123340,0.140142,0.058718,0.142457,0.191197,0.037051,0.001650,0.051403,0.0,1.0,0.0,0.0,0.0
16510,0.631474,0.136026,0.588235,0.049702,0.060889,0.037921,0.066094,0.245693,0.031078,0.002508,0.056546,1.0,0.0,0.0,0.0,0.0


In [30]:
housing_prepared.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,0.475515,0.329392,0.542218,0.06656,0.085696,0.039709,0.092431,0.232803,0.030615,0.001935,0.063691,0.440649,0.318738,0.000121,0.111858,0.128634
std,0.199388,0.227211,0.246563,0.054394,0.066179,0.03127,0.07015,0.131374,0.018552,0.009323,0.02355,0.49648,0.466001,0.011005,0.315201,0.334804
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.253984,0.148778,0.333333,0.036552,0.047358,0.02189,0.051718,0.142541,0.023523,0.001399,0.049762,0.0,0.0,0.0,0.0,0.0
50%,0.581673,0.182784,0.54902,0.053759,0.069427,0.03254,0.075803,0.209721,0.029137,0.00171,0.059855,0.0,0.0,0.0,0.0,0.0
75%,0.631474,0.550478,0.705882,0.079743,0.102932,0.048103,0.112024,0.292725,0.034991,0.002084,0.073215,1.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
! mkdir -p $AAI_HOME/datasets/housing/data_prep

In [32]:
prepared_set_path = os.path.join("..","datasets","housing","data_prep","housing_prep.csv")
housing_prepared.to_csv(prepared_set_path, index=False)

In [33]:
! ls -la $AAI_HOME/datasets/housing/data_prep

total 8200
drwxr-xr-x  3 adzarei  staff       96 Nov 17 05:22 [36m.[m[m
drwxr-xr-x  8 adzarei  staff      256 Nov 17 05:21 [36m..[m[m
-rw-r--r--@ 1 adzarei  staff  3964232 Nov 17 05:25 housing_prep.csv
