# Preparing Data for ML Algorithms

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

First, let's separate the labels (median_house_value) and the predictors (the rest of the columns)

In [2]:
# Read the stratified training set from disk.
trainset_path = os.path.join("..", "datasets", "housing", "train", "housing_strat_train.csv")
strat_train_set = pd.read_csv(trainset_path)

# Labels: the median_house_value column.
housing_labels = strat_train_set["median_house_value"]

# Predictors: everything except the label column and the "id" column.
# drop() returns a new DataFrame, so strat_train_set keeps all of its columns.
housing = strat_train_set.drop(columns=["median_house_value", "id"])

In [3]:
# Summary statistics of the numeric predictors. Check the "count" row:
# an attribute whose count is lower than the others has missing values.
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16354.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,534.97389,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,412.699041,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,295.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,644.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


In [4]:
# Display the label Series (median_house_value).
housing_labels

0        286600.0
1        340600.0
2        196900.0
3         46300.0
4        254500.0
           ...   
16507    240200.0
16508    113000.0
16509     97800.0
16510    225900.0
16511    500001.0
Name: median_house_value, Length: 16512, dtype: float64

## Data Cleaning

Most ML algorithms can't work with missing features. We need to implement functions to take care of them.<br/>
The total_bedrooms attribute has some missing values, so we need to handle them. There are three options:<br/>
Option 1: get rid of the districts with missing values.<br/>
Option 2: get rid of the whole attribute.<br/>
Option 3: set the missing values to some value (zero, the mean, the median, etc.).<br/>

In [5]:
# Three ways to deal with the missing total_bedrooms values. Each option builds
# a new DataFrame, so `housing` itself is left unchanged.

# Option 1: drop the districts (rows) whose total_bedrooms is missing.
option1 = housing.dropna(subset=["total_bedrooms"])

# Option 2: drop the whole total_bedrooms attribute (column).
option2 = housing.drop("total_bedrooms", axis=1)

# Option 3: replace the missing values with the median of the column.
median = housing["total_bedrooms"].median()
option3 = housing.copy()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# option3["total_bedrooms"]. That chained in-place call operates on an
# intermediate Series: pandas 2.x warns about it, and with Copy-on-Write
# enabled it no longer updates option3 at all.
option3["total_bedrooms"] = option3["total_bedrooms"].fillna(median)

In [6]:
option1.describe()
# Notice that every attribute now has a count of 16354, the number of rows in which total_bedrooms is present.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0
mean,-119.575471,35.639354,28.641556,2624.246117,534.97389,1419.15886,496.999266,3.876957
std,2.001732,2.138251,12.58104,2141.933421,412.699041,1115.860053,375.485182,1.904516
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1445.0,295.0,784.0,279.0,2.567
50%,-118.51,34.26,29.0,2120.0,433.0,1164.0,408.0,3.5439
75%,-118.01,37.72,37.0,3139.75,644.0,1716.0,602.0,4.74715
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


In [7]:
option2.describe()
# Notice that the total_bedrooms column no longer exists; the other columns keep their full count.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,35682.0,5358.0,15.0001


In [8]:
option3.describe()
# Notice that the count of total_bedrooms has increased to 16512, like the rest of the attributes.
# The median (50% row) of total_bedrooms is unchanged compared to housing.

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,533.998123,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,410.839621,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,296.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,641.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


### Built in functions
Scikit-learn provides a SimpleImputer class to take care of missing values. You can choose which strategy to use; in this example we will use the median strategy.<br/>
The imputer will compute the median value for each attribute and replace any missing value with it. <br/>

It is very important to store the median values used so that we can apply the same replacement to new training data and to the test data. The SimpleImputer stores these values in its statistics_ instance variable.

First we need to train the imputer on the data using fit(); then we can apply the replacement to the data with transform().

In [9]:
# Create an imputer that replaces missing values with the median of each attribute.
imputer = SimpleImputer(strategy="median")

In [10]:
# The median can only be computed on numeric attributes, so the imputer is
# trained with fit() on the predictors without the text column ocean_proximity.
numeric_housing = housing.drop(columns=["ocean_proximity"])
imputer.fit(numeric_housing)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [11]:
# Median values learned by fit(), one per numeric attribute.
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [12]:
# Sanity check: the medians computed directly with pandas should match imputer.statistics_.
numeric_housing.median().values

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [13]:
# Transform the dataset with the trained imputer: missing values are replaced by the learned medians.
# The result is a NumPy array rather than a DataFrame.
x = imputer.transform(numeric_housing)
x

array([[-121.89  ,   37.29  ,   38.    , ...,  710.    ,  339.    ,
           2.7042],
       [-121.93  ,   37.05  ,   14.    , ...,  306.    ,  113.    ,
           6.4214],
       [-117.2   ,   32.77  ,   31.    , ...,  936.    ,  462.    ,
           2.8621],
       ...,
       [-116.4   ,   34.09  ,    9.    , ..., 2098.    ,  765.    ,
           3.2723],
       [-118.01  ,   33.82  ,   31.    , ..., 1356.    ,  356.    ,
           4.0625],
       [-122.45  ,   37.77  ,   52.    , ..., 1269.    ,  639.    ,
           3.575 ]])

In [14]:
# Wrap the imputed array in a pandas DataFrame again, reusing the column
# labels and the row index of the frame it was computed from.
housing_tr = pd.DataFrame(
    data=x,
    index=numeric_housing.index,
    columns=numeric_housing.columns,
)
housing_tr.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.575834,35.639577,28.653101,2622.728319,533.998123,1419.790819,497.06038,3.875589
std,2.00186,2.138058,12.574726,2138.458419,410.839621,1115.686241,375.720845,1.90495
min,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999
25%,-121.8,33.94,18.0,1443.0,296.0,784.0,279.0,2.566775
50%,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409
75%,-118.01,37.72,37.0,3141.0,641.0,1719.25,602.0,4.744475
max,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001


## Handling Text and Categorical Attributes
Now we have to deal with the ocean_proximity attribute. Let's start by taking a look at its values.

In [15]:
# Note: DataFrame[] returns a pandas.core.series.Series object whilst DataFrame[[]] returns a pandas.core.frame.DataFrame object.
# The first looks up a single key in the DataFrame columns. The second selects a list of columns from the main DataFrame.
# A DataFrame (2D) is used here because the encoders below are fitted on housing_cat.
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
0,<1H OCEAN
1,<1H OCEAN
2,NEAR OCEAN
3,INLAND
4,<1H OCEAN
5,INLAND
6,<1H OCEAN
7,INLAND
8,<1H OCEAN
9,<1H OCEAN


We can see that this attribute represents a list of categories. Most of the ML algorithms prefer to work with numbers, so we have to translate each category to a number. <br/>
We can use scikit-learn's OrdinalEncoder class to do so. <br/>

In [16]:
ordinal_encoder = OrdinalEncoder()
# We can use fit() to train the encoder and transform() to transform the dataset, or fit_transform() to do both in one call.
housing_cat_enconder = ordinal_encoder.fit_transform(housing_cat)
# Show the encoded values of the first 10 rows.
housing_cat_enconder[:10]

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

The OrdinalEncoder stores the list of categories it detected in its categories_ instance variable.

In [17]:
# Categories detected by the ordinal encoder; the position of a category in this list is its encoded value.
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

One issue with this representation is that ML algorithms will assume that two nearby values are more similar than two distant values, which is not the case for these categories. <br/>
To solve this we can create one binary attribute per category. This is called <b><i>one-hot encoding</i></b>. <br/>
Scikit-learn provides a OneHotEncoder class to convert categorical values into one-hot vectors.

In [18]:
# One-hot encode the category column: one binary attribute per category.
cat_enconder = OneHotEncoder()
housing_cat1hot = cat_enconder.fit_transform(housing_cat)
housing_cat1hot
# Note that the output is a SciPy sparse matrix instead of a NumPy array. It stores only the location of the non-zero elements
# instead of all of the values. When we have thousands of categories this significantly reduces memory usage.
# To convert it to a (dense) NumPy array, use the toarray() method.

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [19]:
# Compare the type of the encoder output with the type returned by toarray().
print(type(housing_cat1hot))
print(type(housing_cat1hot.toarray()))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>


In [20]:
# Display the one-hot encoded categories as a dense NumPy array (one column per category).
housing_cat1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

We can get the list of categories from the encoder's categories_ instance variable.

In [21]:
# Categories detected by the one-hot encoder.
cat_enconder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]