# Preparing Data for ML Algorithms

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

First, let's separate the labels (median_house_value) from the predictors (the rest of the columns).

In [2]:
# Read the stratified training set from disk.
trainset_path = os.path.join("..", "datasets", "housing", "train", "housing_strat_train.csv")
strat_train_set = pd.read_csv(trainset_path)

# Labels: the column we want to predict.
housing_labels = strat_train_set["median_house_value"]
# Predictors: every remaining column.
housing = strat_train_set.drop(columns=["median_house_value"])
# NOTE(review): the describe() output of the predictors shows an "Unnamed: 0" column,
# which looks like an index saved into the CSV -- confirm whether it should be
# loaded with index_col=0 instead of being kept as a predictor.

In [3]:
# Summary statistics of the numeric predictors; an attribute whose count is
# lower than the others has missing values.
housing.describe()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,id,income_cat
count,16512.0,16512.0,16512.0,16512.0,16512.0,16354.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,10332.352108,-119.575834,35.639577,28.653101,2622.728319,534.97389,1419.790819,497.06038,3.875589,119540.194362,3.006541
std,5979.473431,2.00186,2.138058,12.574726,2138.458419,412.699041,1115.686241,375.720845,1.90495,1999.883823,1.054602
min,0.0,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999,114275.81,1.0
25%,5157.75,-121.8,33.94,18.0,1443.0,295.0,784.0,279.0,2.566775,117976.2375,2.0
50%,10341.0,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409,118475.925,3.0
75%,15522.5,-118.01,37.72,37.0,3141.0,644.0,1719.25,602.0,4.744475,121762.68,4.0
max,20638.0,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001,124309.46,5.0


In [4]:
# Display the labels (the median_house_value column) to check the split.
housing_labels

0        286600.0
1        340600.0
2        196900.0
3         46300.0
4        254500.0
           ...   
16507    240200.0
16508    113000.0
16509     97800.0
16510    225900.0
16511    500001.0
Name: median_house_value, Length: 16512, dtype: float64

## Data Cleaning

Most ML algorithms can't work with missing features, so we need functions that take care of them.<br/>
The total_bedrooms attribute has some missing values, so we have to handle them in one of three ways:<br/>
Option 1: get rid of the districts with missing values.<br/>
Option 2: get rid of the whole attribute.<br/>
Option 3: set the missing values to some value.<br/>

In [5]:
# Option 1: drop the districts (rows) whose total_bedrooms is missing.
option1 = housing.dropna(subset=["total_bedrooms"])

# Option 2: drop the total_bedrooms attribute (column) altogether.
option2 = housing.drop(columns=["total_bedrooms"])

# Option 3: replace the missing values with the median of the attribute.
# The median is kept in its own variable so the same value can be reused later.
median = housing["total_bedrooms"].median()
option3 = housing.copy()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: the in-place call on option3["total_bedrooms"] is a
# chained assignment, which recent pandas 2.x warns about and which no longer
# updates option3 when Copy-on-Write is enabled.
option3["total_bedrooms"] = option3["total_bedrooms"].fillna(median)

In [6]:
option1.describe()
# Notice that the count of every attribute is now 16354, the same as total_bedrooms:
# the districts with a missing total_bedrooms were removed.

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,id,income_cat
count,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0,16354.0
mean,10331.675859,-119.575471,35.639354,28.641556,2624.246117,534.97389,1419.15886,496.999266,3.876957,119539.83209,3.007399
std,5977.284822,2.001732,2.138251,12.58104,2141.933421,412.699041,1115.860053,375.485182,1.904516,1999.755045,1.054792
min,0.0,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999,114275.81,1.0
25%,5159.5,-121.8,33.94,18.0,1445.0,295.0,784.0,279.0,2.567,117976.1925,2.0
50%,10348.5,-118.51,34.26,29.0,2120.0,433.0,1164.0,408.0,3.5439,118475.875,3.0
75%,15515.75,-118.01,37.72,37.0,3139.75,644.0,1716.0,602.0,4.74715,121762.675,4.0
max,20638.0,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001,124309.46,5.0


In [7]:
option2.describe()
# Notice that total_bedrooms no longer exists; the other attributes are unchanged.

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,id,income_cat
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,10332.352108,-119.575834,35.639577,28.653101,2622.728319,1419.790819,497.06038,3.875589,119540.194362,3.006541
std,5979.473431,2.00186,2.138058,12.574726,2138.458419,1115.686241,375.720845,1.90495,1999.883823,1.054602
min,0.0,-124.35,32.54,1.0,6.0,3.0,2.0,0.4999,114275.81,1.0
25%,5157.75,-121.8,33.94,18.0,1443.0,784.0,279.0,2.566775,117976.2375,2.0
50%,10341.0,-118.51,34.26,29.0,2119.5,1164.0,408.0,3.5409,118475.925,3.0
75%,15522.5,-118.01,37.72,37.0,3141.0,1719.25,602.0,4.744475,121762.68,4.0
max,20638.0,-114.31,41.95,52.0,39320.0,35682.0,5358.0,15.0001,124309.46,5.0


In [8]:
option3.describe()
# Notice that the count of total_bedrooms has increased to 16512, like the rest of the attributes.
# The median (50%) of total_bedrooms is unchanged compared to housing, while its mean and std shift slightly.

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,id,income_cat
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,10332.352108,-119.575834,35.639577,28.653101,2622.728319,533.998123,1419.790819,497.06038,3.875589,119540.194362,3.006541
std,5979.473431,2.00186,2.138058,12.574726,2138.458419,410.839621,1115.686241,375.720845,1.90495,1999.883823,1.054602
min,0.0,-124.35,32.54,1.0,6.0,2.0,3.0,2.0,0.4999,114275.81,1.0
25%,5157.75,-121.8,33.94,18.0,1443.0,296.0,784.0,279.0,2.566775,117976.2375,2.0
50%,10341.0,-118.51,34.26,29.0,2119.5,433.0,1164.0,408.0,3.5409,118475.925,3.0
75%,15522.5,-118.01,37.72,37.0,3141.0,641.0,1719.25,602.0,4.744475,121762.68,4.0
max,20638.0,-114.31,41.95,52.0,39320.0,6210.0,35682.0,5358.0,15.0001,124309.46,5.0


### Built in functions
Scikit-learn provides a SimpleImputer to take care of missing values. You can choose which strategy you want to use. In this example we will use the median strategy.<br/>
The imputer will compute the median value for each attribute and replace any missing value with it. <br/>

It is very important to store the median values used so that we can apply the same replacement to new training data and to the test data. The SimpleImputer stores them in its `statistics_` instance variable.

In [9]:
# Create an imputer that replaces each missing value with the median of its attribute.
imputer = SimpleImputer(strategy="median")

In [10]:
# The median strategy needs numeric data, so fit the imputer on the predictors
# without the non-numeric attribute ocean_proximity.
numeric_housing = housing.drop(columns=["ocean_proximity"])
imputer.fit(numeric_housing)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [12]:
# The medians learned by fit(), one per attribute of numeric_housing.
imputer.statistics_

array([ 1.03410000e+04, -1.18510000e+02,  3.42600000e+01,  2.90000000e+01,
        2.11950000e+03,  4.33000000e+02,  1.16400000e+03,  4.08000000e+02,
        3.54090000e+00,  1.18475925e+05,  3.00000000e+00])

In [14]:
# Sanity check: the medians computed directly with pandas match imputer.statistics_.
numeric_housing.median().values

array([ 1.03410000e+04, -1.18510000e+02,  3.42600000e+01,  2.90000000e+01,
        2.11950000e+03,  4.33000000e+02,  1.16400000e+03,  4.08000000e+02,
        3.54090000e+00,  1.18475925e+05,  3.00000000e+00])