In [1]:
import os
import tarfile
from six.moves import urllib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as spy
import sklearn as skl

## Reading data 

In [2]:
# Locate the housing dataset relative to the notebook's working directory:
# the CSV files are expected under <parent of cwd>/CH02/datasets/housing.
cwd = os.getcwd()
BASE_DIR = os.path.dirname(cwd)
# Pass every path component separately so os.path.join inserts the separator
# of the current OS (a hard-coded '\\' only resolves correctly on Windows).
TRAIN_TEST_DIR = os.path.join(BASE_DIR, 'CH02', 'datasets', 'housing')

In [3]:
# Full paths of the stratified train / test CSV files inside TRAIN_TEST_DIR.
TRAIN_PATH, TEST_PATH = (
    os.path.join(TRAIN_TEST_DIR, csv_name)
    for csv_name in ('strat-train-set.csv', 'strat-test-set.csv')
)

In [4]:
# Load both stratified splits from disk; the two reads are independent.
strat_train_set = pd.read_csv(TRAIN_PATH)
strat_test_set = pd.read_csv(TEST_PATH)

In [5]:
# Display the loaded training set.
strat_train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,286600.0,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,340600.0,<1H OCEAN
2,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,196900.0,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,46300.0,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,254500.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
16507,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,240200.0,INLAND
16508,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,113000.0,INLAND
16509,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,97800.0,INLAND
16510,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,225900.0,<1H OCEAN


## Preparing data for ML

We will do this using functions and classes so that the code can be reused, not just in this project but in future ones too. What we need to do now is transform the data a little without affecting the labels (median_house_value), so we create a copy of the training set without them and keep the labels separately.

In [6]:
# Separate the target from the predictors so that the transformations below
# never touch the labels. Both results are copies of the training-set data.
housing_labels = strat_train_set['median_house_value'].copy()
housing = strat_train_set.drop(columns='median_house_value')

In [8]:
# Display the label Series (the median_house_value column).
housing_labels

0        286600.0
1        340600.0
2        196900.0
3         46300.0
4        254500.0
           ...   
16507    240200.0
16508    113000.0
16509     97800.0
16510    225900.0
16511    500001.0
Name: median_house_value, Length: 16512, dtype: float64

### Data cleaning

We need to get rid of the NaN values. The most commonly recommended option is to compute the median and fill the blanks with it.

In [7]:
from sklearn.impute import SimpleImputer as Imputer

# Imputer configured with the 'median' strategy; it is fitted in a later cell.
imputer = Imputer(strategy = 'median')

In [8]:
# Keep only the numerical attributes: ocean_proximity is text, so a median
# cannot be computed for it.
housing_num = housing.drop(columns='ocean_proximity')

In [9]:
# Fit the imputer on the numerical attributes only (housing_num has no text column).
imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [10]:
# Medians of every numerical column, computed directly with pandas.
housing_num.median()

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2119.5000
total_bedrooms         433.0000
population            1164.0000
households             408.0000
median_income            3.5409
dtype: float64

X will be a matrix (the housing DataFrame as an array) with the blanks already filled in. Every row has length 8: we removed the price label and the ocean proximity, so 8 attributes remain.

In [11]:
# Apply the fitted imputer to housing_num; the result is an array rather than
# a DataFrame (it is wrapped back into one in the next cell).
X = imputer.transform(housing_num)

Now we turn it back into a DataFrame.

In [12]:
# Rebuild a DataFrame from the imputed array. Reusing the original column
# labels AND row index keeps housing_tr aligned with housing / housing_labels
# even when the index is not the default 0..n-1 range.
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

In [13]:
# Frequency of each total_bedrooms value after imputation.
housing_tr['total_bedrooms'].value_counts()

433.0     181
272.0      44
280.0      44
393.0      43
331.0      41
         ... 
1710.0      1
1576.0      1
1288.0      1
1819.0      1
1351.0      1
Name: total_bedrooms, Length: 1810, dtype: int64

Now 181 districts have the median value in the total_bedrooms column. We can verify this by adding the number of districts that were blank to the number that already had this value before imputation: the sum should equal the number of districts that have it now.

In [14]:
# Count the rows of the original (pre-imputation) frame whose total_bedrooms is NaN.
print("# of districts in blanks: ", 
      housing['total_bedrooms'].isna().sum())

# of districts in blanks:  158


In [15]:
# Count the districts whose total_bedrooms already equalled the median before
# imputation. The median is computed from the data rather than hard-coded, and
# the boolean sum simply yields 0 (instead of raising KeyError) when no
# district holds exactly that value.
print("# of districts that had the median value: ",
      (housing['total_bedrooms'] == housing['total_bedrooms'].median()).sum())

# of districts that had the median value:  23


In [16]:
# Show the value frequencies again: the first entry is the median value, whose
# count is the sum of the two numbers printed in the previous cells.
print("At the top, 181, which is 23 + 158, ",
      "the # of districts with the median now")
housing_tr['total_bedrooms'].value_counts()

At the top, 181, which is 23 + 158,  the # of districts with the median now


433.0     181
272.0      44
280.0      44
393.0      43
331.0      41
         ... 
1710.0      1
1576.0      1
1288.0      1
1819.0      1
1351.0      1
Name: total_bedrooms, Length: 1810, dtype: int64

### Handling Text and Categorical Attributes

In [17]:
# First rows of the predictors, including the text column ocean_proximity.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN
2,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN


Here we encode the ocean_proximity, which is a text attribute, into numbers. 

In [18]:
from sklearn.preprocessing import LabelEncoder

# Map every ocean_proximity category to an integer code.
encoder = LabelEncoder()
housing_cat = housing['ocean_proximity']
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

array([0, 0, 4, ..., 1, 0, 3])

In [19]:
# Categories learned by the encoder; a category's position in this array is
# its integer code.
encoder.classes_

array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
      dtype=object)

But the model will assume a stronger relationship between 0 and 1 than between 0 and 4, even though the categories have no such ordering. This is bad, so we'll encode the attribute using the OneHotEncoder instead.

In [20]:
from sklearn.preprocessing import OneHotEncoder

# NOTE: `encoder` is rebound here and no longer refers to the LabelEncoder above.
encoder = OneHotEncoder()
# The integer codes are a 1D array; reshape(-1, 1) turns them into a single
# column before they are passed to fit_transform.
housing_cat_1hot = encoder.fit_transform(
    housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [21]:
# Convert the sparse result to a dense array for display.
housing_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

But to be more efficient, we can do these two transformations in a single step using the LabelBinarizer estimator.

In [22]:
from sklearn.preprocessing import LabelBinarizer

# `encoder` is rebound once more, this time to a LabelBinarizer that is fitted
# directly on the text categories.
# NOTE(review): scikit-learn documents LabelBinarizer as a transformer for
# target labels — confirm it is acceptable for an input feature here.
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

### Custom Transformers

The code below is equivalent to creating new columns with the attributes that you want. In this case, we add the combined attributes we saw earlier: rooms per household, population per household and, optionally, bedrooms per room. The advantage of automating this via a class is that its constructor takes its own parameters (hyperparameters), which is faster than traditional pandas coding!

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional column indices of the attributes that get combined below.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes as extra columns on the right of an array.

    Added columns, in order: rooms per household, population per household
    and, when ``add_bedrooms_per_room`` is true, bedrooms per room.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the combined attributes appended column-wise.

        ``X`` is indexed as ``X[:, i]``, so it must support 2D positional
        indexing.
        """
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ receives the same tuple of operands as the literal
        # np.c_[X, a, b, ...] form would.
        return np.c_[(X, *extra_columns)]

# Add the combined attributes without the bedrooms_per_room column.
# fit() does nothing for this transformer, so transform() is called directly.
# NOTE(review): housing still contains the text column ocean_proximity, so
# housing.values is presumably an object-dtype array — confirm this is intended.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [24]:
# housing itself is unchanged: the extra attributes only exist in
# housing_extra_attribs.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN
2,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN


### Pipelines 

A pipeline is used to define which transformations are applied and in what order. Recall that housing_num is the training data without the label (house price) and without the ocean proximity (since it's text).

In [25]:
# First rows of the numerical-only frame that is fed to the pipeline below.
housing_num.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042
1,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214
2,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621
3,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839
4,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numerical preprocessing; the steps run in the order listed:
# median imputation -> combined attributes -> standardization.
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
 ])

In [27]:
# Fit every step on housing_num and keep the transformed data.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [28]:
# Wrap the transformed array in a DataFrame for display: the 8 original
# attributes plus the 3 combined ones give 11 columns (labelled 0-10).
pd.DataFrame(housing_num_tr)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.156043,0.771950,0.743331,-0.493234,-0.445438,-0.636211,-0.420698,-0.614937,-0.312055,-0.086499,0.155318
1,-1.176025,0.659695,-1.165317,-0.908967,-1.036928,-0.998331,-1.022227,1.336459,0.217683,-0.033534,-0.836289
2,1.186849,-1.342183,0.186642,-0.313660,-0.153345,-0.433639,-0.093318,-0.532046,-0.465315,-0.092405,0.422200
3,-0.017068,0.313576,-0.290520,-0.362762,-0.396756,0.036041,-0.383436,-1.045566,-0.079661,0.089736,-0.196453
4,0.492474,-0.659299,-0.926736,1.856193,2.412211,2.724154,2.570975,-0.441437,-0.357834,-0.004194,0.269928
...,...,...,...,...,...,...,...,...,...,...,...
16507,0.722267,-0.673331,1.379547,-0.632123,-0.725361,-0.759010,-0.764049,0.554158,0.234352,-0.031755,-0.428853
16508,1.007011,-0.823004,0.902385,-0.667196,-0.584183,-0.329664,-0.636291,-0.948815,-0.308114,0.084689,0.491503
16509,1.586489,-0.724781,-1.562952,1.043901,0.822735,0.607904,0.713156,-0.316705,0.346934,-0.030554,-0.521776
16510,0.782213,-0.851068,0.186642,-0.309919,-0.374849,-0.057178,-0.375451,0.098121,0.024995,0.061509,-0.303407


So far we have: 
- Worked with the training data only (using stratified sampling).
- Gained insights in the data (exploring it) and created some figures for this. We also studied the correlations and detected some quirks.  
- Combined attributes to get better ones. This was done manually using traditional pandas methods. 

After that we started to prepare data for ML:
- We removed the labels to work with the rest of the attributes. We also separated out the categorical (text) data so that the numerical steps only see numerical values.
- We cleaned the data and filled the blanks using ready-made classes and methods (Imputer, etc.), in order to be more effective and to write reusable code for practical solutions.
- We handled categorical data using LabelBinarizer. 
- We also created transformers and our own classes (CombinedAttributesAdder, etc.).
- Instead of passing the data through the estimators/transformers one by one, we decided to use Pipeline, which does this really well, whether with ready-made estimators/transformers or with the ones we created.
- We also standardized the data so that the ML algorithm performs better on it.

In [29]:
from sklearn.base import BaseEstimator , TransformerMixin

class DataFrameSelector(BaseEstimator,TransformerMixin):
    """Select DataFrame columns by label and hand them on as an array.

    attribute_names: the column label(s) used to index the incoming frame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, X):
        """Return the ``.values`` of ``X`` restricted to ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected.values

Here we simply process the numerical and the categorical data separately. Together they make up all our data except the labels, which was our initial intention. With the numerical data we create the combined attributes and standardize everything at the end. In another pipeline, the categorical data is encoded using one-hot encoding. Finally, we join both results into a single matrix.

In [30]:
from sklearn.pipeline import FeatureUnion

# Column names handled by each branch: list(housing_num) yields the column
# labels of the numerical frame; the categorical branch gets the text column.
num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']


def _dense_one_hot_encoder():
    """Return a OneHotEncoder that outputs a dense array on any scikit-learn version.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new spelling is tried first and the old
    one is used only when the constructor rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
        return OneHotEncoder(sparse=False)


# Numerical branch: select columns -> impute medians -> add combined
# attributes -> standardize.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
# Categorical branch: select the text column -> one-hot encode it (dense).
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('one_hot_encoder', _dense_one_hot_encoder()),
])
# Run both branches on the same input and concatenate their outputs.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [31]:
# Fit both sub-pipelines on the full predictors frame and keep the combined
# output.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [52]:
# Wrap the prepared array in a DataFrame for display.
# NOTE: this rebinds housing_prepared from the pipeline's array output to a
# DataFrame, so later cells see a DataFrame.
housing_prepared = pd.DataFrame(housing_prepared)
housing_prepared

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-1.156043,0.771950,0.743331,-0.493234,-0.445438,-0.636211,-0.420698,-0.614937,-0.312055,-0.086499,0.155318,1.0,0.0,0.0,0.0,0.0
1,-1.176025,0.659695,-1.165317,-0.908967,-1.036928,-0.998331,-1.022227,1.336459,0.217683,-0.033534,-0.836289,1.0,0.0,0.0,0.0,0.0
2,1.186849,-1.342183,0.186642,-0.313660,-0.153345,-0.433639,-0.093318,-0.532046,-0.465315,-0.092405,0.422200,0.0,0.0,0.0,0.0,1.0
3,-0.017068,0.313576,-0.290520,-0.362762,-0.396756,0.036041,-0.383436,-1.045566,-0.079661,0.089736,-0.196453,0.0,1.0,0.0,0.0,0.0
4,0.492474,-0.659299,-0.926736,1.856193,2.412211,2.724154,2.570975,-0.441437,-0.357834,-0.004194,0.269928,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.722267,-0.673331,1.379547,-0.632123,-0.725361,-0.759010,-0.764049,0.554158,0.234352,-0.031755,-0.428853,0.0,1.0,0.0,0.0,0.0
16508,1.007011,-0.823004,0.902385,-0.667196,-0.584183,-0.329664,-0.636291,-0.948815,-0.308114,0.084689,0.491503,0.0,1.0,0.0,0.0,0.0
16509,1.586489,-0.724781,-1.562952,1.043901,0.822735,0.607904,0.713156,-0.316705,0.346934,-0.030554,-0.521776,0.0,1.0,0.0,0.0,0.0
16510,0.782213,-0.851068,0.186642,-0.309919,-0.374849,-0.057178,-0.375451,0.098121,0.024995,0.061509,-0.303407,1.0,0.0,0.0,0.0,0.0
