## Chapter 2 -  End-to-End Machine Learning Project
## Feature Engineering 

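Input: Raw data (`housing.csv`)

Output: Data after preprocessing & feature engineering (`housing_X_feateng_complete.csv` for the features, `housing_y_feateng_complete.csv` for the target)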

In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder, LabelBinarizer, StandardScaler)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

### Ingest

In [2]:
# Load the raw housing table and report its dimensions.
df = pd.read_csv('housing.csv')
print(df.shape)

# Separate the prediction target from the predictors.
df_target = df[['median_house_value']].copy()
df_features = df.drop(columns=['median_house_value']).copy()

# Split the predictors by kind; the column-name lists are what the
# ColumnTransformer uses to route columns to each sub-pipeline.
df_categoricalonly = df_features[['ocean_proximity']].copy()
df_numericonly = df_features.drop(columns=['ocean_proximity']).copy()
CATEGORICAL_COLUMNS = df_categoricalonly.columns.tolist()
NUMERIC_COLUMNS = df_numericonly.columns.tolist()

(20640, 10)


In [3]:
# Quick visual sanity check of the freshly loaded frame (first rows only)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Feature Engineering Pipeline

In [4]:
# Prepare pipelines

class MyLabelBinarizer(TransformerMixin):
    '''
        Pipeline-friendly wrapper around LabelBinarizer.

        Pipeline steps are called as fit(X, y) / transform(X), whereas the vanilla
        LabelBinarizer methods take only the labels. This wrapper accepts the extra
        ``y`` argument and ignores it, delegating everything else to the wrapped
        encoder.

        All positional / keyword arguments given to the constructor are forwarded
        unchanged to LabelBinarizer. The wrapped instance is available as
        ``self.encoder``.
    '''

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        '''Fit the wrapped LabelBinarizer on ``x``; ``y`` is accepted but ignored.'''
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        '''Return the binarized encoding of ``x``; ``y`` is accepted but ignored.'''
        return self.encoder.transform(x)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    '''
        Append per-household / per-room ratio features to a 2-D array.

        Parameters
        ----------
        add_bedrooms_per_room : bool, default=True
            When True, a third engineered column (bedrooms / rooms) is appended
            after rooms_per_household and population_per_household. Exposed as a
            constructor argument so it is visible to get_params / set_params.

        The input columns are addressed by position (see the ``_*_ix`` constants),
        so the caller must supply them in the matching order.
    '''
    # Positions of the source columns in the incoming array.
    _rooms_ix, _bedrooms_ix, _population_ix, _household_ix = 3, 4, 5, 6

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        '''Stateless transformer: nothing is learned, returns self.'''
        return self

    def transform(self, X, y=None):
        '''Return ``X`` with the ratio columns appended on the right.'''
        # Positional slicing below needs an ndarray; this also lets callers pass a DataFrame.
        X = np.asarray(X)
        rooms_per_household = X[:, self._rooms_ix] / X[:, self._household_ix]
        population_per_household = X[:, self._population_ix] / X[:, self._household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, self._bedrooms_ix] / X[:, self._rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

# Numeric branch: median-impute, append the engineered ratio columns, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]
)

# Route numeric columns through the branch above and one-hot encode the categorical ones.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num_pipeline', numerical_pipeline, NUMERIC_COLUMNS),
        ('one_hot_encoder', OneHotEncoder(), CATEGORICAL_COLUMNS),
    ]
)

In [5]:
# Test pipelines

# print(df_numericonly.head(3).shape)
# numerical_smoketest = numerical_pipeline.fit_transform(df_numericonly.head(3))
# print(numerical_smoketest)
# print(numerical_smoketest.shape)
# print()

# print(df.head(3).shape)
# full_smoketest = full_pipeline.fit_transform(df.head(3))
# print(full_smoketest)
# print(full_smoketest.shape) # only 1 category so only 12 columns
# print()

# print(np.array(df.head(3).values))
# values_out = full_pipeline.fit_transform(df.head(3))
# print(values_out[:3])
# print(values_out.shape)

In [6]:
# Labels for the transformed matrix, in output order: the original numeric columns,
# the ratio columns appended by CombinedAttributesAdder, then the one-hot categories.
# NOTE: the one-hot labels are hard-coded and assumed to match the encoder's category order.
columns_out = (list(df_numericonly) +
               ['rooms_per_household', 'population_per_household', 'bedrooms_per_room'] +
               ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'])
print(columns_out)
print(len(columns_out))

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'rooms per_household', 'population_per_household', 'bedrooms_per_room', '<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']
16


In [7]:
# Fit every pipeline step on the full frame and transform it in one pass.
values_out = full_pipeline.fit_transform(df)

# Show the resulting dimensions followed by the first three transformed rows.
for preview in (values_out.shape, values_out[:3]):
    print(preview)

(20640, 16)
[[-1.32783522  1.05254828  0.98214266 -0.8048191  -0.97247648 -0.9744286
  -0.97703285  2.34476576  0.62855945 -0.04959654 -1.02998783  0.
   0.          0.          1.          0.        ]
 [-1.32284391  1.04318455 -0.60701891  2.0458901   1.35714343  0.86143887
   1.66996103  2.33223796  0.32704136 -0.09251223 -0.8888972   0.
   0.          0.          1.          0.        ]
 [-1.33282653  1.03850269  1.85618152 -0.53574589 -0.82702426 -0.82077735
  -0.84363692  1.7826994   1.15562047 -0.02584253 -1.29168566  0.
   0.          0.          1.          0.        ]]


In [8]:
# The one-hot labels in `columns_out` are hard-coded, so confirm they agree with the
# categories the fitted encoder actually produced before attaching them; a mismatch
# would otherwise silently mislabel the indicator columns.
fitted_categories = list(full_pipeline.named_transformers_['one_hot_encoder'].categories_[0])
assert columns_out[-len(fitted_categories):] == fitted_categories, (
    'columns_out one-hot labels do not match the fitted encoder categories: '
    f'{fitted_categories}'
)
assert values_out.shape[1] == len(columns_out), 'column label count does not match pipeline output'

df_out = pd.DataFrame(values_out, columns=columns_out)
display(df_out.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.628559,-0.049597,-1.029988,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,0.327041,-0.092512,-0.888897,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.15562,-0.025843,-1.291686,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.156966,-0.050329,-0.449613,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,0.344711,-0.085616,-0.639087,0.0,0.0,0.0,1.0,0.0


In [10]:
# Persist the engineered features and the untouched target side by side (no index column).
for frame, csv_path in (
    (df_out, 'housing_X_feateng_complete.csv'),
    (df_target, 'housing_y_feateng_complete.csv'),
):
    frame.to_csv(csv_path, index=False)

$\diamondsuit$

## Appendices

### Feature Engineering - Imputation

In [None]:
# Median-impute the numeric columns and rebuild a labelled frame from the result.
imputer1 = SimpleImputer(strategy='median')
imputed_values = imputer1.fit_transform(df_numericonly)
df_imputer1 = pd.DataFrame(imputed_values, columns=df_numericonly.columns)

# Summary statistics before and after imputation, for comparison.
display(df_numericonly.describe(), df_imputer1.describe())

### Feature Engineering - Categorical

In [None]:
encoder1 = LabelEncoder()

# LabelEncoder expects a 1-D array of labels. Passing the single-column DataFrame
# (2-D) triggers a DataConversionWarning, so hand it the column as a Series instead.
df_encoder1 = encoder1.fit_transform(df_categoricalonly['ocean_proximity'])
print(encoder1.classes_)

print(len(df_encoder1))
print(df_encoder1[:5])

In [None]:
# One-hot encode the integer codes; the encoder needs a 2-D column, hence the reshape.
encoder2 = OneHotEncoder()
df_encoder2 = encoder2.fit_transform(df_encoder1.reshape(-1,1))

print(df_encoder2.shape)
# The result is sparse. `.toarray()` densifies both sparse matrices and sparse arrays,
# whereas the `.A` shorthand is not available on SciPy sparse arrays.
print(df_encoder2.toarray()[:5])

In [None]:
# Binarize the categorical column in a single step via the pipeline-friendly wrapper.
encoder3 = MyLabelBinarizer()
df_encoder3 = encoder3.fit_transform(df_categoricalonly)

# Preview the first five encoded rows.
first_encoded_rows = df_encoder3[:5]
print(first_encoded_rows)

### Feature Engineering - Scaling

In [None]:
scaler = StandardScaler()

# Work on a separate frame so the raw `df` is not mutated: adding a column to `df`
# here would leak into any cell run afterwards and make the notebook order-dependent.
df_households_scaled = df[['households']].copy()
df_households_scaled['households_scale'] = scaler.fit_transform(df[['households']]).ravel()
print(df_households_scaled[['households_scale', 'households']].describe())

### Feature Engineering - Combination

In [None]:
# Run the ratio-feature transformer on its own, outside any pipeline.
attr_adder = CombinedAttributesAdder()
raw_matrix = df.values  # whole frame as an array; columns are addressed by position
housing_extra_attribs = attr_adder.transform(raw_matrix)

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)