## Chapter 2 - End-to-End Machine Learning Project

## Exercises (2)

In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, GridSearchCV

### Ingestion

In [2]:
# Load the engineered feature matrix and the target column from their CSV files,
# then join them side by side (index-aligned) into a single working frame.
df_features, df_result = (
    pd.read_csv(csv_name)
    for csv_name in ('housing_X_feateng_complete.csv', 'housing_y_feateng_complete.csv')
)
df = df_features.join(df_result)

In [3]:
# For testing
# display(df.head())

### Train-Test Split

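The data is split with a stratified sampling strategy: rows are stratified on a binned version of `median_income`, so the train and test sets keep the same income-category proportions.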

In [4]:
# Obtain the column to stratify on: bucket median income into bins of width 1.5
# and merge every bucket above 5 into category 5.0.
income_category = np.ceil(df['median_income'] / 1.5)
# Vectorised cap: values <= 5.0 are kept, everything else becomes 5.0
# (same result as the former row-wise `apply`, without a Python-level loop).
df['p1_median_income_category'] = income_category.where(income_category <= 5.0, 5.0)

# Train Test Split - Stratified strategy
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional row indices.
df_splits = next(shuffle_split.split(df, df['p1_median_income_category']))
train_idx, test_idx = df_splits

# Select the rows and remove the helper column used for stratification.
# `drop` returns a new frame, so nothing is mutated in place and `_` (IPython's
# last-output variable) is no longer overwritten with None.
df_strat_train = df.iloc[train_idx].drop(columns='p1_median_income_category')
df_strat_test = df.iloc[test_idx].drop(columns='p1_median_income_category')

# Sanity check: the two partitions together cover every row exactly once.
assert len(df_strat_train) + len(df_strat_test) == len(df)

# X_train, X_test, y_train, y_test
X_train = df_strat_train.drop(columns='median_house_value')
y_train = df_strat_train['median_house_value'].copy()
X_test = df_strat_test.drop(columns='median_house_value')
y_test = df_strat_test['median_house_value'].copy()

In [5]:
# For testing
# display(X_train.describe())
# display(X_test.describe())

### Modelling - Random Forest Regressor

In [6]:
# Random forest with default hyper-parameters; the fixed seed makes the fit
# reproducible. The bare `model3` on the last line displays the estimator.
model3 = RandomForestRegressor(random_state=0).fit(X_train, y_train)
model3

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

3 - Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [7]:
# Importance score of each feature as learned by the fitted forest
# (one entry per column of X_train, in column order).
model3.feature_importances_

array([5.95863878e-02, 5.75564317e-02, 4.38695839e-02, 1.26693463e-02,
       1.21465523e-02, 1.20689712e-02, 1.10389932e-02, 4.71259499e-01,
       2.63220579e-02, 1.22456634e-01, 2.35721940e-02, 1.19636670e-03,
       1.42861849e-01, 1.76148429e-04, 7.15232352e-04, 2.50375204e-03])

In [8]:
class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the largest feature importances.

    Parameters
    ----------
    feature_importances : array-like of shape (n_features,)
        One importance score per input column, in column order.
    k : int
        Number of top-scoring columns to keep, with ``0 <= k <= n_features``.

    Attributes
    ----------
    feature_indices_ : ndarray of shape (k,)
        Column positions chosen by ``fit``, in ascending order.
    """

    def __init__(self, feature_importances, k):
        # Store the arguments unmodified under their own names: BaseEstimator's
        # get_params()/set_params() (and therefore clone, Pipeline and
        # GridSearchCV) read constructor arguments back via attributes of the
        # same name.
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        """Compute the positions of the ``k`` most important columns.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility only;
        the selection depends solely on ``feature_importances`` and ``k``.

        Raises
        ------
        ValueError
            If ``feature_importances`` is not one-dimensional or ``k`` is
            outside ``[0, n_features]``.
        """
        importances = np.asarray(self.feature_importances)
        if importances.ndim != 1:
            raise ValueError(
                "feature_importances must be one-dimensional, got shape %r"
                % (importances.shape,)
            )
        n_features = importances.shape[0]
        if not 0 <= self.k <= n_features:
            raise ValueError(
                "k must be between 0 and %d, got %r" % (n_features, self.k)
            )

        if self.k == 0:
            # Guard: `[-0:]` is the whole array, so k=0 would otherwise select
            # every column instead of none.
            self.feature_indices_ = np.array([], dtype=np.intp)
        else:
            # argpartition moves the k largest scores into the last k slots
            # (in arbitrary order); sorting restores original column order.
            self.feature_indices_ = np.sort(
                np.argpartition(importances, -self.k)[-self.k:]
            )
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Pandas objects are sliced positionally with ``.iloc``; anything else
        is sliced with ``X[:, indices]`` as before.
        """
        if hasattr(X, 'iloc'):
            return X.iloc[:, self.feature_indices_]
        return X[:, self.feature_indices_]

In [9]:
# Plain-text print of the same importance scores, for reference when picking top_k.
print(model3.feature_importances_)

[5.95863878e-02 5.75564317e-02 4.38695839e-02 1.26693463e-02
 1.21465523e-02 1.20689712e-02 1.10389932e-02 4.71259499e-01
 2.63220579e-02 1.22456634e-01 2.35721940e-02 1.19636670e-03
 1.42861849e-01 1.76148429e-04 7.15232352e-04 2.50375204e-03]


In [10]:
top_k = 5
# Positions of the top_k largest importances: argpartition places them in the
# last top_k slots (unordered); np.sort then puts the positions in ascending order.
np.sort(np.array(model3.feature_importances_).argpartition(-top_k)[-top_k:])

array([ 0,  1,  7,  9, 12])

In [11]:
# Column names of the training frame, in column order.
features = list(X_train.columns)
# Ascending positions of the top_k most important features, as a plain list.
indices = list(
    np.sort(np.array(model3.feature_importances_).argpartition(-top_k)[-top_k:])
)

In [14]:
# Pair each selected feature name with its importance score.
feat_final = [
    (features[position], model3.feature_importances_[position])
    for position in indices
]
print(feat_final)

[('longitude', 0.05958638784843237), ('latitude', 0.05755643172239503), ('median_income', 0.47125949943545237), ('population_per_household', 0.12245663426509462), ('INLAND', 0.14286184852375705)]


In [17]:
# Show the selected (name, importance) pairs from most to least important,
# one pair per line.
ranked_features = sorted(feat_final, key=lambda pair: pair[1], reverse=True)
print(*ranked_features, sep='\n')

('median_income', 0.47125949943545237)
('INLAND', 0.14286184852375705)
('population_per_household', 0.12245663426509462)
('longitude', 0.05958638784843237)
('latitude', 0.05755643172239503)


**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)