## Chapter 2 -  End-to-End Machine Learning Project

## Machine Learning

In the *housing* problem, the problem is framed as a <u>supervised learning problem</u> as there is labelled data. Since a numerical value is expected from the outcome, this is a <u>regression</u> problem. Finally, since the whole dataset will be used to train the model, this is a <u>batch learning</u> problem.

For a regression problem, the performance measure is Root Mean Square Error or RMSE. Given $m$ training samples, feature matrix $\mathbf X$, hypothesis $h$, then the RMSE is $$\text{RMSE} (\mathbf X , h) = \sqrt{\frac 1 m \sum_{i=1}^m \begin{bmatrix}h(\mathbf x^{(i)} - y^{(i)})\end{bmatrix}^2}$$

Note that an alternative performance measure for regression is mean absolute error or MAE, where $$\text{MAE} (\mathbf X , h) = \frac 1 m \sum_{i=1}^m|h(\mathbf x^{(i)} - y^{(i)})|$$

In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV

### Ingestion

In [2]:
df_features = pd.read_csv('housing_X_feateng_complete.csv')
df_result = pd.read_csv('housing_y_feateng_complete.csv')
df = df_features.join(df_result)

In [3]:
# For testing
display(df.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,median_house_value
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.628559,-0.049597,-1.029988,0.0,0.0,0.0,1.0,0.0,452600.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,0.327041,-0.092512,-0.888897,0.0,0.0,0.0,1.0,0.0,358500.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.15562,-0.025843,-1.291686,0.0,0.0,0.0,1.0,0.0,352100.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.156966,-0.050329,-0.449613,0.0,0.0,0.0,1.0,0.0,341300.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,0.344711,-0.085616,-0.639087,0.0,0.0,0.0,1.0,0.0,342200.0


### Train-Test Split

Using Stratified Sampling strategy

In [4]:
# Obtain the column to statify on
df['p1_median_income_category'] = np.ceil(df['median_income']/1.5)
df['p1_median_income_category'] = df['p1_median_income_category'].apply(lambda x: x if x<=5.0 else 5.0)

# Train Test Split - Stratified strategy
shuffle_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
df_splits = list(shuffle_split.split(df, df['p1_median_income_category']))[0]
df_strat_train = df.iloc[df_splits[0]].copy()
df_strat_test = df.iloc[df_splits[1]].copy()

# Remove column to stratify on
_ = df_strat_train.drop('p1_median_income_category', axis=1, inplace=True)
_ = df_strat_test.drop('p1_median_income_category', axis=1, inplace=True)

# X_train, X_test, y_train, y_test
X_train = df_strat_train.drop('median_house_value', axis=1).copy()
y_train = df_strat_train['median_house_value'].copy()
X_test = df_strat_test.drop('median_house_value', axis=1).copy()
y_test = df_strat_test['median_house_value'].copy()

In [5]:
# For testing
display(X_train.describe())
display(X_test.describe())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-0.001442,0.00223,0.002458,-0.003532,-0.004792,-0.004591,-0.004538,0.001456,-0.001727,0.00279,0.000698,0.441679,0.318556,0.000303,0.10992,0.129542
std,1.002354,1.003475,0.999614,0.99367,0.992945,1.002085,0.994333,0.999486,1.008119,1.116353,1.022342,0.496602,0.465931,0.017399,0.3128,0.335809
min,-2.385992,-1.447568,-2.19618,-1.207283,-1.277688,-1.256123,-1.303984,-1.774299,-1.852319,-0.229,-2.707317,0.0,0.0,0.0,0.0,0.0
25%,-1.113209,-0.796789,-0.845393,-0.546289,-0.571887,-0.565575,-0.576845,-0.686882,-0.397628,-0.061888,-0.591556,0.0,0.0,0.0,0.0,0.0
50%,0.538914,-0.642287,0.028646,-0.235273,-0.242831,-0.230015,-0.236816,-0.175979,-0.079958,-0.024428,-0.164955,0.0,0.0,0.0,0.0,0.0
75%,0.778496,0.977638,0.66431,0.230792,0.248368,0.26008,0.26538,0.460925,0.251069,0.020167,0.400664,1.0,1.0,0.0,0.0,0.0
max,2.62528,2.958068,1.856182,16.81558,14.087789,30.25033,14.60152,5.858286,55.163236,119.419103,40.015599,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms per_household,population_per_household,bedrooms_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
count,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0,4128.0
mean,0.005769,-0.00892,-0.009832,0.014127,0.019168,0.018364,0.01815,-0.005824,0.006909,-0.011161,-0.002794,0.446463,0.312742,0.0,0.115068,0.125727
std,0.990752,0.986168,1.001724,1.025047,1.027755,0.991646,1.022393,1.002275,0.967058,0.123169,0.905376,0.497186,0.463666,0.0,0.319142,0.331581
min,-2.361036,-1.442886,-2.19618,-1.201324,-1.272919,-1.251707,-1.298752,-1.774299,-1.56661,-0.190779,-2.21322,0.0,0.0,0.0,0.0,0.0
25%,-1.103226,-0.796789,-0.845393,-0.537121,-0.565329,-0.555861,-0.568998,-0.691593,-0.405664,-0.060465,-0.589007,0.0,0.0,0.0,0.0,0.0
50%,0.533922,-0.642287,0.028646,-0.223126,-0.242831,-0.222067,-0.234201,-0.179953,-0.085268,-0.023762,-0.154334,0.0,0.0,0.0,0.0,0.0
75%,0.778496,0.963593,0.66431,0.25268,0.277578,0.291207,0.299383,0.454806,0.255078,0.021198,0.415131,1.0,1.0,0.0,0.0,0.0
max,2.500497,2.897203,1.856182,13.484939,11.333735,12.434733,11.902214,5.858286,22.789378,4.653407,9.47093,1.0,1.0,0.0,1.0,1.0


### Modelling - Linear Regression

In [6]:
# Train
model1 = LinearRegression()
model1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
joblib.dump(model1, 'chap02_model1.pkl')

['chap02_model1.pkl']

### Model Evaluation - Linear Regression

In [8]:
# Test a small sample
X_test_sample = X_test[:5]
y_test_sample = y_test.iloc[:5]
y_predict_test_sample = model1.predict(X_test_sample)
print("Actual\t\tPredicted")
print("---------------------------")
for ytest, ypredict in zip(y_test_sample, y_predict_test_sample):
    print("{:.2f}\t{:.2f}".format(ytest, ypredict))

Actual		Predicted
---------------------------
121800.00	111700.46
219400.00	176671.21
310000.00	362695.92
157600.00	159785.81
257500.00	219614.98


In [9]:
ytrain_predict = model1.predict(X_train)
training_mse = mean_squared_error(y_train, ytrain_predict)
training_rmse = np.sqrt(training_mse)
print(training_rmse)

68498.1054790028


In [10]:
training_mae = mean_absolute_error(y_train, ytrain_predict)
print(training_mae)

49388.02226716921


### Modelling - Decision Tree Regressor

In [11]:
model2 = DecisionTreeRegressor(random_state=0)
model2.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [12]:
joblib.dump(model2, 'chap02_model2.pkl')

['chap02_model2.pkl']

### Model Evaluation - Decision Tree Regressor

In [13]:
ytrain_predict2 = model2.predict(X_train)
training_mse2 = mean_squared_error(y_train, ytrain_predict2)
training_rmse2 = np.sqrt(training_mse2)
print(training_rmse2)

0.0


### Modelling - Random Forest Regressor

In [14]:
model3 = RandomForestRegressor(random_state=0)
model3.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [15]:
joblib.dump(model3, 'chap02_model3.pkl')

['chap02_model3.pkl']

In [16]:
y_predict3 = model3.predict(X_train)
training_mse3 = mean_squared_error(y_train, y_predict3)
training_rmse3 = np.sqrt(training_mse3)
print(training_rmse3)

18531.48268225328


### Cross Validation

In [17]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean  : ", scores.mean())
    print("SD    : ", scores.std())    

In [18]:
# RMSE for 10-fold cross validation on Linear Regression
scores = cross_val_score(model3, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

Scores:  [50128.63077714 49825.91191266 48340.97328755 49021.60276501
 51167.551463   49693.9308058  47497.07290867 50530.47646797
 52228.19774811 50487.86648286]
Mean  :  49892.22146187593
SD    :  1297.9254283659618


In [19]:
# RMSE for 10-fold cross validation on Decision Tree Regressor
scores = cross_val_score(model2, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

Scores:  [71083.05580918 69947.3226813  70231.93474872 69114.8078499
 72193.57936851 70884.26836269 69528.41007064 71185.59244803
 72080.87635058 69978.17419224]
Mean  :  70622.80218817863
SD    :  984.2046662040034


In [20]:
# RMSE for 10-fold cross validation on Random Forest Regressor
scores = cross_val_score(model3, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
rmse_scores = np.sqrt(-scores)
display_scores(rmse_scores)

Scores:  [50128.63077714 49825.91191266 48340.97328755 49021.60276501
 51167.551463   49693.9308058  47497.07290867 50530.47646797
 52228.19774811 50487.86648286]
Mean  :  49892.22146187593
SD    :  1297.9254283659618


### Grid Search

In [21]:
param_grid = [{'n_estimators' : [5, 10, 50], 'max_features' : [2,4,6,8]},
              {'bootstrap' : [False], 'n_estimators' : [5,10,50], 'max_features' : [2,3,4]}]

In [22]:
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [23]:
best_estimator = grid_search.best_estimator_

In [24]:
joblib.dump(best_estimator, 'chap02_model_final.pkl')

['chap02_model_final.pkl']

In [25]:
cvs = grid_search.cv_results_
for mean_score, params in zip(cvs['mean_test_score'], cvs['params']):
    print('{:.1f}'.format(np.sqrt(-mean_score)), params)

58161.6 {'max_features': 2, 'n_estimators': 5}
55191.9 {'max_features': 2, 'n_estimators': 10}
51777.9 {'max_features': 2, 'n_estimators': 50}
55806.7 {'max_features': 4, 'n_estimators': 5}
52736.0 {'max_features': 4, 'n_estimators': 10}
49922.6 {'max_features': 4, 'n_estimators': 50}
55251.5 {'max_features': 6, 'n_estimators': 5}
51840.3 {'max_features': 6, 'n_estimators': 10}
49392.6 {'max_features': 6, 'n_estimators': 50}
55120.2 {'max_features': 8, 'n_estimators': 5}
51939.3 {'max_features': 8, 'n_estimators': 10}
49625.5 {'max_features': 8, 'n_estimators': 50}
57713.2 {'bootstrap': False, 'max_features': 2, 'n_estimators': 5}
54378.6 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
50982.9 {'bootstrap': False, 'max_features': 2, 'n_estimators': 50}
55133.0 {'bootstrap': False, 'max_features': 3, 'n_estimators': 5}
52623.1 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
49609.1 {'bootstrap': False, 'max_features': 3, 'n_estimators': 50}
54619.9 {'bootstrap': 

In [26]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([9.03638624e-02, 8.35511606e-02, 4.23530447e-02, 2.10085404e-02,
       1.92031769e-02, 2.12312174e-02, 1.89749711e-02, 2.99098505e-01,
       6.08305063e-02, 1.00398310e-01, 8.67623228e-02, 2.12547616e-02,
       1.18707000e-01, 2.78197716e-04, 5.69627010e-03, 1.02881540e-02])

In [27]:
attributes = list(df_features)
sorted(zip(feature_importances, attributes), reverse=True)

[(0.2990985046814199, 'median_income'),
 (0.11870699980032202, 'INLAND'),
 (0.1003983095965346, 'population_per_household'),
 (0.09036386236856708, 'longitude'),
 (0.08676232275742518, 'bedrooms_per_room'),
 (0.08355116063341483, 'latitude'),
 (0.060830506332311914, 'rooms per_household'),
 (0.042353044703058076, 'housing_median_age'),
 (0.02125476162060122, '<1H OCEAN'),
 (0.021231217368587112, 'population'),
 (0.021008540360326337, 'total_rooms'),
 (0.019203176853532282, 'total_bedrooms'),
 (0.018974971054573436, 'households'),
 (0.010288154049394642, 'NEAR OCEAN'),
 (0.005696270103778209, 'NEAR BAY'),
 (0.0002781977161531303, 'ISLAND')]

In [28]:
final_model = grid_search.best_estimator_
y_predict_test = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_predict_test)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

48300.38643303764


**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)