In [2]:
import os
if os.getcwd().endswith("notebooks"):
    os.chdir('..')

print("Current working directory: ", os.getcwd())
if not os.getcwd().endswith("California-Housing-ML"):
    raise ValueError("Please change working directory to 'path/toCalifornia-Housing-ML' before proceeding")
!pip install -r requirements.txt

Current working directory:  /Users/irellzane/MLprojects/California-Housing-ML


In [67]:
import pandas as pd
import numpy as np
from modules.preprocessing import preprocessing

In [4]:
housing = pd.read_csv("data/housing_train.csv", index_col=0)
housing_labels = housing["median_house_value"]
housing.drop("median_house_value", axis=1, inplace=True)

## Train and Evaluate

#### With Linear Regression

In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(housing, housing_labels)

housing_predictions = lin_reg.predict(housing)
housing_predictions[:5].round(-2)

array([246000., 372700., 135700.,  91400., 330900.])

In [77]:
housing_labels.iloc[:5].values

array([458300., 483800., 101700.,  96100., 361800.])

In [78]:
from sklearn.metrics import root_mean_squared_error

lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
lin_rmse

68972.88910758459

#### With Decision Tree Regressor

In [79]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(preprocessing,
                         DecisionTreeRegressor(random_state=42))
tree_reg.fit(housing, housing_labels)

housing_predictions = tree_reg.predict(housing)
housing_predictions[:5].round(2)

array([458300., 483800., 101700.,  96100., 361800.])

In [80]:
housing_labels[:5].values

array([458300., 483800., 101700.,  96100., 361800.])

In [81]:
tree_rmse = root_mean_squared_error(housing_labels, housing_predictions)
tree_rmse # Model Overfits!

0.0

## Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score

In [82]:
tree_rmse = -cross_val_score(tree_reg, housing, housing_labels, 
                            scoring="neg_root_mean_squared_error", cv=10)

pd.Series(tree_rmse).describe()

count       10.000000
mean     67013.360949
std       1460.198570
min      64289.376198
25%      66776.146282
50%      67086.216281
75%      68140.275029
max      68659.294290
dtype: float64

#### Attempt with Random Forest Regressor

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [86]:
forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, housing, housing_labels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()

count       10.000000
mean     47124.604437
std       1069.311372
min      45292.329302
25%      46712.106520
50%      47172.209883
75%      47561.377695
max      49354.705514
dtype: float64

## Fine Tune Hyperparams

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42))
])

#### Grid Search

In [90]:
param_grid = [
    {
        'preprocessing__geo__n_clusters': [5, 8, 10],
        'random_forest__max_features': [4, 6, 8]
    },
    {
        'preprocessing__geo__n_clusters': [10, 15],
        'random_forest__max_features': [6, 8, 10]
    }
]
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')

grid_search.fit(housing, housing_labels)

In [92]:
grid_search.best_params_

{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}

In [14]:
def clean_param_search_res(cv_res):
    score_cols = ['split0', 'split1', 'split2', 'mean_test_rmse']
    clean_cv_res = cv_res[['param_preprocessing__geo__n_clusters',
                    'param_random_forest__max_features', 'split0_test_score',
                    'split1_test_score', 'split2_test_score', 'mean_test_score']].copy()
    clean_cv_res.columns = ['n_clusters', 'max_features'] + score_cols
    clean_cv_res.sort_values('mean_test_rmse', inplace=True, ascending=False)
    clean_cv_res[score_cols] = -clean_cv_res[score_cols].round(0).astype(np.int32)
    return clean_cv_res

In [13]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)

clean_param_search_res(cv_res).head()

NameError: name 'grid_search' is not defined

#### Randomized Search

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'preprocessing__geo__n_clusters': randint(low=3, high=50),
    'random_forest__max_features': randint(low=2, high=20)
}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,
    scoring='neg_root_mean_squared_error', random_state=42
)

rnd_search.fit(housing, housing_labels)

In [16]:
rnd_search.best_params_

{'preprocessing__geo__n_clusters': 45, 'random_forest__max_features': 9}

In [17]:
cv_res = pd.DataFrame(rnd_search.cv_results_)
clean_param_search_res(cv_res).head()

Unnamed: 0,n_clusters,max_features,split0,split1,split2,mean_test_rmse
1,45,9,41204,42108,42998,42103
8,32,7,41991,42557,43431,42660
5,42,4,42031,42968,43617,42872
0,41,16,42580,42869,43604,43018
2,23,8,42421,42841,44232,43165


## Analyze feature importances

In [18]:
final_model = rnd_search.best_estimator_

feature_importances = final_model["random_forest"].feature_importances_
feature_importances.round(2)
sorted(zip(feature_importances, final_model["preprocessing"].get_feature_names_out()), reverse=True)

[(0.18836603202647126, 'log__median_income'),
 (0.07795960969938898, 'cat__ocean_proximity_INLAND'),
 (0.06110388595864347, 'bedrooms__ratio'),
 (0.05772194900488602, 'rooms_per_house__ratio'),
 (0.04569274355282605, 'people_per_house__ratio'),
 (0.041977095119231075, 'geo__Cluster 30 similarity'),
 (0.024893290428216704, 'geo__Cluster 9 similarity'),
 (0.02349145973584661, 'geo__Cluster 36 similarity'),
 (0.021384735075780065, 'geo__Cluster 18 similarity'),
 (0.019231937253583756, 'geo__Cluster 3 similarity'),
 (0.019114201150802552, 'geo__Cluster 32 similarity'),
 (0.01740110192655986, 'geo__Cluster 25 similarity'),
 (0.01628160145237179, 'geo__Cluster 1 similarity'),
 (0.015974756525084677, 'geo__Cluster 26 similarity'),
 (0.014014150061434243, 'geo__Cluster 7 similarity'),
 (0.013588284206905332, 'geo__Cluster 10 similarity'),
 (0.01344850861123562, 'geo__Cluster 12 similarity'),
 (0.013432151644106247, 'geo__Cluster 34 similarity'),
 (0.012971875049373518, 'geo__Cluster 19 similar

## Save model

In [56]:
import joblib

joblib.dump(final_model, "models/my_california_housing_model.pkl.gz", compress='gzip')

['models/my_california_housing_model.pkl.gz']

In [57]:
housing_labels.iloc[:5].values

array([[458300.],
       [483800.],
       [101700.],
       [ 96100.],
       [361800.]])

In [58]:
from sklearn.metrics import root_mean_squared_error

lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
lin_rmse

68972.88910758459

#### With Decision Tree Regressor

In [59]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing, housing_labels)

housing_predictions = tree_reg.predict(housing)
housing_predictions[:5].round(2)

array([458300., 483800., 101700.,  96100., 361800.])

In [60]:
housing_labels[:5].values

array([[458300.],
       [483800.],
       [101700.],
       [ 96100.],
       [361800.]])

In [61]:
tree_rmse = root_mean_squared_error(housing_labels, housing_predictions)
tree_rmse # Model Overfits!

0.0

## Cross Validation

In [62]:
from sklearn.model_selection import cross_val_score

tree_rmse = -cross_val_score(tree_reg, housing, housing_labels, 
                            scoring="neg_root_mean_squared_error", cv=10)

pd.Series(tree_rmse).describe()

count       10.000000
mean     66149.463525
std       2505.997238
min      62602.424703
25%      63914.082682
50%      66248.434408
75%      68107.671276
max      69853.960730
dtype: float64

#### Attempt with Random Forest Regressor

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

forest_reg = RandomForestRegressor(random_state=42)
forest_rmses = -cross_val_score(forest_reg, housing, housing_labels["median_house_value"], scoring="neg_root_mean_squared_error", cv=10)

In [64]:
pd.Series(forest_rmses).describe()

count       10.000000
mean     47174.587364
std        996.823918
min      45617.905147
25%      46559.165989
50%      47225.189327
75%      47495.894845
max      49392.754151
dtype: float64

## Fine Tune Hyperparams