In [52]:
import os
if os.getcwd().endswith("notebooks"):
    os.chdir('..')

print("Current working directory: ", os.getcwd())
if not os.getcwd().endswith("California-Housing-ML"):
    raise ValueError("Please change working directory to 'path/toCalifornia-Housing-ML' before proceeding")
!pip install -r requirements.txt

Current working directory:  /Users/irellzane/MLprojects/California-Housing-ML


In [67]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.compose import ColumnTransformer, make_column_selector


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer producing RBF similarities between samples and k-means centroids.

    ``fit`` clusters the input with ``KMeans``; ``transform`` calls
    ``rbf_kernel`` between the input and the learned cluster centers.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters passed to ``KMeans``.
    gamma : float, default=1
        ``gamma`` passed to ``rbf_kernel``.
    random_state : int or None, default=None
        ``random_state`` passed to ``KMeans``.

    Attributes
    ----------
    kmeans : KMeans
        The clustering model fitted by ``fit``.
    cluster_centers_ : ndarray
        ``cluster_centers_`` of the fitted ``KMeans`` model, set by ``fit``.
    """

    def __init__(self, n_clusters=10, gamma=1, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit ``KMeans`` on ``X`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and not used;
        ``sample_weight`` is forwarded to ``KMeans.fit``.
        """
        self.kmeans = KMeans(n_clusters=self.n_clusters,
                             random_state=self.random_state)
        self.kmeans.fit(X, sample_weight=sample_weight)
        # Publish the learned centroids here rather than in transform(): the
        # fitted state is then complete right after fit(), and transform()
        # no longer mutates the estimator.
        self.cluster_centers_ = self.kmeans.cluster_centers_
        return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centers."""
        return rbf_kernel(X, self.kmeans.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is not used."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

def column_ratio(X):
    """Divide the first column of ``X`` by its second column.

    Both columns are selected with a list index, so each keeps its second
    axis and the quotient is a single-column 2-D result.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature name list ``["ratio"]``.

    Both arguments are ignored; the signature matches what
    ``FunctionTransformer(feature_names_out=...)`` is given in this file.
    """
    output_names = ["ratio"]
    return output_names

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standard scaling.

    A fresh pipeline object is created on every call.
    """
    impute_median = SimpleImputer(strategy="median")
    divide_columns = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    standardize = StandardScaler()
    return make_pipeline(impute_median, divide_columns, standardize)

# Categorical columns: fill missing values with the most frequent one, then one-hot encode.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Median imputation, then np.log, then standard scaling.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Similarity of each (latitude, longitude) pair to 10 k-means cluster centers.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1, random_state=42)

# Applied to every column not named explicitly in the ColumnTransformer below.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

preprocessing = ColumnTransformer(
    transformers=[
        # Three ratio features, each built by its own ratio_pipeline() instance.
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)

In [55]:
# Load the training features and labels from CSV; the first CSV column becomes the index.
housing = pd.read_csv(filepath_or_buffer="data/housing_train_prepped.csv", index_col=0)
housing_labels = pd.read_csv(filepath_or_buffer="data/housing_train_labels.csv", index_col=0)
housing.head(5)

Unnamed: 0,bedrooms__ratio,rooms_per_house__ratio,people_per_house__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,geo__Cluster 0 similarity,geo__Cluster 1 similarity,...,geo__Cluster 6 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,geo__Cluster 9 similarity,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN,remainder__housing_median_age
13096,1.846624,-0.866027,-0.330204,1.324114,0.637892,0.456906,1.310369,-1.071522,0.4581829,1.241847e-14,...,0.0008489216,0.9770322,2.382191e-08,3.819126e-18,0.0,0.0,0.0,1.0,0.0,1.861119
14973,-0.508121,0.02455,-0.253616,-0.252671,-0.063576,-0.711654,-0.14203,1.194712,6.511495e-10,0.9579596,...,5.614049e-27,1.260964e-13,0.1103491,0.354761,1.0,0.0,0.0,0.0,0.0,0.90763
3785,-0.202155,-0.041193,-0.051041,-0.925266,-0.859927,-0.941997,-0.91303,-0.756981,0.3432506,4.261141e-15,...,0.005641131,0.7303265,2.508224e-08,2.669659e-18,0.0,1.0,0.0,0.0,0.0,0.351428
14689,-0.149006,-0.034858,-0.141475,0.952773,0.943475,0.6707,0.925373,-0.912253,2.244844e-15,0.2704823,...,5.913326e-35,5.2012629999999996e-20,0.001712982,0.8874598,0.0,1.0,0.0,0.0,0.0,-0.919891
20507,0.963208,-0.666554,-0.306148,1.437622,1.00359,0.719093,1.481464,0.034537,1.090228e-11,0.9422206,...,5.421817e-30,1.04803e-15,0.02568824,0.5279506,0.0,0.0,0.0,0.0,1.0,0.5898


## Train and Evaluate

#### With Linear Regression

In [56]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on the training features and labels.
lin_reg = LinearRegression().fit(housing, housing_labels)

# Predict on the same rows the model was fitted on, then show the first five
# predictions rounded to the nearest hundred.
housing_predictions = lin_reg.predict(X=housing)
housing_predictions[:5].round(decimals=-2)

array([[246000.],
       [372700.],
       [135700.],
       [ 91400.],
       [330900.]])

In [57]:
# First five true labels, for comparison with the predictions shown above.
housing_labels.iloc[:5].to_numpy()

array([[458300.],
       [483800.],
       [101700.],
       [ 96100.],
       [361800.]])

In [58]:
from sklearn.metrics import root_mean_squared_error

# RMSE of the linear model, measured on the data it was fitted on.
lin_rmse = root_mean_squared_error(
    y_true=housing_labels,
    y_pred=housing_predictions,
)
lin_rmse

68972.88910758459

#### With Decision Tree Regressor

In [59]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with a fixed seed, fitted on the same training data.
tree_reg = DecisionTreeRegressor(random_state=42).fit(housing, housing_labels)

# NOTE(review): this rebinds housing_predictions, which held the linear
# model's predictions in an earlier cell.
housing_predictions = tree_reg.predict(X=housing)
housing_predictions[:5].round(decimals=2)

array([458300., 483800., 101700.,  96100., 361800.])

In [60]:
# First five true labels, selected by explicit position (.iloc) rather than
# by implicit row slicing on the DataFrame.
housing_labels.iloc[:5].to_numpy()

array([[458300.],
       [483800.],
       [101700.],
       [ 96100.],
       [361800.]])

In [61]:
# RMSE of the tree on the data it was fitted on.
tree_rmse = root_mean_squared_error(
    y_true=housing_labels,
    y_pred=housing_predictions,
)
tree_rmse  # Model overfits!

0.0

## Cross Validation

In [62]:
from sklearn.model_selection import cross_val_score

# The scorer returns negated RMSE values, so the sign is flipped to get RMSEs.
# NOTE(review): this rebinds tree_rmse (a single float in the previous cell)
# to an array holding one score per fold.
tree_rmse = -cross_val_score(
    estimator=tree_reg,
    X=housing,
    y=housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)

pd.Series(data=tree_rmse).describe()

count       10.000000
mean     66149.463525
std       2505.997238
min      62602.424703
25%      63914.082682
50%      66248.434408
75%      68107.671276
max      69853.960730
dtype: float64

#### Attempt with Random Forest Regressor

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# 10-fold cross-validated RMSE for a random forest with a fixed seed.
# The label column is passed as a Series here.
forest_reg = RandomForestRegressor(random_state=42)
forest_rmses = -cross_val_score(
    estimator=forest_reg,
    X=housing,
    y=housing_labels["median_house_value"],
    scoring="neg_root_mean_squared_error",
    cv=10,
)

In [64]:
# Summary statistics of the per-fold random-forest RMSEs.
pd.Series(data=forest_rmses).describe()

count       10.000000
mean     47174.587364
std        996.823918
min      45617.905147
25%      46559.165989
50%      47225.189327
75%      47495.894845
max      49392.754151
dtype: float64

## Fine Tune Hyperparams