In [12]:
from datetime import datetime
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from joblib import dump as jdump
from pandas.plotting import scatter_matrix
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline

In [13]:
# Read ./data/housing.csv into a DataFrame; later cells derive everything from it.
housing_raw = pd.read_csv(filepath_or_buffer='./data/housing.csv')

In [14]:
# Bin median_income into five brackets labelled 1-5 and store the result as a
# new categorical column on the raw frame (the last bracket is open-ended).
housing_raw['income_cat'] = pd.cut(
    housing_raw['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=list(range(1, 6)),
)

In [15]:
# Split into stratified train and test sets (80/20, stratified on income_cat).
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of indices.
train_index, test_index = next(split.split(housing_raw, housing_raw["income_cat"]))

# The splitter returns row positions, not index labels, so select with .iloc.
# income_cat exists only to drive the stratification: drop it from both sets so
# it does not silently become a model feature that raw input data lacks.
housing_train = housing_raw.iloc[train_index].drop(columns="income_cat")
housing_test = housing_raw.iloc[test_index].drop(columns="income_cat")

In [16]:
# Separate the training set into the target (y) and the remaining feature columns (X).
housing_labels = housing_train["median_house_value"].copy()
housing = housing_train.drop(columns="median_house_value")

In [17]:
# Feature frame without the text column ocean_proximity; its columns are the
# ones routed through the numerical pipeline later on.
housing_num = housing.drop(columns='ocean_proximity')

In [18]:
# Column positions used by add_extra_features. They are looked up in housing_num
# (not housing) because num_pipeline hands add_extra_features an array built from
# exactly those numeric columns; ocean_proximity is routed to the one-hot encoder
# instead, so positions taken from the full frame are only right by coincidence
# of column order.
rooms_idx, bedrooms_idx, population_idx, household_idx = (
    housing_num.columns.get_loc(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")
)

In [19]:
def add_extra_features(X, add_bedrooms_per_room=True):
    """Append per-household / per-room ratio columns to a 2-D array.

    Column positions come from the module-level ``rooms_idx``,
    ``bedrooms_idx``, ``population_idx`` and ``household_idx``.

    Parameters
    ----------
    X : 2-D array indexed as ``X[:, i]``
        Input features; returned unchanged as the leading columns.
    add_bedrooms_per_room : bool, default True
        When true, a third ratio (bedrooms / rooms) is appended as well.

    Returns
    -------
    numpy.ndarray
        ``X`` followed by rooms-per-household, population-per-household and,
        optionally, bedrooms-per-room, in that order.
    """
    rooms = X[:, rooms_idx]
    households = X[:, household_idx]

    # Ratios are collected in output order, then glued on as extra columns.
    derived = [rooms / households, X[:, population_idx] / households]
    if add_bedrooms_per_room:
        derived.append(X[:, bedrooms_idx] / rooms)

    return np.c_[(X, *derived)]

# Standalone transformer wrapping add_extra_features with the bedrooms-per-room
# column switched off, applied once to the training features as a plain array.
attr_adder = FunctionTransformer(
    func=add_extra_features,
    validate=False,
    kw_args=dict(add_bedrooms_per_room=False),
)
housing_extra_attribs = attr_adder.fit_transform(housing.values)

In [20]:
# Numerical preprocessing, applied in order:
#   1. fill missing values with each column's median,
#   2. append the ratio columns produced by add_extra_features,
#   3. standardize every resulting column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler()),
])

In [27]:
# Route each column group to its own preprocessing branch: numeric columns go
# through num_pipeline, ocean_proximity is one-hot encoded.
categorical_cols = ["ocean_proximity"]
numerical_cols = housing_num.columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, numerical_cols),
    ("cat", OneHotEncoder(), categorical_cols),
])

# Fit the combined preprocessing on the training features and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Search space: both hyperparameters are sampled as random integers
# (scipy's randint excludes the upper bound).
param_distribs = dict(
    n_estimators=randint(1, 200),
    max_features=randint(1, 8),
)

# Randomized search over a random forest: 2 sampled candidates, 2-fold CV,
# scored by negative MSE; the forest and the sampler are both seeded with 42.
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=2,
    cv=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

In [None]:
# Evaluate the best estimator found by the randomized search on the held-out test set.
final_model = rnd_search.best_estimator_

# Split the test set into target and features the same way as the training set.
y_test = housing_test["median_house_value"].copy()
X_test = housing_test.drop(columns="median_house_value")

# Reuse the preprocessing fitted on the training data: transform only, so no
# statistics are re-estimated from the test set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# mean_squared_error is provided by sklearn.metrics (imported in the top import
# cell); the square root turns the MSE into an RMSE in the target's own units.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

In [None]:
# Persist the fitted model under a timestamped name (month-day-hour-minute).
# NOTE: only the estimator is saved; the fitted full_pipeline is not, so inputs
# must be prepared by that same fitted transformer before calling predict.
now = datetime.now()
time_info = now.strftime("%m-%d-%H-%M")
output_string = f'./output/ca_housing_rfr_model_{time_info}.joblib'

# joblib does not create missing directories; make sure ./output exists first.
Path(output_string).parent.mkdir(parents=True, exist_ok=True)
jdump(final_model, output_string)