In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# scikit-learn only vendored joblib under sklearn.externals up to 0.22
# (deprecated in 0.21, removed in 0.23). Prefer the standalone package and
# fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

# custom functions 
from src.features.build_features import get_indices
from src.features.build_features import add_extra_features


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [3]:
# Read the raw housing table; the path is relative to the kernel's
# working directory.
file_name = 'data/raw/housing.csv'
housing_df = pd.read_csv(filepath_or_buffer=file_name)
# (rows, columns) of the raw table
housing_df.shape

(20640, 10)

## Split data with stratified sampling

In [4]:
# Create an income category used only to stratify the train/test split.
# Dividing by 1.5 and rounding up limits the number of income categories.
income_cat = np.ceil(housing_df["median_income"] / 1.5)
# Label every category of 5 or above as 5.0, then assign the column once.
# The previous chained `housing_df["income_cat"].where(..., inplace=True)`
# mutates a temporary selection under pandas Copy-on-Write and would leave
# the column uncapped; a single assignment is also safe to re-run.
housing_df["income_cat"] = income_cat.where(income_cat < 5, 5.0)

In [5]:
# Split the data with strata: an 80/20 split that preserves the
# distribution of income_cat in both parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(housing_df, housing_df["income_cat"]))
# split() returns positional row indices, not index labels, so select with
# .iloc (label-based .loc is only equivalent on a default RangeIndex).
# .copy() gives independent frames that can be modified safely later on.
strat_train_set = housing_df.iloc[train_index].copy()
strat_test_set = housing_df.iloc[test_index].copy()

In [6]:
# income_cat was only needed for stratification; remove it from both
# splits in place.
for split_frame in (strat_train_set, strat_test_set):
    split_frame.drop(columns=["income_cat"], inplace=True)

# Prepare data for ML 

In [7]:
# Separate the target from the predictors of the training set.
# Keep an independent copy of the target column as the labels ...
housing_labels = strat_train_set["median_house_value"].copy()
# ... and build the predictor frame; drop() returns a new frame, so
# strat_train_set itself is left untouched.
housing = strat_train_set.drop(columns="median_house_value")

In [8]:
# Show the docstring of the custom feature-engineering helper.
extra_features_doc = add_extra_features.__doc__
print(extra_features_doc)

Adds extra features to data sets.

    If boolean is true adds bedrooms per room column.
    Also adds rooms_per_household and population_per_household columns.

    Parameters
    ----------
    X : numpy.Array
        dataframe to modify
    column_name : string (optional)
        column containing hourly data


    Returns
    -------
    X : numpy.Array
        transformed array .
    


## Numerical transformation pipeline
- impute with median 
- add extra features
- scale data

In [9]:
# Steps applied, in order, to the numerical columns.
numeric_steps = [
    # fill missing values with each column's median
    ('imputer', SimpleImputer(strategy="median")),
    # append the engineered features; validate=False skips input validation
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    # standardize every resulting column
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

## Combine pipelines 


In [10]:
# Combine the numerical pipeline and the categorical encoder into a single
# column-wise transformer.
cat_attribs = ["ocean_proximity"]
# every remaining column is treated as numerical
num_attribs = housing.columns.tolist()
num_attribs.remove("ocean_proximity")

column_transformers = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

# Fit on the training predictors and transform them in one pass.
housing_prepared = full_pipeline.fit_transform(housing)

## Save the transformed data and pipeline parameters


In [11]:
# Persist the fitted pipeline, the transformed training matrix and the
# training labels for later cells.
interim_dir = 'data/interim/'
joblib.dump(full_pipeline, 'models/full_pipeline.pkl')
joblib.dump(housing_prepared, interim_dir + 'housing_prepared' + '.pkl')
joblib.dump(housing_labels, interim_dir + 'housing_labels' + '.pkl')

['data/interim/housing_labels.pkl']

# Train model 

In [12]:
# Reload the transformed training data written to data/interim/.
prepared_path = 'data/interim/'+'housing_prepared'+'.pkl'
labels_path = 'data/interim/'+'housing_labels'+'.pkl'
housing_prepared = joblib.load(prepared_path)
housing_labels = joblib.load(labels_path)

In [13]:
# Hyper-parameters of the random forest; the fixed random_state keeps the
# fit reproducible.
forest_params = dict(n_estimators=30, max_features=8, random_state=42)
forest_reg = RandomForestRegressor(**forest_params)
# fit() returns the estimator, so its repr is shown as the cell output
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [14]:
# Dimensions of the prepared training matrix as (rows, columns)
housing_prepared.shape

(16512, 16)

In [15]:
# Persist the trained forest as a compressed pickle.
model_path = "models/forest_model.pkl"
joblib.dump(forest_reg, model_path, compress=True)

['models/forest_model.pkl']

# Evaluate on test data

In [16]:
# Load the fitted preprocessing pipeline and the trained model.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
full_pipeline = joblib.load('models/full_pipeline.pkl')
forest_reg = joblib.load("models/forest_model.pkl")
# Load the held-out test set. This notebook never writes this file itself,
# so when it is missing fall back to the in-memory split created earlier
# instead of failing on a fresh Restart & Run All.
try:
    strat_test_set = joblib.load('data/processed/'+'strat_test_set'+'.pkl')
except FileNotFoundError:
    print("data/processed/strat_test_set.pkl not found; "
          "using the in-memory strat_test_set from the split above")

In [17]:
final_model = forest_reg

# Separate the test target from the test predictors.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# transform() only: the pipeline is applied as already fitted, not refitted.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Root mean squared error on the test set.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [18]:
# Display the test-set RMSE computed in the previous cell
final_rmse

47730.22690385927

In [19]:
# Coefficient of determination (R^2) on the test set; final_model is the
# same estimator object as forest_reg.
final_model.score(X_test_prepared, y_test)

0.8251813171106783

In [22]:
# Mean signed percentage error of the predictions relative to the true
# values. Over- and under-predictions cancel each other out, so this
# reflects average bias rather than typical error magnitude.
pct_error = (final_predictions - y_test) / y_test * 100
pct_error.mean()

7.398967736725318