In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error


# Load data

In [55]:
# Read the raw housing data; the path is relative to the notebook's
# working directory.
file_name = 'data/raw/housing.csv'
housing_df = pd.read_csv(filepath_or_buffer=file_name)

In [56]:
# Sanity check: dimensions (rows, columns) of the freshly loaded frame.
housing_df.shape

(20640, 10)

## Split data with stratified sampling

In [57]:
# Create an income category to stratify the train/test split on.
# Dividing by 1.5 before rounding up limits the number of categories.
income_cat = np.ceil(housing_df["median_income"] / 1.5)
# Merge every category at or above 5 into category 5.
# The result is assigned back to the frame instead of calling
# Series.where(..., inplace=True) on the selected column: that chained
# in-place form warns on recent pandas and is a no-op under Copy-on-Write.
# Assigning also makes the cell safe to re-run.
housing_df["income_cat"] = income_cat.where(income_cat < 5, 5.0)

In [58]:
# Stratified 80/20 train/test split on the income category, so both sets
# follow the income distribution of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly.
# split() yields positional indices, hence .iloc: label-based .loc only
# happened to work while housing_df carried a default RangeIndex.
train_index, test_index = next(split.split(housing_df, housing_df["income_cat"]))
# .copy() gives two independent frames, so modifying them later does not
# trigger SettingWithCopyWarning or touch housing_df.
strat_train_set = housing_df.iloc[train_index].copy()
strat_test_set = housing_df.iloc[test_index].copy()

In [59]:
# The income category was only needed for stratification; remove it from
# both sets. Re-assigning (rather than drop(..., inplace=True) inside a loop)
# together with errors="ignore" makes this cell idempotent: running it a
# second time no longer raises KeyError for the already-removed column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

# Prepare data for ML 

In [119]:
# Separate the training set into the target series and the predictor frame.
# DataFrame.drop returns a new frame, so strat_train_set itself is unchanged.
target_column = "median_house_value"
housing_labels = strat_train_set[target_column].copy()
housing = strat_train_set.drop(target_column, axis=1)

In [151]:
# Look the column positions up by name instead of hard-coding 3, 4, 5, 6.
# NOTE(review): the positions refer to `housing.columns`; they are used to
# index the array handed to add_extra_features, so this presumably relies on
# that array keeping the same column order — confirm.
_column_order = housing.columns.tolist()
rooms_ix, bedrooms_ix, population_ix, household_ix = (
    _column_order.index(name)
    for name in ("total_rooms", "total_bedrooms", "population", "households")
)

# Operates on a plain 2-D array (positional column indexing), not a DataFrame.
def add_extra_features(X, add_bedrooms_per_room=True):
    """Append ratio features as extra columns to the right of ``X``.

    The columns are located through the module-level indices ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Parameters
    ----------
    X : 2-D array, indexed as ``X[:, column]``.
    add_bedrooms_per_room : bool, default True
        When true, a third column (bedrooms / rooms) is appended as well.

    Returns
    -------
    ``X`` followed by rooms-per-household and population-per-household, and
    optionally bedrooms-per-room, in that order.
    """
    households = X[:, household_ix]
    rooms = X[:, rooms_ix]

    extra_columns = [rooms / households, X[:, population_ix] / households]
    if add_bedrooms_per_room:
        extra_columns.append(X[:, bedrooms_ix] / rooms)

    # np.c_ stacks X and each 1-D ratio as additional columns.
    return np.c_[(X, *extra_columns)]

## Transformation pipelines

In [155]:
# Numerical preprocessing, applied in order:
#   1. fill missing values with the column median,
#   2. append the ratio features computed by add_extra_features,
#   3. standardise every column.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)


In [167]:
# Combine the numerical pipeline and the categorical encoder into a single
# preprocessing step.
# The numeric attribute list is derived from `housing` itself (every column
# that is not categorical), so this cell no longer depends on a `housing_num`
# frame that is never defined in this notebook and would raise NameError on a
# fresh kernel.
cat_attribs = ["ocean_proximity"]
num_attribs = [col for col in housing.columns if col not in cat_attribs]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

# Fit the transformers on the training predictors and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

In [36]:
# Persist the fitted preprocessing pipeline together with the prepared
# training matrix and its labels.
# NOTE(review): the pipeline references add_extra_features, so that function
# presumably has to be defined wherever the pickle is loaded — confirm.
interim_dir = 'data/interim/'
joblib.dump(full_pipeline, 'models/full_pipeline.pkl')
joblib.dump(housing_prepared, interim_dir + 'housing_prepared' + '.pkl')
joblib.dump(housing_labels, interim_dir + 'housing_labels' + '.pkl')

['data/interim/housing_labels.pkl']

# Train model 

In [37]:
# Reload the prepared training matrix and labels from disk, replacing the
# in-memory copies.
interim_dir = 'data/interim/'
housing_prepared = joblib.load(interim_dir + 'housing_prepared' + '.pkl')
housing_labels = joblib.load(interim_dir + 'housing_labels' + '.pkl')

In [38]:
# Random forest with fixed hyper-parameters; random_state pins the result.
forest_params = dict(n_estimators=30, max_features=8, random_state=42)
forest_reg = RandomForestRegressor(**forest_params)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [39]:
# Sanity check: dimensions of the prepared training data the model was fit on.
housing_prepared.shape

(16512, 16)

In [85]:
# Store the trained forest on disk (compressed).
model_path = "models/forest_model.pkl"
joblib.dump(forest_reg, model_path, compress=True)

['models/forest_model.pkl']

In [41]:
import pickle

# Also export the model as a plain pickle file.
# The context manager flushes and closes the file handle; the previous bare
# open(...) passed straight to pickle.dump was never closed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(forest_reg, model_file)

# Evaluate on test data

In [42]:
# Restore the trained forest and the fitted preprocessing pipeline from disk.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
forest_reg = joblib.load("models/forest_model.pkl")
full_pipeline = joblib.load('models/full_pipeline.pkl')

In [43]:
final_model = forest_reg

# Separate the held-out set into target and predictors.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop("median_house_value", axis=1)

# Only transform here: the pipeline must not be re-fit on the test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Root mean squared error of the test predictions.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [44]:
# Display the test-set RMSE (same units as median_house_value).
final_rmse

47730.22690385927

In [47]:
# For scikit-learn regressors, score() is the coefficient of determination
# (R^2) on the given data.
forest_reg.score(X_test_prepared, y_test)

0.8251813171106783

In [None]:
# Compute the mean absolute percentage error of the test predictions

In [50]:
# Mean absolute percentage error: |prediction - actual| / actual, in percent,
# averaged over the test set.
absolute_pct_error = np.abs(final_predictions - y_test) / y_test * 100
absolute_pct_error.mean()

18.145818263808412

In [53]:
# Debug check: X_test (the pipeline input) is a pandas DataFrame.
type(X_test)

pandas.core.frame.DataFrame