In [1]:
import numpy as np
import pandas as pd

# Reading and Preparing Data

In [2]:
data_file_path = '../data/MELBOURNE_HOUSE_PRICES_LESS.csv'

# Load the listings and, in a single chain, keep only the rows whose sale
# method is "S" (property sold) or "SP" (property sold prior) and whose
# price is present.
mhp_data = (
    pd.read_csv(data_file_path)
    .loc[lambda frame: frame['Method'].isin(['S', 'SP'])]
    .loc[lambda frame: frame['Price'].notnull()]
)

mhp_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
mhp_train_set, mhp_val_set = train_test_split(
    mhp_data,
    test_size=0.2,
    random_state=111,
)


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class ToFloatConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts its input to ``float``."""

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` converted with ``X.astype(float)``; ``y`` is ignored."""
        as_float = X.astype(float)
        return as_float

In [5]:
from datetime import datetime

class DaysAttributeAdder(BaseEstimator, TransformerMixin):
    """Add a ``Days`` column holding the whole days from ``base_date`` to ``Date``.

    The ``Date`` column of the input frame is parsed with the ``%d/%m/%Y``
    format.

    Parameters
    ----------
    base_date : datetime, optional
        Reference date the day counts are measured from. ``None`` (the
        default) means 1 January 2000.
    """

    def __init__(self, base_date = None):
        # Store the argument untouched. The default is resolved in
        # ``transform`` so that ``set_params(base_date=None)``, which bypasses
        # ``__init__``, behaves the same as constructing with the default.
        self.base_date = base_date

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the extra integer ``Days`` column."""
        if self.base_date is None:
            base_date = datetime(2000, 1, 1)
        else:
            base_date = self.base_date

        # Parse the whole column at once instead of calling strptime per row.
        sale_dates = pd.to_datetime(X.Date, format='%d/%m/%Y')
        return X.assign(Days=(sale_dates - base_date).dt.days)


In [6]:
# Column groups fed to the numeric and categorical sub-pipelines.
# 'Days' is not a CSV column; it is the one added by DaysAttributeAdder.
mph_num = [
    'Rooms',
    'Distance',
    'Propertycount',
    'Days',
]
mph_cat = [
    'Type',
    'CouncilArea',
]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric columns: cast to float, then standardise.
num_pipeline = Pipeline([
    ('to_float', ToFloatConverter()),
    ('std_scaler', StandardScaler()),
])

# Ask for dense one-hot output. Current scikit-learn spells the keyword
# `sparse_output`; releases that predate it only accept `sparse` and reject
# the new keyword with a TypeError, so fall back to the old spelling there.
try:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: one-hot encode, ignoring categories unseen during fit.
cat_pipeline = Pipeline([
    ('cat_encoder', cat_encoder),
])

# Route each column group to its own sub-pipeline.
column_transformer = ColumnTransformer([
    ('num', num_pipeline, mph_num),
    ('cat', cat_pipeline, mph_cat),
])

# 'Days' must be added first because the numeric column group selects it.
full_pipeline = Pipeline([
    ('add_days', DaysAttributeAdder()),
    ('col_tr', column_transformer),
])

# Fit the preprocessing on the training split only.
train_X = full_pipeline.fit_transform(mhp_train_set)
train_y = mhp_train_set.Price

In [8]:
# Only transform here: the pipeline was already fitted on the training split,
# so the validation rows do not influence the scaler or the encoder.
val_y = mhp_val_set['Price']
val_X = full_pipeline.transform(mhp_val_set)

# Learning

In [9]:
from sklearn.metrics import mean_squared_error

def get_rmse(model, X, y):
    """Return the root mean squared error of ``model``'s predictions on ``X``.

    ``model`` must expose ``predict``; ``y`` holds the true target values.
    """
    return np.sqrt(mean_squared_error(y, model.predict(X)))

In [10]:
def print_rmse(name, model, train_X, train_y, val_X, val_y):
    """Print ``model``'s RMSE on the training and validation data.

    Each line is prefixed with ``name``; the training line is printed first.
    """
    splits = (
        ('train RMSE', train_X, train_y),
        ('validation RMSE', val_X, val_y),
    )
    for label, features, targets in splits:
        print(name, label, get_rmse(model, features, targets))

In [11]:
def print_rmse_grid(grid):
    """Print one line per searched candidate: its RMSE, then its parameters.

    ``grid.cv_results_["mean_test_score"]`` is read as a negated mean squared
    error, so each score is negated before taking the square root.
    """
    results = grid.cv_results_
    candidates = results["params"]
    neg_mse_scores = results["mean_test_score"]
    for neg_mse, candidate in zip(neg_mse_scores, candidates):
        rmse = np.sqrt(-neg_mse)
        print(rmse, candidate)

## Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor
# Baseline model: a 10-tree random forest with a fixed seed.
forest_reg = RandomForestRegressor(
    n_estimators=10,
    random_state=111,
)
forest_reg.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=111, verbose=0, warm_start=False)

In [13]:
# Report the baseline forest's error on both splits.
print_rmse(
    'Random Forest', forest_reg,
    train_X, train_y,
    val_X, val_y,
)

Random Forest train RMSE 155278.4528430587
Random Forest validation RMSE 297831.5987795503


### Grid Search with Random Forest

In [14]:
from sklearn.model_selection import GridSearchCV

# Two families of candidates: bootstrapped forests over a wider range of
# sizes, and non-bootstrapped forests over a smaller range.
param_grid = [
    {'n_estimators': [3, 100, 300], 'max_features': [2, 4, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Search over the fresh estimator created for this purpose. Previously the
# already-fitted baseline `forest_reg` was passed instead, leaving
# `forest_reg_grid` unused.
forest_reg_grid = RandomForestRegressor(random_state=111)
grid_search = GridSearchCV(forest_reg_grid, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True, n_jobs=-1)
grid_search.fit(train_X, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=111, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'n_estimators': [3, 100, 300], 'max_features': [2, 4, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [15]:
# Report the error of the best estimator found by the grid search.
print_rmse(
    'Grid Search with Random Forest', grid_search.best_estimator_,
    train_X, train_y,
    val_X, val_y,
)

Grid Search with Random Forest train RMSE 141478.96699113166
Grid Search with Random Forest validation RMSE 288477.9068006992


In [16]:
# List the cross-validated RMSE of every candidate tried by the search.
print_rmse_grid(grid=grid_search)

338376.7554100356 {'max_features': 2, 'n_estimators': 3}
308339.12847567303 {'max_features': 2, 'n_estimators': 100}
307978.3126519749 {'max_features': 2, 'n_estimators': 300}
336460.24171068537 {'max_features': 4, 'n_estimators': 3}
309109.5178949636 {'max_features': 4, 'n_estimators': 100}
308701.9054881939 {'max_features': 4, 'n_estimators': 300}
338355.3083373792 {'max_features': 8, 'n_estimators': 3}
309501.54407662066 {'max_features': 8, 'n_estimators': 100}
309453.915886689 {'max_features': 8, 'n_estimators': 300}
358168.7248268884 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
347526.3566014455 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
357659.0742603974 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
348330.25164857786 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
359280.96435237786 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
350250.4515118374 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}


In [17]:
# Display the parameter combination selected by the grid search.
grid_search.best_params_

{'max_features': 2, 'n_estimators': 300}