In [1]:
import numpy as np
import pandas as pd

## Reading Data

In [2]:
# Path of the CSV extract, relative to the notebook's working directory.
data_file_path = '../data/MELBOURNE_HOUSE_PRICES_LESS.csv'

# Load the file, then narrow it in two steps:
#   1. keep only the sale methods "Property Sold" (S) and
#      "Property Sold Prior" (SP);
#   2. drop rows without a recorded price.
mhp_data = (
    pd.read_csv(data_file_path)
    .loc[lambda frame: frame.Method.isin(['S', 'SP'])]
    .loc[lambda frame: frame.Price.notnull()]
)
mhp_data.describe()

Unnamed: 0,Rooms,Price,Postcode,Propertycount,Distance
count,37104.0,37104.0,37104.0,37104.0,37104.0
mean,3.046572,962698.3,3124.967227,7590.386778,13.168995
std,0.920345,551146.4,132.916619,4499.211539,7.671679
min,1.0,112000.0,3000.0,83.0,0.0
25%,2.0,610500.0,3048.0,4242.0,7.2
50%,3.0,810000.0,3095.0,6567.0,12.1
75%,4.0,1180000.0,3163.0,10412.0,17.3
max,31.0,7650000.0,3980.0,21650.0,55.8


In [3]:
# Preview the first rows of the filtered frame.
mhp_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same from run to run.
mhp_train_set, mhp_val_set = train_test_split(
    mhp_data,
    test_size=0.2,
    random_state=111,
)

## Transforming Data

In [5]:
# Columns handled as numeric features.
mph_num = [
    'Rooms',
    'Distance',
    'Propertycount',
]

# Columns handled as categorical features.
mph_cat = [
    'Type',
    'CouncilArea',
]

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class ToFloatConverter(BaseEstimator, TransformerMixin):
    """Stateless transformer that casts its input to ``float``."""

    def fit(self, X, y=None):
        """Learn nothing from the data and return this transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X.astype(float)``; ``y`` is accepted but not used."""
        as_float = X.astype(float)
        return as_float

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric columns: cast to float, then standardise with StandardScaler.
num_pipeline = Pipeline([
    ('to_float', ToFloatConverter()),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: dense one-hot encoding; categories not seen while
# fitting are encoded as all zeros (handle_unknown='ignore').
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back to the legacy one so this cell runs on both old and
# new releases.
try:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipeline = Pipeline([
    ('cat_encoder', cat_encoder)
])

# Route each column group through its own sub-pipeline.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, mph_num),
    ('cat', cat_pipeline, mph_cat),
])

# Fit the preprocessing on the training split only.
train_X = full_pipeline.fit_transform(mhp_train_set)
train_y = mhp_train_set.Price
train_X.shape

(29683, 39)

In [8]:
# Apply the already-fitted preprocessing to the validation split
# (transform only, no refitting).
val_y = mhp_val_set.Price
val_X = full_pipeline.transform(mhp_val_set)
val_X.shape

(7421, 39)

## Learning with Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor
# Ten-tree random forest with a fixed seed so results are repeatable.
forest_reg = RandomForestRegressor(
    n_estimators=10,
    random_state=111,
)
forest_reg.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=111, verbose=0, warm_start=False)

In [10]:
from sklearn.metrics import mean_squared_error

def get_rmse(model, X, y):
    """Return the root-mean-squared error of ``model``'s predictions.

    Parameters:
        model: object exposing ``predict(X)``.
        X: feature data passed to ``model.predict``.
        y: true target values to compare the predictions against.

    Returns:
        Square root of ``mean_squared_error(y, model.predict(X))``.
    """
    return np.sqrt(mean_squared_error(y, model.predict(X)))


In [11]:
# Report the error on both splits, training first.
for label, features, target in (
    ('train RMSE', train_X, train_y),
    ('val RMSE', val_X, val_y),
):
    print(label, get_rmse(forest_reg, features, target))

train RMSE 258141.35677966513
val RMSE 271380.96816115326
