## Melbourne House Prices

### Initialize

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Read the house listings into a DataFrame (the path is relative to the working directory)
dfMlb = pd.read_csv('house_prices.csv')

In [3]:
# Preview the first rows to see which columns are available
dfMlb.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,03/12/16,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,04/02/16,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,04/03/17,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,04/03/17,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,04/06/16,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [4]:
# Target: the sale price, kept as a single-column DataFrame
y = dfMlb[['Price']]
y.head()

Unnamed: 0,Price
0,1480000
1,1035000
2,1465000
3,850000
4,1600000


In [5]:
# Features: every column other than the target
X = dfMlb.drop(columns=['Price'])
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,03/12/16,2.5,3067,2,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,04/02/16,2.5,3067,2,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,SP,Biggin,04/03/17,2.5,3067,3,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,PI,Biggin,04/03/17,2.5,3067,3,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,VB,Nelson,04/06/16,2.5,3067,3,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [6]:
# Split into train and test: 20% of the rows are held out, and the fixed
# random_state makes the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Build Random Forest Model

In [54]:
# Random Forest model function

from sklearn.ensemble import RandomForestRegressor

def get_random_forest_mae(X_trn, X_tst, y_trn, y_tst):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    X_trn, X_tst
        Feature tables used for fitting and for prediction. They must be
        numeric and free of missing values; the estimator raises ValueError
        otherwise.
    y_trn, y_tst
        Targets for the two splits. A 1-D target or a single-column table
        are both accepted.

    Returns
    -------
    The mean absolute error of the predictions for X_tst against y_tst.
    """
    mdlRfsMlb = RandomForestRegressor(random_state=1)

    # scikit-learn warns on every fit when y is a column vector of shape (n, 1).
    # Flatten that case to 1-D; genuinely multi-column targets are left as they are.
    y_fit = np.asarray(y_trn)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    mdlRfsMlb.fit(X_trn, y_fit)
    y_tst_prd = mdlRfsMlb.predict(X_tst)
    return mean_absolute_error(y_tst, y_tst_prd)

In [9]:
# Try and build a model using all features.
# This is expected to fail because the text columns cannot be converted to floats.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    get_random_forest_mae(X_train, X_test, y_train, y_test)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: could not convert string to float: 'Brighton'

### Numerical Features

In [10]:
# List each feature's dtype to see which columns are numeric and which are text
X.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Method            object
SellerG           object
Date              object
Distance         float64
Postcode           int64
Bedroom2           int64
Bathroom           int64
Car              float64
Landsize           int64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount      int64
dtype: object

In [11]:
# Keep only the int64 / float64 columns
cols_num = [name for name, kind in X.dtypes.items() if kind in ('int64', 'float64')]
Xnum = X[cols_num]

In [12]:
# Split the numeric features into train and test.
# Uses the same test_size and random_state as the split of X; note that
# y_train and y_test are reassigned by this call.
Xnum_train, Xnum_test, y_train, y_test = train_test_split(Xnum, y, test_size=0.2, random_state=1)

In [13]:
# Try and build a model using only the numeric features.
# This is expected to fail because some numeric columns still contain missing values.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    get_random_forest_mae(Xnum_train, Xnum_test, y_train, y_test)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [14]:
# Count the missing values in each numeric column of the training split
Xnum_train.isna().sum()

Rooms               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                52
Landsize            0
BuildingArea     5193
YearBuilt        4312
Lattitude           0
Longtitude          0
Propertycount       0
dtype: int64

### Missing Values

#### Approach 1: Drop missing values

In [15]:
# Names of the numeric columns with at least one missing value in the training split
has_missing = Xnum_train.isna().any().to_numpy()
cols_num_null = Xnum_train.columns[has_missing].tolist()
cols_num_null

['Car', 'BuildingArea', 'YearBuilt']

In [16]:
# Remove the incomplete columns from both splits (the list comes from the training split)
Xnum_train_drpnull = Xnum_train.drop(columns=cols_num_null)
Xnum_test_drpnull = Xnum_test.drop(columns=cols_num_null)
Xnum_train_drpnull.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
1041,3,11.2,3186,3,1,366,-37.9038,145.0001,10579
1989,3,7.8,3058,3,1,238,-37.7539,144.9612,11204
10157,3,5.2,3056,3,1,439,-37.77047,144.97005,11918
1711,2,11.4,3163,2,1,0,-37.8863,145.066,7822
11565,4,11.0,3018,4,2,615,-37.87057,144.83623,5301


In [17]:
# Baseline: MAE when the columns with missing values are dropped entirely
get_random_forest_mae(Xnum_train_drpnull, Xnum_test_drpnull, y_train, y_test)

  mdlRfsMlb.fit(X_trn, y_trn)


176556.1092096132


#### Approach 2: Impute missing values

In [18]:
# Constant imputation: every missing entry becomes 0

Xnum_train_repnull, Xnum_test_repnull = (
    split.fillna(0) for split in (Xnum_train, Xnum_test)
)

print('MAE from replacing missing value columns: ')
get_random_forest_mae(Xnum_train_repnull, Xnum_test_repnull, y_train, y_test)

MAE from replacing missing value columns: 


  mdlRfsMlb.fit(X_trn, y_trn)


167656.98217318885


In [24]:
# Replace with ffill (carry the previous row's value forward).
# The rows were shuffled by train_test_split, so the "previous row" is an
# arbitrary house rather than a meaningful neighbour.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.

Xnum_train_repnull = Xnum_train.ffill()
Xnum_test_repnull = Xnum_test.ffill()

print('MAE from replacing missing value columns: ')
get_random_forest_mae(Xnum_train_repnull, Xnum_test_repnull, y_train, y_test)


MAE from replacing missing value columns: 


  mdlRfsMlb.fit(X_trn, y_trn)


172541.71958447297


In [25]:
# Mean imputation: fill each gap with that column's average over the training split.
# The test split is filled with the training averages as well, so no statistic
# is computed from the held-out rows.

Xnum_train_repnull, Xnum_test_repnull = (
    split.fillna(Xnum_train.mean()) for split in (Xnum_train, Xnum_test)
)

print('MAE from replacing missing value columns: ')
get_random_forest_mae(Xnum_train_repnull, Xnum_test_repnull, y_train, y_test)

MAE from replacing missing value columns: 


  mdlRfsMlb.fit(X_trn, y_trn)


172541.71958447297


In [23]:
# Going forward, let us replace all missing numeric values with the column mean

# Compute the training means here instead of reading Xnum_*_repnull: that name is
# reassigned by every imputation experiment, so its content depends on which
# cell happened to run last.
num_means = X_train[cols_num].mean()

# train_test_split returned slices of X; take explicit copies before assigning
# so pandas does not emit SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Both splits are filled with the training means.
X_train[cols_num] = X_train[cols_num].fillna(num_means)
X_test[cols_num] = X_test[cols_num].fillna(num_means)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


### Categorical Features

In [32]:
# Names of all text (object-dtype) feature columns
cols_obj = [name for name, kind in X.dtypes.items() if kind == 'object']
cols_obj

['Suburb',
 'Address',
 'Type',
 'Method',
 'SellerG',
 'Date',
 'CouncilArea',
 'Regionname']

In [33]:
# Label encoding all non-numeric features

from sklearn.preprocessing import LabelEncoder

Xle_train = X_train.copy()
Xle_test = X_test.copy()

label_encoder = LabelEncoder()

# This is expected to fail: the encoder is fitted on the training split only, and
# LabelEncoder.transform rejects any label in the test split that it has not seen.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    for col in cols_obj:
        Xle_train[col] = label_encoder.fit_transform(X_train[col])
        Xle_test[col] = label_encoder.transform(X_test[col])
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: y contains previously unseen labels: 'Beaconsfield Upper'

In [51]:
# Text columns with fewer than 10 distinct values are treated as categorical
cols_cat = [
    name
    for name, kind in X.dtypes.items()
    if kind == 'object' and X[name].nunique() < 10
]
cols_cat

['Type', 'Method', 'Regionname']

In [55]:
# Label encoding only categorical features

from sklearn.preprocessing import LabelEncoder

Xle_train = X_train.copy()
Xle_test = X_test.copy()

# One encoder per column, stored by column name, so every fitted mapping stays
# available (a single shared encoder only remembers the last column it was fitted on).
label_encoders = {}

for col in cols_cat:
    label_encoder = LabelEncoder()
    # Fit on the training split only, then apply the same mapping to the test split
    Xle_train[col] = label_encoder.fit_transform(X_train[col])
    Xle_test[col] = label_encoder.transform(X_test[col])
    label_encoders[col] = label_encoder

In [56]:
print("MAE from adding categorical columns: ")
# Score the forest on the numeric columns plus the label-encoded categorical ones
mae = get_random_forest_mae(
    Xle_train.loc[:, cols_num + cols_cat],
    Xle_test.loc[:, cols_num + cols_cat],
    y_train,
    y_test,
)
print(mae)

MAE from adding categorical columns: 


  mdlRfsMlb.fit(X_trn, y_trn)


160381.4257916053


### Build Gradient Boosting Model

In [68]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the library's default settings,
# trained on the numeric plus label-encoded categorical columns
mdlXgbMlb = XGBRegressor()
mdlXgbMlb.fit(Xle_train.loc[:, cols_num + cols_cat], y_train)

y_test_prd = mdlXgbMlb.predict(Xle_test.loc[:, cols_num + cols_cat])
mae = mean_absolute_error(y_test, y_test_prd)

print("MAE from default XGBoost model: ")
print(mae)

MAE from default XGBoost model: 
163202.95284310568


In [72]:
# XGBRegressor again, this time with a smaller learning rate, an explicit
# tree depth and an explicit number of boosting rounds
from xgboost import XGBRegressor

mdlXgbMlb = XGBRegressor(
    learning_rate=0.01,
    max_depth=5,
    n_estimators=5000,
)
mdlXgbMlb.fit(Xle_train.loc[:, cols_num + cols_cat], y_train)

y_test_prd = mdlXgbMlb.predict(Xle_test.loc[:, cols_num + cols_cat])
mae = mean_absolute_error(y_test, y_test_prd)

print("MAE from tuned XGBoost model: ")
print(mae)

MAE from tuned XGBoost model: 
157110.6315986285
