In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from utils import *

cwd = os.getcwd()
N = 1

# Raw Data Appraisal

In [4]:
# Data for experiments
data = pd.read_csv(os.path.join(cwd, 'data/full_v3.csv'))
data

Unnamed: 0,PropID,Suburb,Bedrms,Bathrms,Cars,LandSize,BuildingArea,YearBuilt,Price,Month,Year,ICSEA_Primary,ICSEA_Year7,IRSD,IRSAD,IER,IEO,Population
0,Abbotsford_VIC_3067_1,Abbotsford,3.0,2.0,1.0,0.0,180.0,1900.0,1325000.0,9.0,2016.0,1063.0,1041.0,1077,1114,952,1159.0,8184.0
1,Abbotsford_VIC_3067_2,Abbotsford,2.0,1.0,0.0,106.0,67.0,1925.0,775000.0,3.0,2015.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
2,Abbotsford_VIC_3067_9,Abbotsford,2.0,1.0,0.0,140.0,89.0,1890.0,1100000.0,10.0,2015.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
3,Abbotsford_VIC_3067_10,Abbotsford,2.0,1.0,1.0,156.0,98.0,1900.0,847000.0,7.0,2013.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
4,Abbotsford_VIC_3067_11,Abbotsford,1.0,1.0,0.0,155.0,79.0,1900.0,1035000.0,2.0,2017.0,1063.0,1041.0,1077,1114,952,1159.0,8184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160264,Yarrambat_VIC_3091_106,Yarrambat,5.0,3.0,2.0,1321.0,213.0,1980.0,495000.0,10.0,2014.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160265,Yarrambat_VIC_3091_106,Yarrambat,3.0,2.0,2.0,1321.0,213.0,1980.0,610000.0,1.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160266,Yarrambat_VIC_3091_177,Yarrambat,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,9.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160267,Yarrambat_VIC_3091_181,Yarrambat,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,9.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0


In [5]:
encdata = pd.get_dummies(data[['Suburb', 'Bedrms', 'Bathrms', 'Cars', 'LandSize', 'BuildingArea', 'YearBuilt', 'Price', 'Year']], columns=['Suburb'])
encdata

Unnamed: 0,Bedrms,Bathrms,Cars,LandSize,BuildingArea,YearBuilt,Price,Year,Suburb_Abbotsford,Suburb_Aberfeldie,...,Suburb_Wonga Park,Suburb_Woori Yallock,Suburb_Wyndham Vale,Suburb_Yallambie,Suburb_Yan Yean,Suburb_Yannathan,Suburb_Yarra Glen,Suburb_Yarra Junction,Suburb_Yarrambat,Suburb_Yellingbo
0,3.0,2.0,1.0,0.0,180.0,1900.0,1325000.0,2016.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1.0,0.0,106.0,67.0,1925.0,775000.0,2015.0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,1.0,0.0,140.0,89.0,1890.0,1100000.0,2015.0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,1.0,1.0,156.0,98.0,1900.0,847000.0,2013.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,1.0,0.0,155.0,79.0,1900.0,1035000.0,2017.0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160264,5.0,3.0,2.0,1321.0,213.0,1980.0,495000.0,2014.0,0,0,...,0,0,0,0,0,0,0,0,1,0
160265,3.0,2.0,2.0,1321.0,213.0,1980.0,610000.0,2017.0,0,0,...,0,0,0,0,0,0,0,0,1,0
160266,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,2017.0,0,0,...,0,0,0,0,0,0,0,0,1,0
160267,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,2017.0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
LR_result = []
RF_result = []
KNN_result = []
SV_result = []
XGB_result = []
LGB_result = []

for i in range(N):
    train_data = encdata[encdata['Year']<2017].drop(columns=['Year']).sample(frac=1)
    test_data = encdata[encdata['Year']>2016].drop(columns=['Year']).sample(frac=1)
    X_train = train_data.drop(columns=['Price'])
    y_train = train_data['Price'].apply(np.log)
    X_test = test_data.drop(columns=['Price'])
    y_test = test_data['Price'].apply(np.log)

    SV = SVR()
    SV.fit(X_train, y_train)
    SV_result.append(np.mean(abs(pct_error(y_test, SV.predict(X_test), log=False))))

    XGB = xgb.XGBRegressor()
    XGB.fit(X_train, y_train)
    XGB_result.append(np.mean(abs(pct_error(y_test, XGB.predict(X_test), log=False))))

    LGB = lgb.LGBMRegressor()
    LGB.fit(X_train, y_train)
    LGB_result.append(np.mean(abs(pct_error(y_test, LGB.predict(X_test), log=False))))

LR = LinearRegression()
LR.fit(X_train, y_train)
LR_result.append(np.mean(abs(pct_error(y_test, LR.predict(X_test), log=False))))

RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_result.append(np.mean(abs(pct_error(y_test, RF.predict(X_test), log=False))))

KNN = KNeighborsRegressor()
KNN.fit(X_train, y_train)
KNN_result.append(np.mean(abs(pct_error(y_test, KNN.predict(X_test), log=False))))

print("Raw Data \n","-"*10)
print("Linear Regression")
print(np.mean(LR_result), np.var(LR_result))
print("Random Forest")
print(np.mean(RF_result), np.var(RF_result))
print("K Nearest Neighbors")
print(np.mean(KNN_result), np.var(KNN_result))
print("Support Vector")
print(np.mean(SV_result), np.var(SV_result))
print("XGBoost")
print(np.mean(XGB_result), np.var(XGB_result))
print("LightGBM")
print(np.mean(LGB_result), np.var(LGB_result))

25.95747950787382 0.0
27.88899877202018 0.00015981342801465255
26.005540303522885 6.310887241768095e-30


# Suburb Replaced with Quantities

In [8]:
subdata = data.drop(columns=['PropID', 'Suburb', 'Month'])
subdata

Unnamed: 0,Bedrms,Bathrms,Cars,LandSize,BuildingArea,YearBuilt,Price,Year,ICSEA_Primary,ICSEA_Year7,IRSD,IRSAD,IER,IEO,Population
0,3.0,2.0,1.0,0.0,180.0,1900.0,1325000.0,2016.0,1063.0,1041.0,1077,1114,952,1159.0,8184.0
1,2.0,1.0,0.0,106.0,67.0,1925.0,775000.0,2015.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
2,2.0,1.0,0.0,140.0,89.0,1890.0,1100000.0,2015.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
3,2.0,1.0,1.0,156.0,98.0,1900.0,847000.0,2013.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
4,1.0,1.0,0.0,155.0,79.0,1900.0,1035000.0,2017.0,1063.0,1041.0,1077,1114,952,1159.0,8184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160264,5.0,3.0,2.0,1321.0,213.0,1980.0,495000.0,2014.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160265,3.0,2.0,2.0,1321.0,213.0,1980.0,610000.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160266,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160267,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0


In [None]:
LR_result = []
RF_result = []
KNN_result = []
SV_result = []
XGB_result = []
LGB_result = []

for i in range(N):
    train_data = subdata[subdata['Year']<2017].drop(columns=['Year']).sample(frac=1)
    test_data = subdata[subdata['Year']>2016].drop(columns=['Year']).sample(frac=1)
    X_train = train_data.drop(columns=['Price'])
    y_train = train_data['Price'].apply(np.log)
    X_test = test_data.drop(columns=['Price'])
    y_test = test_data['Price'].apply(np.log)

    SV = SVR()
    SV.fit(X_train, y_train)
    SV_result.append(np.mean(abs(pct_error(y_test, SV.predict(X_test), log=False))))

    XGB = xgb.XGBRegressor()
    XGB.fit(X_train, y_train)
    XGB_result.append(np.mean(abs(pct_error(y_test, XGB.predict(X_test), log=False))))

    LGB = lgb.LGBMRegressor()
    LGB.fit(X_train, y_train)
    LGB_result.append(np.mean(abs(pct_error(y_test, LGB.predict(X_test), log=False))))

LR = LinearRegression()
LR.fit(X_train, y_train)
LR_result.append(np.mean(abs(pct_error(y_test, LR.predict(X_test), log=False))))

RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_result.append(np.mean(abs(pct_error(y_test, RF.predict(X_test), log=False))))

KNN = KNeighborsRegressor()
KNN.fit(X_train, y_train)
KNN_result.append(np.mean(abs(pct_error(y_test, KNN.predict(X_test), log=False))))

print("Suburb replaced with Quantities\n","-"*10)
print("Linear Regression")
print(np.mean(LR_result), np.var(LR_result))
print("Random Forest")
print(np.mean(RF_result), np.var(RF_result))
print("K Nearest Neighbors")
print(np.mean(KNN_result), np.var(KNN_result))
print("Support Vector")
print(np.mean(SV_result), np.var(SV_result))
print("XGBoost")
print(np.mean(XGB_result), np.var(XGB_result))
print("LightGBM")
print(np.mean(LGB_result), np.var(LGB_result))

# Put Month Inside

In [9]:
mondata = data.drop(columns=['PropID', 'Suburb'])
mondata

Unnamed: 0,Bedrms,Bathrms,Cars,LandSize,BuildingArea,YearBuilt,Price,Month,Year,ICSEA_Primary,ICSEA_Year7,IRSD,IRSAD,IER,IEO,Population
0,3.0,2.0,1.0,0.0,180.0,1900.0,1325000.0,9.0,2016.0,1063.0,1041.0,1077,1114,952,1159.0,8184.0
1,2.0,1.0,0.0,106.0,67.0,1925.0,775000.0,3.0,2015.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
2,2.0,1.0,0.0,140.0,89.0,1890.0,1100000.0,10.0,2015.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
3,2.0,1.0,1.0,156.0,98.0,1900.0,847000.0,7.0,2013.0,1041.0,1041.0,1077,1114,952,1159.0,8184.0
4,1.0,1.0,0.0,155.0,79.0,1900.0,1035000.0,2.0,2017.0,1063.0,1041.0,1077,1114,952,1159.0,8184.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160264,5.0,3.0,2.0,1321.0,213.0,1980.0,495000.0,10.0,2014.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160265,3.0,2.0,2.0,1321.0,213.0,1980.0,610000.0,1.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160266,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,9.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0
160267,3.0,2.0,2.0,1311.0,163.0,1985.0,781000.0,9.0,2017.0,1057.0,1020.0,1103,1097,1147,1047.0,1588.0


In [None]:
LR_result = []
RF_result = []
KNN_result = []
SV_result = []
XGB_result = []
LGB_result = []

for i in range(N):
    train_data = mondata[mondata['Year']<2017].drop(columns=['Year']).sample(frac=1)
    test_data = mondata[mondata['Year']>2016].drop(columns=['Year']).sample(frac=1)
    X_train = train_data.drop(columns=['Price'])
    y_train = train_data['Price'].apply(np.log)
    X_test = test_data.drop(columns=['Price'])
    y_test = test_data['Price'].apply(np.log)

    SV = SVR()
    SV.fit(X_train, y_train)
    SV_result.append(np.mean(abs(pct_error(y_test, SV.predict(X_test), log=False))))

    XGB = xgb.XGBRegressor()
    XGB.fit(X_train, y_train)
    XGB_result.append(np.mean(abs(pct_error(y_test, XGB.predict(X_test), log=False))))

    LGB = lgb.LGBMRegressor()
    LGB.fit(X_train, y_train)
    LGB_result.append(np.mean(abs(pct_error(y_test, LGB.predict(X_test), log=False))))

LR = LinearRegression()
LR.fit(X_train, y_train)
LR_result.append(np.mean(abs(pct_error(y_test, LR.predict(X_test), log=False))))

RF = RandomForestRegressor()
RF.fit(X_train, y_train)
RF_result.append(np.mean(abs(pct_error(y_test, RF.predict(X_test), log=False))))

KNN = KNeighborsRegressor()
KNN.fit(X_train, y_train)
KNN_result.append(np.mean(abs(pct_error(y_test, KNN.predict(X_test), log=False))))

print("Data with Month\n","-"*10)
print("Linear Regression")
print(np.mean(LR_result), np.var(LR_result))
print("Random Forest")
print(np.mean(RF_result), np.var(RF_result))
print("K Nearest Neighbors")
print(np.mean(KNN_result), np.var(KNN_result))
print("Support Vector")
print(np.mean(SV_result), np.var(SV_result))
print("XGBoost")
print(np.mean(XGB_result), np.var(XGB_result))
print("LightGBM")
print(np.mean(LGB_result), np.var(LGB_result))

# Data with Hindex

In [24]:
pd.options.mode.chained_assignment = None
data_wt = data.iloc[:0]
for year in np.linspace(2013.0,2017.0,5):
    for month in np.linspace(1.0,12.0,12):
        temp_data = data[(data['Year']==year) & (data['Month']==month)]
        Q1, Q2, Q3 = np.quantile(temp_data['Price'],[0.25,0.5,0.75])
        if year == 2013.0 and month == 1.0:
            temp_data[['Q1','Q2','Q3']] = [Q1, Q2, Q3]
        else:
            temp_data[['Q1','Q2','Q3']] = old
        old = [Q1, Q2, Q3]
        
        temp_data['Hindex2'] = temp_data.apply(lambda row:  0.0 if row['Price']<row['Q1'] else 
                        (1.0 if row['Price']<row['Q2'] else 
                        (2.0 if row['Price']<row['Q3'] else 3.0)), axis=1)
        data_wt = pd.concat([data_wt,temp_data],ignore_index=True)
data_wt

Unnamed: 0,PropID,Suburb,Bedrms,Bathrms,Cars,LandSize,BuildingArea,YearBuilt,Price,ICSEA_Primary,...,IER,IEO,Population,Month,Year,Q1,Q2,Q3,Hindex,Hindex2
0,Albert_Park_VIC_3206_647,Albert Park,2.0,1.0,0.0,82.0,71.0,1910.0,13.592367,1155.0,...,1031,1167.0,6215.0,1.0,2013.0,12.718896,12.959844,13.270783,3.0,3.0
1,Albert_Park_VIC_3206_915,Albert Park,2.0,1.0,0.0,82.0,71.0,1910.0,13.592367,1155.0,...,1031,1167.0,6215.0,1.0,2013.0,12.718896,12.959844,13.270783,3.0,3.0
2,Alphington_VIC_3078_347,Alphington,3.0,2.0,0.0,691.0,176.0,1930.0,13.883169,1168.0,...,1044,1145.0,5080.0,1.0,2013.0,12.718896,12.959844,13.270783,3.0,3.0
3,Altona_Meadows_VIC_3028_872,Altona Meadows,4.0,2.0,2.0,464.0,150.0,2006.0,12.923912,1098.0,...,977,953.0,19160.0,1.0,2013.0,12.718896,12.959844,13.270783,,1.0
4,Altona_Meadows_VIC_3028_1654,Altona Meadows,4.0,2.0,1.0,464.0,184.0,2000.0,13.049793,1098.0,...,977,953.0,19160.0,1.0,2013.0,12.718896,12.959844,13.270783,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160264,Wyndham_Vale_VIC_3024_3314,Wyndham Vale,4.0,2.0,2.0,448.0,171.0,2012.0,13.180632,1015.0,...,1004,952.0,23273.0,12.0,2017.0,13.240813,13.473020,13.886085,0.0,0.0
160265,Yarra_Junction_VIC_3797_47,Yarra Junction,3.0,2.0,4.0,1258.0,191.0,2001.0,13.422468,1007.0,...,995,948.0,2549.0,12.0,2017.0,13.240813,13.473020,13.886085,1.0,1.0
160266,Yarra_Junction_VIC_3797_173,Yarra Junction,3.0,2.0,2.0,602.0,139.0,2010.0,13.223113,1007.0,...,995,948.0,2549.0,12.0,2017.0,13.240813,13.473020,13.886085,1.0,0.0
160267,Yarra_Junction_VIC_3797_253,Yarra Junction,3.0,2.0,2.0,100.0,136.0,1985.0,13.304685,1007.0,...,995,948.0,2549.0,12.0,2017.0,13.240813,13.473020,13.886085,1.0,1.0


In [25]:
data = data_wt.drop(columns=['PropID', 'Suburb', 'Month', 'Price', 'Q1', 'Q2', 'Q3', 'Hindex'])
train_data = data[data['Year']<2017].drop(columns=['Year'])
test_data = data[data['Year']>2016].drop(columns=['Year'])
X_train = train_data.drop(columns=['Hindex'])
y_train = train_data['Hindex']
X_test = test_data.drop(columns=['Hindex'])
y_test = test_data['Hindex']
cla = lgb.LGBMClassifier()
cla.fit(X_train, y_train)
cla.score(X_test, y_test)

ValueError: Input contains NaN.

In [15]:
data = pd.read_csv(os.path.join(cwd, 'data/ver2_house_sold_real.csv'))


  data[data[data['Year']>2016]['Hindex'].isna()]


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).