In [2]:
import pandas as pd 
import numpy as np
house_sales = "../Resources/Perth_Housing_Prices450_D.csv"
sales_450 = pd.read_csv(house_sales)
sales_450.set_index('ADDRESS', inplace=True)

house_sales2 = "../Resources/Perth_Housing_Prices1000_D.csv"
sales_1000 = pd.read_csv(house_sales2)
sales_1000.set_index('ADDRESS', inplace=True)

house_sales3 = "../Resources/Perth_Housing_Prices_plus_D.csv"
sales_plus = pd.read_csv(house_sales3)
sales_plus.set_index('ADDRESS', inplace=True)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# column names for recombining dataframe
columns = ['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK', 'Prediction']

In [4]:
# create empty dataframe for recombining into
combined_df = pd.DataFrame(columns=columns)
combined_df

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction


In [5]:
# linear regression model with dataframe broken down by suburb and only properties with
# land area less than 450m2 analysed.


pred_house_sell = {}


# loop on every suburb
for SUBURB in sales_450['SUBURB'].unique():
    # slice
    sales_slice = sales_450[sales_450['SUBURB'] == SUBURB]
    sales_slice = sales_slice[sales_slice['LAND_AREA'] < 450]
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 
                          'NEAREST_SCH', 'NEAREST_SCH_RANK'], axis=1)
    
    # scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

    #building linear regression model on training data
    model1 = LinearRegression().fit(X_train, y_train)
    pred_house_sell[SUBURB] = model1.predict(X_test)
    
    # make prediction and add it to the sliced dataframe
    prediction = model1.predict(X_scaled)
    df['Prediction'] = prediction
   
    # rebuild full dataframe
    combined_df = pd.concat([combined_df, df], ignore_index=False)

    # print r2 score
    print('Accuracy Score:', SUBURB, r2_score(y_test, pred_house_sell[SUBURB]))  


Accuracy Score: Butler 0.23191025603633364
Accuracy Score: Alkimos 0.11616288526644292
Accuracy Score: Banksia Grove 0.71584611122515
Accuracy Score: Pearsall -0.8501845981838048
Accuracy Score: Harrisdale -0.17257155092719745
Accuracy Score: Shenton Park 0.6020106795352951
Accuracy Score: Bertram -0.7274736360347616
Accuracy Score: Aveley 0.5518728448445402
Accuracy Score: Redcliffe -1.2854320403579385
Accuracy Score: North Fremantle -0.2524160499873027
Accuracy Score: Queens Park 0.4718840818094563
Accuracy Score: Eglinton 0.1114513895245155
Accuracy Score: West Leederville 0.6857471685708383
Accuracy Score: Inglewood 0.7323165118177912
Accuracy Score: Clarkson -0.8215015698079655
Accuracy Score: South Fremantle 0.5012635895443157
Accuracy Score: Wandi -1.309577734153168
Accuracy Score: Innaloo 0.1821724654098189
Accuracy Score: Ascot 0.35499311965182934
Accuracy Score: Joondanna 0.4702659611593113
Accuracy Score: Ridgewood 0.10320736828309773
Accuracy Score: Cannington -0.3181521773

In [6]:
# linear regression model with dataframe broken down by suburb and only properties with
# land area 450m2 to less than 1000m2 analysed.


pred_house_sell = {}


# loop on every suburb
for SUBURB in sales_1000['SUBURB'].unique():
    # slice
    sales_slice = sales_1000[sales_1000['SUBURB'] == SUBURB]
    sales_slice = sales_slice[(sales_slice['LAND_AREA'] > 449) & (sales_slice['LAND_AREA'] < 1000)]
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 
                          'NEAREST_SCH', 'NEAREST_SCH_RANK'], axis=1)
    
    # scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

    #building linear regression model on training data
    model2 = LinearRegression().fit(X_train, y_train)
    pred_house_sell[SUBURB] = model2.predict(X_test)
    
    # make prediction and add it to the sliced dataframe
    prediction = model2.predict(X_scaled)
    df['Prediction'] = prediction
   
    # rebuild full dataframe
    combined_df = pd.concat([combined_df, df], ignore_index=False)

    # print r2 score
    print('Accuracy Score:', SUBURB, r2_score(y_test, pred_house_sell[SUBURB]))

Accuracy Score: Merriwa 0.005130461113675544
Accuracy Score: Carramar 0.5317142597306166
Accuracy Score: Iluka 0.6420652883034017
Accuracy Score: Jane Brook 0.25535190827953735
Accuracy Score: Tapping 0.4463518265607016
Accuracy Score: Huntingdale 0.3167497168332005
Accuracy Score: Port Kennedy 0.6309119305761374
Accuracy Score: Bertram 0.3835392540583019
Accuracy Score: Secret Harbour 0.7894131847954111
Accuracy Score: Hocking 0.3440035129635136
Accuracy Score: Mindarie 0.61031330541179
Accuracy Score: Darch 0.4037895910629218
Accuracy Score: Atwell 0.7316306549238394
Accuracy Score: Wattle Grove 0.3044835528191302
Accuracy Score: Bennett Springs 0.20702945670174133
Accuracy Score: Gwelup 0.5483942936247828
Accuracy Score: High Wycombe 0.6815831973520532
Accuracy Score: Currambine 0.27425891984446227
Accuracy Score: Quinns Rocks 0.8164437437801837
Accuracy Score: Singleton 0.45162243051373563
Accuracy Score: Kinross 0.4288855825362633
Accuracy Score: Warnbro 0.5999609272260409
Accurac

In [7]:
# linear regression model with dataframe broken down by suburb and only properties with
# land area 1000m2 or greater analysed.


pred_house_sell = {}


# loop on every suburb
for SUBURB in sales_plus['SUBURB'].unique():
    # slice
    sales_slice = sales_plus[sales_plus['SUBURB'] == SUBURB]
    sales_slice = sales_slice[sales_slice['LAND_AREA'] > 999]
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 
                          'NEAREST_SCH', 'NEAREST_SCH_RANK'], axis=1)
    
    # scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

    #building linear regression model on training data
    model3 = LinearRegression().fit(X_train, y_train)
    pred_house_sell[SUBURB] = model3.predict(X_test)
    
    # make prediction and add it to the sliced dataframe
    prediction = model3.predict(X_scaled)
    df['Prediction'] = prediction
   
    # rebuild full dataframe
    combined_df = pd.concat([combined_df, df], ignore_index=False)

    # print r2 score
    print('Accuracy Score:', SUBURB, r2_score(y_test, pred_house_sell[SUBURB]))

Accuracy Score: Darlington 0.012426651982863302
Accuracy Score: Roleystone 0.408755984259059
Accuracy Score: Mount Helena 0.4483366636369638
Accuracy Score: Lesmurdie 0.5409849060837155
Accuracy Score: Gooseberry Hill 0.10430902059348168
Accuracy Score: Mundaring 0.4004608312078748
Accuracy Score: Bedfordale 0.04159060059356812
Accuracy Score: Stoneville 0.5440444594183798
Accuracy Score: Kalamunda -0.11923571178115111
Accuracy Score: Glen Forrest 0.6632596799479318
Accuracy Score: Helena Valley 0.5971346311116772
Accuracy Score: The Vines -0.04785549481258444
Accuracy Score: Parkerville 0.6778272909650508
Accuracy Score: Maida Vale 0.6125099455766441
Accuracy Score: Serpentine 0.38033947727636386
Accuracy Score: Gidgegannup 0.5276788192208878
Accuracy Score: Greenmount 0.009307570578659052
Accuracy Score: Bullsbrook -16.948544274371528
Accuracy Score: Oakford -0.694246751044928
Accuracy Score: Dalkeith 0.11512303537871726
Accuracy Score: Mount Richon 0.7698654298117392
Accuracy Score:

In [9]:
# empty dictionary for r2 scores
temp = {}

# iterate through recombined dataframe to attain the r2 score for each suburb
for SUBURB in combined_df['SUBURB'].unique():
    sales_slice = combined_df[combined_df['SUBURB'] == SUBURB]
    #df = sales_slice.copy()
    price = sales_slice['PRICE']
    print('Accuracy Score:', SUBURB, r2_score(price, sales_slice['Prediction']))
    temp[SUBURB] = r2_score(price, sales_slice['Prediction'])

# convert r2 dictionary to dataframe
temp = pd.DataFrame.from_dict(temp, orient ='index').reset_index()
temp.head()

Accuracy Score: Butler 0.7442912287744253
Accuracy Score: Alkimos 0.6408411126955679
Accuracy Score: Banksia Grove 0.7358804626508805
Accuracy Score: Pearsall 0.7306469304053844
Accuracy Score: Harrisdale 0.8017784049682389
Accuracy Score: Shenton Park 0.7304788334768334
Accuracy Score: Bertram 0.6068838116946744
Accuracy Score: Aveley 0.7638809941724093
Accuracy Score: Redcliffe 0.4491746096803467
Accuracy Score: North Fremantle -13.739752122500985
Accuracy Score: Queens Park 0.6969918920981267
Accuracy Score: Eglinton -6.437628653719629
Accuracy Score: West Leederville 0.6909078948106073
Accuracy Score: Inglewood 0.8362437419949704
Accuracy Score: Clarkson 0.7046989076725676
Accuracy Score: South Fremantle 0.7256993657261297
Accuracy Score: Wandi 0.5949253376438116
Accuracy Score: Innaloo 0.6521398935796696
Accuracy Score: Ascot 0.4514703971962064
Accuracy Score: Joondanna 0.5537558037387602
Accuracy Score: Ridgewood 0.34229136732378584
Accuracy Score: Cannington -0.5332306437640641


Unnamed: 0,index,0
0,Butler,0.744291
1,Alkimos,0.640841
2,Banksia Grove,0.73588
3,Pearsall,0.730647
4,Harrisdale,0.801778


In [10]:
# rename columns for merging
temp.columns = ['SUBURB', 'r2 Score']
temp.head()

Unnamed: 0,SUBURB,r2 Score
0,Butler,0.744291
1,Alkimos,0.640841
2,Banksia Grove,0.73588
3,Pearsall,0.730647
4,Harrisdale,0.801778


In [11]:
# merge r2 Score onto recombined dataframe
sales_df_r2 = pd.merge(combined_df.reset_index(), temp, how='left', on=['SUBURB', 'SUBURB']).set_index('index')
sales_df_r2.head(2)

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
32 Langport Way,Butler,339000,3,2,2,362,144,2013,36000,Butler Station,2500,9/01/2018,6036,-31.651509,115.718454,IRENE MCCORMACK CATHOLIC COLLEGE,0.898715,120.0,336522.18167,0.744291
3 Landbeach Boulevard,Butler,390000,3,2,2,365,146,2008,36900,Butler Station,1400,10/01/2018,6036,-31.646096,115.708062,BUTLER COLLEGE,0.622269,39.0,311312.773832,0.744291


In [17]:
# create a dataframe from the counts of properties per suburb
counts_df = combined_df['SUBURB'].value_counts().rename_axis('SUBURB').reset_index(name='property_count')
counts_df

Unnamed: 0,SUBURB,property_count
0,Bertram,231
1,Iluka,212
2,Carramar,208
3,Butler,207
4,Mindarie,202
...,...,...
291,Carmel,9
292,West Swan,8
293,Glendalough,8
294,Belhus,8


In [18]:
# add the property count to the r2 score dataframe
temp2 = pd.merge(temp, counts_df, how='left', on=['SUBURB', 'SUBURB'])
temp2

Unnamed: 0,SUBURB,r2 Score,property_count
0,Butler,0.744291,207
1,Alkimos,0.640841,114
2,Banksia Grove,0.735880,165
3,Pearsall,0.730647,145
4,Harrisdale,0.801778,147
...,...,...,...
291,Orange Grove,-0.591843,10
292,Carmel,0.959620,9
293,Glendalough,-8.007986,8
294,West Swan,0.387529,8


In [19]:
# write the r2 score/property count dataframe to csv
temp2.to_csv('suburb_r2_land_size_date_count.csv', sep=',')