In [2]:
import pandas as pd 
import numpy as np
house_sales = "Resources/Perth_Housing_Prices2.csv"
sales_df = pd.read_csv(house_sales)
sales_df.set_index('ADDRESS', inplace=True)
sales_df.head()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
sales_df['GARAGE'] = sales_df['GARAGE'].fillna(0)

In [4]:
sales_df.columns

Index(['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK'],
      dtype='object')

In [5]:
# column names for recombining dataframe
columns = ['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK', 'Prediction']

In [6]:
# create empty dataframe for recombining into
combined_df = pd.DataFrame(columns=columns)
combined_df

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction


In [12]:
pred_house_sell = {}
temp = {}

# loop on every suburb
for SUBURB in sales_df['SUBURB'].unique():
    # slice
    sales_slice = sales_df[sales_df['SUBURB'] == SUBURB]
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH', 'NEAREST_SCH_RANK', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN_DIST', 'NEAREST_SCH_DIST'], axis=1)
    
    # scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

    #building linear regression model on training data
    model1= LinearRegression().fit(X_train, y_train)
    pred_house_sell[SUBURB] = model1.predict(X_test)
    
    # make prediction and add it to the sliced dataframe
    prediction = model1.predict(X_scaled)
    df['Prediction'] = prediction
   
    # rebuild full dataframe
    combined_df = pd.concat([combined_df, df], ignore_index=False)

    # print r2 score
    print('Accuracy Score:', r2_score(y_test, pred_house_sell[SUBURB]))  

    # build dictionary with all r2 scores
    temp[SUBURB] = r2_score(y_test, pred_house_sell[SUBURB])

# convert dictionary to dataframe
temp = pd.DataFrame.from_dict(temp, orient ='index').reset_index()

Accuracy Score: 0.6760870381452443
Accuracy Score: 0.8080780135497013
Accuracy Score: 0.5153912311264413
Accuracy Score: -0.15912477142034498
Accuracy Score: 0.003217128365855859
Accuracy Score: 0.44879573220892466
Accuracy Score: 0.12417706984956012
Accuracy Score: 0.36660484190859843
Accuracy Score: 0.5240016107400389
Accuracy Score: -230416.4516971735
Accuracy Score: 0.5165751528426297
Accuracy Score: 0.17354990575086748
Accuracy Score: 0.35701915069830636
Accuracy Score: -1.536930889926707
Accuracy Score: -0.004044056257718198
Accuracy Score: 0.1845736578937518
Accuracy Score: 0.74112201446405
Accuracy Score: 0.5014894103853068
Accuracy Score: 0.20066667257237747
Accuracy Score: -0.00798650187624883
Accuracy Score: 0.05456669059971575
Accuracy Score: 0.6682336339898971
Accuracy Score: 0.4467907569354015
Accuracy Score: 0.6313948357359804
Accuracy Score: -0.14721514335148878
Accuracy Score: 0.7439485112178427
Accuracy Score: 0.6718663556986462
Accuracy Score: 0.47088018984750335
Acc



In [19]:
# rename columns for merging
temp.columns = ['SUBURB', 'r2 Score']
temp.head()

Unnamed: 0,SUBURB,r2 Score
0,Aveley,0.676087
1,Bibra Lake,0.808078
2,Ferndale,0.515391
3,Bedford,-0.159125
4,Queens Park,0.003217


In [27]:
combined_df.head(2)

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction
9 Covenham Crescent,Aveley,430000,4,2,2,375,160,2017,22400,Woodbridge Station,11700,10-2020\r,6069,-31.786825,115.994063,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.167177,53.0,395079.114566
23 Semerwater Crescent,Aveley,358000,4,1,0,440,167,2016,22400,Midland Station,11400,06-2020\r,6069,-31.789123,116.000095,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.501317,53.0,374139.956622


In [26]:
# merge r2 Score onto recombined dataframe
sales_df_r2 = pd.merge(combined_df, temp, how='left', on=['SUBURB', 'SUBURB'])
sales_df_r2.head(2)

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
0,Aveley,430000,4,2,2,375,160,2017,22400,Woodbridge Station,11700,10-2020\r,6069,-31.786825,115.994063,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.167177,53.0,395079.114566,0.676087
1,Aveley,358000,4,1,0,440,167,2016,22400,Midland Station,11400,06-2020\r,6069,-31.789123,116.000095,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.501317,53.0,374139.956622,0.676087


In [21]:
# remove nan's of r2 scores that didn't have enough data to calculate
sales_df_r2['r2 Score'] = sales_df_r2['r2 Score'].fillna(0)

In [24]:
sales_df_r2.tail()

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
33646,Whitby,450000,4,2,2,510,208,1940,38900,Armadale Station,14300,01-2017\r,6123,-32.281773,115.997529,COURT GRAMMAR SCHOOL,1.470061,,450000.0,0.0
33647,Whitby,440000,4,2,2,510,179,1969,39000,Armadale Station,14300,06-2017\r,6123,-32.281772,115.997709,COURT GRAMMAR SCHOOL,1.476795,,440000.0,0.0
33648,Whitby,430000,4,2,2,480,187,1965,39000,Armadale Station,14400,06-2017\r,6123,-32.282182,115.99712,COURT GRAMMAR SCHOOL,1.497308,,430000.0,0.0
33649,Treeby,580000,4,2,2,373,182,2004,19800,Cockburn Central Station,1500,09-2019\r,6164,-32.129518,115.873926,ATWELL COLLEGE,1.851464,129.0,580000.0,0.0
33650,Treeby,610000,4,2,2,428,276,2008,19300,Cockburn Central Station,1700,09-2019\r,6164,-32.124537,115.876416,ATWELL COLLEGE,2.449745,129.0,580000.0,0.0


In [23]:
# write csv to file
sales_df_r2.to_csv('Perth Housing Predictions and r2.csv', sep=',')