In [14]:
import pandas as pd 
import numpy as np
house_sales = "Resources/Perth_Housing_Prices2.csv"
sales_df = pd.read_csv(house_sales)
sales_df.set_index('ADDRESS', inplace=True)
sales_df.head()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
sales_df['GARAGE'] = sales_df['GARAGE'].fillna(0)

In [16]:
sales_df.columns

Index(['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK'],
      dtype='object')

In [17]:
# column names for recombining dataframe
columns = ['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK', 'Prediction']

In [18]:
# create empty dataframe for recombining into
combined_df = pd.DataFrame(columns=columns)
combined_df

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction


In [19]:
pred_house_sell = {}
temp = {}

# loop on every suburb
for SUBURB in sales_df['SUBURB'].unique():
    # slice
    sales_slice = sales_df[sales_df['SUBURB'] == SUBURB]
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH', 'NEAREST_SCH_RANK', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN_DIST', 'NEAREST_SCH_DIST'], axis=1)
    
    # scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

    #building linear regression model on training data
    model1= LinearRegression().fit(X_train, y_train)
    pred_house_sell[SUBURB] = model1.predict(X_test)
    
    # make prediction and add it to the sliced dataframe
    prediction = model1.predict(X_scaled)
    df['Prediction'] = prediction
   
    # rebuild full dataframe
    combined_df = pd.concat([combined_df, df], ignore_index=False)

    # print r2 score
    print('Accuracy Score:', r2_score(y_test, pred_house_sell[SUBURB]))  

    # build dictionary with all r2 scores
    temp[SUBURB] = r2_score(y_test, pred_house_sell[SUBURB])

# convert dictionary to dataframe
temp = pd.DataFrame.from_dict(temp, orient ='index').reset_index()

Accuracy Score: 0.6478040806092109
Accuracy Score: 0.5403139184686288
Accuracy Score: 0.6272796456563385
Accuracy Score: 0.28610535219579625
Accuracy Score: 0.5956744753551182
Accuracy Score: 0.5961696691110585
Accuracy Score: 0.2205435178632269
Accuracy Score: -0.2960605580659026
Accuracy Score: 0.2883743436470365
Accuracy Score: 0.5849787165035956
Accuracy Score: 0.14580250751334256
Accuracy Score: 0.28959893342938137
Accuracy Score: -0.17474276656050725
Accuracy Score: -0.47519601183712035
Accuracy Score: 0.21722063680425518
Accuracy Score: 0.2541397965316945
Accuracy Score: 0.6239426900779085
Accuracy Score: 0.6288208602995036
Accuracy Score: 0.45909135326361084
Accuracy Score: -0.30514168434145716
Accuracy Score: -0.2666264517991215
Accuracy Score: 0.6896704927497519
Accuracy Score: 0.6061671937506175
Accuracy Score: 0.7541286871696795
Accuracy Score: -0.057946524457281434
Accuracy Score: 0.4144232288936771
Accuracy Score: 0.6983689135595212
Accuracy Score: 0.1480276872821309
Accu



In [20]:
# rename columns for merging
temp.columns = ['SUBURB', 'r2 Score']
temp.head()

Unnamed: 0,SUBURB,r2 Score
0,Aveley,0.647804
1,Bibra Lake,0.540314
2,Ferndale,0.62728
3,Bedford,0.286105
4,Queens Park,0.595674


In [21]:
combined_df.head(2)

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction
9 Covenham Crescent,Aveley,430000,4,2,2,375,160,2017,22400,Woodbridge Station,11700,10-2020\r,6069,-31.786825,115.994063,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.167177,53.0,397258.53749
23 Semerwater Crescent,Aveley,358000,4,1,0,440,167,2016,22400,Midland Station,11400,06-2020\r,6069,-31.789123,116.000095,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.501317,53.0,379741.453582


In [22]:
# merge r2 Score onto recombined dataframe
sales_df_r2 = pd.merge(combined_df.reset_index(), temp, how='left', on=['SUBURB', 'SUBURB']).set_index('index')
sales_df_r2.head(2)

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9 Covenham Crescent,Aveley,430000,4,2,2,375,160,2017,22400,Woodbridge Station,11700,10-2020\r,6069,-31.786825,115.994063,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.167177,53.0,397258.53749,0.647804
23 Semerwater Crescent,Aveley,358000,4,1,0,440,167,2016,22400,Midland Station,11400,06-2020\r,6069,-31.789123,116.000095,SWAN VALLEY ANGLICAN COMMUNITY SCHOOL,1.501317,53.0,379741.453582,0.647804


In [23]:
# remove nan's of r2 scores that didn't have enough data to calculate
sales_df_r2['r2 Score'] = sales_df_r2['r2 Score'].fillna(0)

In [24]:
sales_df_r2.tail()

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
35 Windrow Grove,Whitby,450000,4,2,2,510,208,1940,38900,Armadale Station,14300,01-2017\r,6123,-32.281773,115.997529,COURT GRAMMAR SCHOOL,1.470061,,450000.0,0.0
37 Windrow Grove,Whitby,440000,4,2,2,510,179,1969,39000,Armadale Station,14300,06-2017\r,6123,-32.281772,115.997709,COURT GRAMMAR SCHOOL,1.476795,,440000.0,0.0
46 Windrow Grove,Whitby,430000,4,2,2,480,187,1965,39000,Armadale Station,14400,06-2017\r,6123,-32.282182,115.99712,COURT GRAMMAR SCHOOL,1.497308,,430000.0,0.0
23 Cedarleaf Entrance,Treeby,580000,4,2,2,373,182,2004,19800,Cockburn Central Station,1500,09-2019\r,6164,-32.129518,115.873926,ATWELL COLLEGE,1.851464,129.0,610000.0,0.0
31 Clementine Boulevard,Treeby,610000,4,2,2,428,276,2008,19300,Cockburn Central Station,1700,09-2019\r,6164,-32.124537,115.876416,ATWELL COLLEGE,2.449745,129.0,610000.0,0.0


In [25]:
# write csv to file
sales_df_r2.to_csv('Perth Housing Predictions and r2.csv', sep=',')