In [83]:
import pandas as pd 
import numpy as np
house_sales = "Resources/Perth_Housing_Prices.csv"
sales_df = pd.read_csv(house_sales)
sales_df.set_index('ADDRESS', inplace=True)
sales_df.head()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [84]:
sales_df['GARAGE'] = sales_df['GARAGE'].fillna(0)

In [85]:
sales_df['SUBURB'].value_counts()

Bertram            231
Iluka              212
Bennett Springs    211
Mindarie           209
Carramar           208
                  ... 
Northbridge         11
Keysbrook            9
O'Connor             8
Neerabup             8
Hope Valley          6
Name: SUBURB, Length: 314, dtype: int64

In [86]:
sales_df.columns

Index(['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK'],
      dtype='object')

In [87]:
# column names for recombining dataframe
columns = ['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK', 'Prediction']

In [88]:
# create empty dataframe for recombining into
combined_df = pd.DataFrame(columns=columns)
combined_df

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction


In [89]:
pred_house_sell = {}
temp = {}

# loop on every suburb
for SUBURB in sales_df['SUBURB'].unique():
    # slice
    sales_slice = sales_df[sales_df['SUBURB'] == SUBURB]
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE', 'NEAREST_SCH', 'NEAREST_SCH_RANK', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN_DIST', 'NEAREST_SCH_DIST'], axis=1)
    
    # scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # train, test, split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

    #building linear regression model on training data
    model1= LinearRegression().fit(X_train, y_train)
    pred_house_sell[SUBURB] = model1.predict(X_test)
    
    # make prediction and add it to the sliced dataframe
    prediction = model1.predict(X_scaled)
    df['Prediction'] = prediction
   
    # rebuild full dataframe
    combined_df = pd.concat([combined_df, df], ignore_index=False)

    # print r2 score
    print('Accuracy Score:', r2_score(y_test, pred_house_sell[SUBURB]))  

    # build dictionary with all r2 scores
    temp[SUBURB] = r2_score(y_test, pred_house_sell[SUBURB])

# convert dictionary to dataframe
temp = pd.DataFrame.from_dict(temp, orient ='index').reset_index()

Accuracy Score: 0.5934606601206013
Accuracy Score: 0.23613575569252243
Accuracy Score: 0.25065497954174676
Accuracy Score: 0.5665876532162732
Accuracy Score: 0.3411938839602784
Accuracy Score: 0.48751268813871584
Accuracy Score: -3.3054119132414934
Accuracy Score: 0.4065595469851466
Accuracy Score: 0.47255755476602945
Accuracy Score: 0.2942810238413356
Accuracy Score: 0.5414612342846554
Accuracy Score: 0.6868752773108918
Accuracy Score: 0.3998586347182962
Accuracy Score: 0.18127648162769572
Accuracy Score: 0.4949101660105927
Accuracy Score: -0.2287978722810331
Accuracy Score: -0.04138171459372697
Accuracy Score: 0.26149174430787236
Accuracy Score: -0.25443169437044166
Accuracy Score: 0.4183411134377196
Accuracy Score: 0.4945200028164749
Accuracy Score: 0.520745448873193
Accuracy Score: 0.5832857846167994
Accuracy Score: 0.840843986664902
Accuracy Score: 0.28686500388427305
Accuracy Score: 0.4340432910883729
Accuracy Score: 0.7330713455254387
Accuracy Score: -1504.1254863673587
Accuracy

In [90]:
# rename columns for merging
temp.columns = ['SUBURB', 'r2 Score']
temp.head()

Unnamed: 0,SUBURB,r2 Score
0,Floreat,0.593461
1,City Beach,0.236136
2,Dalkeith,0.250655
3,Mosman Park,0.566588
4,Watermans Bay,0.341194


In [91]:
combined_df.head(2)

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction
35 Oakdale Street,Floreat,2440000,6,4,2,809,442,2009,6800,Loch Street Station,3500,2018-01-08,6014,-31.9406,115.78772,NEWMAN COLLEGE,1.936173,44.0,2207074.0
127 Grovedale Road,Floreat,2325000,4,2,2,709,382,2013,6100,Karrakatta Station,3000,2020-01-02,6014,-31.94122,115.79516,NEWMAN COLLEGE,2.135921,44.0,1675303.0


In [92]:
# merge r2 Score onto recombined dataframe
sales_df_r2 = pd.merge(combined_df.reset_index(), temp, how='left', on=['SUBURB', 'SUBURB']).set_index('index')
sales_df_r2.head(2)

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
35 Oakdale Street,Floreat,2440000,6,4,2,809,442,2009,6800,Loch Street Station,3500,2018-01-08,6014,-31.9406,115.78772,NEWMAN COLLEGE,1.936173,44.0,2207074.0,0.593461
127 Grovedale Road,Floreat,2325000,4,2,2,709,382,2013,6100,Karrakatta Station,3000,2020-01-02,6014,-31.94122,115.79516,NEWMAN COLLEGE,2.135921,44.0,1675303.0,0.593461


In [93]:
# remove nan's of r2 scores that didn't have enough data to calculate
sales_df_r2['r2 Score'] = sales_df_r2['r2 Score'].fillna(0)

In [94]:
sales_df_r2.head()

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
35 Oakdale Street,Floreat,2440000,6,4,2,809,442,2009,6800,Loch Street Station,3500,2018-01-08,6014,-31.9406,115.78772,NEWMAN COLLEGE,1.936173,44.0,2207074.0,0.593461
127 Grovedale Road,Floreat,2325000,4,2,2,709,382,2013,6100,Karrakatta Station,3000,2020-01-02,6014,-31.94122,115.79516,NEWMAN COLLEGE,2.135921,44.0,1675303.0,0.593461
11 Kilkenny Road,Floreat,2300000,4,3,4,809,292,1980,7000,Loch Street Station,4100,2020-01-12,6014,-31.934491,115.787759,NEWMAN COLLEGE,1.256936,44.0,1678658.0,0.593461
11 Shannon Street,Floreat,2200000,3,2,2,809,266,2005,7300,Loch Street Station,4000,2020-01-10,6014,-31.93618,115.78378,NEWMAN COLLEGE,1.48314,44.0,1428023.0,0.593461
58 Oceanic Drive,Floreat,2200000,5,4,4,809,100,1956,7300,Loch Street Station,3800,2018-01-02,6014,-31.937999,115.783754,NEWMAN COLLEGE,1.681339,44.0,1515806.0,0.593461


In [95]:
# write csv to file
sales_df_r2.to_csv('Perth Housing Predictions and r2.csv', sep=',')

In [96]:
temp.to_csv('suburb_r2_scores.csv', sep=',')