In [1]:
import pandas as pd 
import numpy as np
# Read the Perth sales CSV and key every row by its ADDRESS column.
house_sales = "Resources/Perth_Housing_Prices.csv"
sales_df = pd.read_csv(house_sales).set_index('ADDRESS')
sales_df.head()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Replace missing GARAGE values with 0 so the column contains no NaNs.
sales_df['GARAGE'] = sales_df['GARAGE'].fillna(0)

In [3]:
# Count how many sales rows each suburb has.
sales_df['SUBURB'].value_counts()

Bertram            231
Iluka              212
Bennett Springs    211
Mindarie           209
Carramar           208
                  ... 
Northbridge         11
Keysbrook            9
O'Connor             8
Neerabup             8
Hope Valley          6
Name: SUBURB, Length: 314, dtype: int64

In [4]:
# Show the column labels of the loaded sales data.
sales_df.columns

Index(['SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA',
       'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK'],
      dtype='object')

In [5]:
# Column layout of the recombined frame: the source columns in their
# original order, followed by the model output column.
columns = (
    ['SUBURB', 'PRICE']
    + ['BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR']
    + ['CBD_DIST', 'NEAREST_STN', 'NEAREST_STN_DIST']
    + ['DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE']
    + ['NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK']
    + ['Prediction']
)

In [6]:
# Create an empty DataFrame that has the `columns` layout but no rows;
# the per-suburb slices are concatenated onto it later.
combined_df = pd.DataFrame(columns=columns)
# Display it to confirm the (empty) column layout.
combined_df

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction


In [7]:
# Fit one linear-regression price model per suburb, attach its predictions to
# that suburb's rows, and record each model's R^2 on a held-out test split.
#
# Produces:
#   pred_house_sell - dict: suburb -> predictions for that suburb's test rows
#   combined_df     - every suburb slice stacked, with a 'Prediction' column
#   temp            - DataFrame with one row per suburb and its R^2 score
#
# The cell is self-contained: it rebuilds all three outputs from `sales_df`,
# so re-running it never appends duplicate rows to an earlier result.

# Fixed seed so the train/test split (and therefore every score) is reproducible.
RANDOM_STATE = 42

# Columns removed before fitting: the target plus the fields not used as features.
NON_FEATURE_COLUMNS = ['SUBURB', 'PRICE', 'NEAREST_STN', 'DATE_SOLD', 'POSTCODE',
                       'LATITUDE', 'LONGITUDE', 'NEAREST_SCH', 'NEAREST_SCH_RANK']

pred_house_sell = {}
temp = {}
suburb_frames = []

# groupby(sort=False) yields suburbs in order of first appearance (the same
# order as .unique()) without re-masking the whole frame for every suburb.
for suburb, sales_slice in sales_df.groupby('SUBURB', sort=False):
    df = sales_slice.copy()

    # set X and y
    y = sales_slice['PRICE']
    X = sales_slice.drop(NON_FEATURE_COLUMNS, axis=1)

    # split first, then fit the scaler on the training rows only, so no
    # statistics from the test rows leak into the preprocessing
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
    scaler = StandardScaler().fit(X_train)

    # build the linear regression model on the training data
    model1 = LinearRegression().fit(scaler.transform(X_train), y_train)
    pred_house_sell[suburb] = model1.predict(scaler.transform(X_test))

    # predict for every row of the suburb and keep it next to the actual price
    df['Prediction'] = model1.predict(scaler.transform(X))
    suburb_frames.append(df)

    # R^2 on the held-out rows; computed once, then printed and stored
    score = r2_score(y_test, pred_house_sell[suburb])
    print('Accuracy Score:', score)
    temp[suburb] = score

# Rebuild the full dataframe with a single concat instead of growing it inside
# the loop. The index name is cleared because later cells call reset_index()
# and expect the resulting column to be called 'index'.
combined_df = pd.concat(suburb_frames).rename_axis(None)

# convert the score dictionary to a two-column dataframe (suburb, score)
temp = pd.DataFrame.from_dict(temp, orient='index').reset_index()

Accuracy Score: 0.5672405291913516
Accuracy Score: 0.3592606303214878
Accuracy Score: 0.47459853205923086
Accuracy Score: 0.5290514071087253
Accuracy Score: 0.556061589083837
Accuracy Score: 0.10667587595173067
Accuracy Score: -0.04862023772396684
Accuracy Score: 0.1322117276564223
Accuracy Score: 0.6991573696000191
Accuracy Score: 0.38957857475119106
Accuracy Score: 0.2876067951621011
Accuracy Score: 0.6408330531434461
Accuracy Score: 0.6075093962921381
Accuracy Score: 0.1420032326365941
Accuracy Score: 0.3331549522509941
Accuracy Score: -0.4310825264255309
Accuracy Score: -0.21476804914980807
Accuracy Score: 0.04707304994395545
Accuracy Score: 0.40706608143949397
Accuracy Score: 0.6324681356897932
Accuracy Score: 0.6320158852751281
Accuracy Score: 0.5089670298231548
Accuracy Score: 0.5217195050899655
Accuracy Score: 0.7621752570101984
Accuracy Score: 0.6445096230173466
Accuracy Score: 0.37527093779506293
Accuracy Score: 0.22256641260547094
Accuracy Score: -1.5928570833227247
Accuracy

In [8]:
# Give the two columns of `temp` readable names; 'SUBURB' is the key used
# when the scores are merged onto the recombined dataframe.
temp.columns = ['SUBURB', 'r2 Score']
# Preview the first few suburb/score pairs.
temp.head()

Unnamed: 0,SUBURB,r2 Score
0,Floreat,0.567241
1,City Beach,0.359261
2,Dalkeith,0.474599
3,Mosman Park,0.529051
4,Watermans Bay,0.556062


In [9]:
# Peek at the first two recombined rows, which now carry the 'Prediction' column.
combined_df.head(2)

Unnamed: 0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction
35 Oakdale Street,Floreat,2440000,6,4,2,809,442,2009,6800,Loch Street Station,3500,2018-01-08,6014,-31.9406,115.78772,NEWMAN COLLEGE,1.936173,44.0,2236837.0
127 Grovedale Road,Floreat,2325000,4,2,2,709,382,2013,6100,Karrakatta Station,3000,2020-01-02,6014,-31.94122,115.79516,NEWMAN COLLEGE,2.135921,44.0,1688877.0


In [10]:
# Attach each suburb's r2 Score to every one of its rows (left join on SUBURB).
# merge() does not keep the left frame's index, so the row labels travel
# through the merge as a column named 'index' and are restored afterwards.
sales_df_r2 = (
    combined_df
    .rename_axis('index')  # name the index explicitly so reset_index() always yields 'index'
    .reset_index()
    # validate: each suburb may appear only once in `temp`, otherwise sales
    # rows would be silently duplicated by the join
    .merge(temp, how='left', on='SUBURB', validate='many_to_one')
    .set_index('index')
)
sales_df_r2.head(2)

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
35 Oakdale Street,Floreat,2440000,6,4,2,809,442,2009,6800,Loch Street Station,3500,2018-01-08,6014,-31.9406,115.78772,NEWMAN COLLEGE,1.936173,44.0,2236837.0,0.567241
127 Grovedale Road,Floreat,2325000,4,2,2,709,382,2013,6100,Karrakatta Station,3000,2020-01-02,6014,-31.94122,115.79516,NEWMAN COLLEGE,2.135921,44.0,1688877.0,0.567241


In [11]:
# Replace NaN r2 scores with 0 (NaNs come from suburbs that did not have
# enough data for the score to be calculated).
# NOTE(review): 0 then reads as "explains nothing" rather than "undefined" — confirm this is intended.
sales_df_r2['r2 Score'] = sales_df_r2['r2 Score'].fillna(0)

In [12]:
# Preview the final table after the NaN r2 scores were replaced.
sales_df_r2.head()

Unnamed: 0_level_0,SUBURB,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN,NEAREST_STN_DIST,DATE_SOLD,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH,NEAREST_SCH_DIST,NEAREST_SCH_RANK,Prediction,r2 Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
35 Oakdale Street,Floreat,2440000,6,4,2,809,442,2009,6800,Loch Street Station,3500,2018-01-08,6014,-31.9406,115.78772,NEWMAN COLLEGE,1.936173,44.0,2236837.0,0.567241
127 Grovedale Road,Floreat,2325000,4,2,2,709,382,2013,6100,Karrakatta Station,3000,2020-01-02,6014,-31.94122,115.79516,NEWMAN COLLEGE,2.135921,44.0,1688877.0,0.567241
11 Kilkenny Road,Floreat,2300000,4,3,4,809,292,1980,7000,Loch Street Station,4100,2020-01-12,6014,-31.934491,115.787759,NEWMAN COLLEGE,1.256936,44.0,1707688.0,0.567241
11 Shannon Street,Floreat,2200000,3,2,2,809,266,2005,7300,Loch Street Station,4000,2020-01-10,6014,-31.93618,115.78378,NEWMAN COLLEGE,1.48314,44.0,1584590.0,0.567241
58 Oceanic Drive,Floreat,2200000,5,4,4,809,100,1956,7300,Loch Street Station,3800,2018-01-02,6014,-31.937999,115.783754,NEWMAN COLLEGE,1.681339,44.0,1635198.0,0.567241


In [13]:
# Persist the per-sale predictions and suburb r2 scores (comma-separated, the to_csv default).
sales_df_r2.to_csv('Perth Housing Predictions and r2.csv')

In [15]:
# Persist the per-suburb r2 scores (comma-separated, the to_csv default).
temp.to_csv('suburb_r2_scores.csv')