In [1]:
# Import libraries, then read the CSV file and inspect it
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Load the CSV into the working DataFrame.
cal_df = pd.read_csv("all_perth_310121.csv")

# Plain-text dump of the summary statistics for the numeric columns.
summary_stats = cal_df.describe()
print(summary_stats)

# Cell result: number of missing entries in each column.
cal_df.isna().sum()

              PRICE      BEDROOMS     BATHROOMS        GARAGE      LAND_AREA  \
count  3.365600e+04  33656.000000  33656.000000  31178.000000   33656.000000   
mean   6.370720e+05      3.659110      1.823063      2.199917    2740.644016   
std    3.558256e+05      0.752038      0.587427      1.365225   16693.513215   
min    5.100000e+04      1.000000      1.000000      1.000000      61.000000   
25%    4.100000e+05      3.000000      1.000000      2.000000     503.000000   
50%    5.355000e+05      4.000000      2.000000      2.000000     682.000000   
75%    7.600000e+05      4.000000      2.000000      2.000000     838.000000   
max    2.440000e+06     10.000000     16.000000     99.000000  999999.000000   

         FLOOR_AREA    BUILD_YEAR      CBD_DIST  NEAREST_STN_DIST  \
count  33656.000000  30501.000000  33656.000000      33656.000000   
mean     183.501545   1989.706436  19777.374465       4523.371494   
std       72.102982     20.964330  11364.415413       4495.064024   
min

ADDRESS                 0
SUBURB                  0
PRICE                   0
BEDROOMS                0
BATHROOMS               0
GARAGE               2478
LAND_AREA               0
FLOOR_AREA              0
BUILD_YEAR           3155
CBD_DIST                0
NEAREST_STN             0
NEAREST_STN_DIST        0
DATE_SOLD               0
POSTCODE                0
LATITUDE                0
LONGITUDE               0
NEAREST_SCH             0
NEAREST_SCH_DIST        0
NEAREST_SCH_RANK    10952
dtype: int64

In [2]:
# Handling missing values
# Each of the three columns that reported nulls in the count above is filled
# with its own column mean.
# NOTE(review): the means are computed over every row of cal_df, i.e. before
# the train/test split performed further down, so held-out rows contribute to
# the fill values. Consider imputing after the split, using training rows only.
for col in ("GARAGE", "BUILD_YEAR", "NEAREST_SCH_RANK"):
    cal_df[col] = cal_df[col].fillna(cal_df[col].mean())

# Cell result: confirm that no column has missing entries left.
cal_df.isnull().sum()

ADDRESS             0
SUBURB              0
PRICE               0
BEDROOMS            0
BATHROOMS           0
GARAGE              0
LAND_AREA           0
FLOOR_AREA          0
BUILD_YEAR          0
CBD_DIST            0
NEAREST_STN         0
NEAREST_STN_DIST    0
DATE_SOLD           0
POSTCODE            0
LATITUDE            0
LONGITUDE           0
NEAREST_SCH         0
NEAREST_SCH_DIST    0
NEAREST_SCH_RANK    0
dtype: int64

In [3]:
# Dropping columns that are less likely to have predictive power
columns_to_drop = ['ADDRESS', 'SUBURB', 'DATE_SOLD', 'NEAREST_STN', 'NEAREST_SCH']
# errors="ignore" keeps this cell re-runnable: cal_df is rebound to the reduced
# frame, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
cal_df = cal_df.drop(columns=columns_to_drop, errors="ignore")

# After dropping, check the resulting DataFrame.
# info() writes its report itself and returns None, so it is not wrapped in
# print(), which would append a stray "None" line to the output.
cal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33656 entries, 0 to 33655
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PRICE             33656 non-null  int64  
 1   BEDROOMS          33656 non-null  int64  
 2   BATHROOMS         33656 non-null  int64  
 3   GARAGE            33656 non-null  float64
 4   LAND_AREA         33656 non-null  int64  
 5   FLOOR_AREA        33656 non-null  int64  
 6   BUILD_YEAR        33656 non-null  float64
 7   CBD_DIST          33656 non-null  int64  
 8   NEAREST_STN_DIST  33656 non-null  int64  
 9   POSTCODE          33656 non-null  int64  
 10  LATITUDE          33656 non-null  float64
 11  LONGITUDE         33656 non-null  float64
 12  NEAREST_SCH_DIST  33656 non-null  float64
 13  NEAREST_SCH_RANK  33656 non-null  float64
dtypes: float64(6), int64(8)
memory usage: 3.6 MB
None


In [4]:
# Show the first rows before scaling (plain-text print).
print(cal_df.head())

# Keep the column labels so they can be re-attached when the scaler output
# is wrapped in a DataFrame again.
names = cal_df.columns

# Standardise every column of cal_df, PRICE included.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so the test rows influence the scaling statistics. Consider
# fitting on the training rows only and transforming the test rows with it.
scaler = StandardScaler()
scaler.fit(cal_df)
scaled_df = pd.DataFrame(scaler.transform(cal_df), columns=names)

# Cell result: first rows of the scaled frame.
scaled_df.head()

    PRICE  BEDROOMS  BATHROOMS  GARAGE  LAND_AREA  FLOOR_AREA  BUILD_YEAR  \
0  565000         4          2     2.0        600         160      2003.0   
1  365000         3          2     2.0        351         139      2013.0   
2  287000         3          1     1.0        719          86      1979.0   
3  255000         2          1     2.0        651          59      1953.0   
4  325000         4          1     2.0        466         131      1998.0   

   CBD_DIST  NEAREST_STN_DIST  POSTCODE   LATITUDE   LONGITUDE  \
0     18300              1800      6164 -32.115900  115.842450   
1     26900              4900      6167 -32.193470  115.859554   
2     22600              1900      6111 -32.120578  115.993579   
3     17900              3600      6056 -31.900547  116.038009   
4     11200              2000      6054 -31.885790  115.947780   

   NEAREST_SCH_DIST  NEAREST_SCH_RANK  
0          0.828339         72.672569  
1          5.524324        129.000000  
2          1.649178 

Unnamed: 0,PRICE,BEDROOMS,BATHROOMS,GARAGE,LAND_AREA,FLOOR_AREA,BUILD_YEAR,CBD_DIST,NEAREST_STN_DIST,POSTCODE,LATITUDE,LONGITUDE,NEAREST_SCH_DIST,NEAREST_SCH_RANK
0,-0.202552,0.453295,0.301212,-0.152145,-0.128234,-0.325949,0.666104,-0.130002,-0.605867,1.199671,-0.873207,-0.311632,-0.56526,0.0
1,-0.764633,-0.876444,0.301212,-0.152145,-0.14315,-0.617203,1.167176,0.626757,0.083788,1.247928,-1.309542,-0.166853,2.124348,1.687556
2,-0.983845,-0.876444,-1.401153,-0.91319,-0.121105,-1.352274,-0.53647,0.248378,-0.58362,0.347128,-0.899522,0.96766,-0.095127,1.2082
3,-1.073778,-2.206184,-1.401153,-0.152145,-0.125179,-1.726744,-1.839258,-0.1652,-0.205422,-0.537585,0.338161,1.343754,-0.139674,0.0
4,-0.877049,0.453295,-1.401153,-0.152145,-0.136261,-0.728157,0.415567,-0.754768,-0.561373,-0.569757,0.421168,0.579976,-0.172022,0.0


In [5]:
# Independent variables: every scaled column except the PRICE target.
X = scaled_df.drop(columns='PRICE')

# info() prints its own report and returns None; wrapping it in print() would
# add a stray "None" line to the output.
X.info()
print(X.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33656 entries, 0 to 33655
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   BEDROOMS          33656 non-null  float64
 1   BATHROOMS         33656 non-null  float64
 2   GARAGE            33656 non-null  float64
 3   LAND_AREA         33656 non-null  float64
 4   FLOOR_AREA        33656 non-null  float64
 5   BUILD_YEAR        33656 non-null  float64
 6   CBD_DIST          33656 non-null  float64
 7   NEAREST_STN_DIST  33656 non-null  float64
 8   POSTCODE          33656 non-null  float64
 9   LATITUDE          33656 non-null  float64
 10  LONGITUDE         33656 non-null  float64
 11  NEAREST_SCH_DIST  33656 non-null  float64
 12  NEAREST_SCH_RANK  33656 non-null  float64
dtypes: float64(13)
memory usage: 3.3 MB
None
(33656, 13)


In [6]:
# Target variable: the PRICE column of the scaled frame, so it is expressed in
# standardised units rather than the values read from the CSV.
y = scaled_df["PRICE"]

# Print the distribution summary first, then the shape.
for report in (y.describe(), y.shape):
    print(report)

count    3.365600e+04
mean     1.646730e-17
std      1.000015e+00
min     -1.647101e+00
25%     -6.381648e-01
50%     -2.854587e-01
75%      3.454777e-01
max      5.066961e+00
Name: PRICE, dtype: float64
(33656,)


In [7]:
# Split features and target into train and test parts: 20% of the rows are
# held out, and random_state is pinned to 1.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

# Shapes of the four pieces, in the order X_train, Y_train, X_test, Y_test.
for part in (X_train, Y_train, X_test, Y_test):
    print(part.shape)

(26924, 13)
(26924,)
(6732, 13)
(6732,)


In [8]:
# Build a linear regression model and fit it on the training split.
regression = linear_model.LinearRegression()
regression.fit(X_train, Y_train)

# Compute the test-set metrics first, then report them.
# y comes from the scaled frame, so the errors are in standardised PRICE units.
test_r2 = regression.score(X_test, Y_test)
Y_pred = regression.predict(X_test)
test_mse = mean_squared_error(Y_test, Y_pred)
test_mae = mean_absolute_error(Y_test, Y_pred)

print("\n\nLet's evalaute model on our held-out test data\n")
print("R-Sqaured: %.3f" % test_r2)
print("Mean squared error: %.3f" % test_mse)
print("Mean absolute error: %.3f" % test_mae)

print("\n\n\nLet's see how parameters were learnt in training.\n")
print("The intercept / beta_0 is {}".format(regression.intercept_))
# One line per feature; numbering starts at 1 because beta_0 is the intercept.
for beta_idx, (col_name, coef) in enumerate(zip(X_train.columns, regression.coef_), start=1):
    print("The coefficient (beta_{}) for {} is {}".format(beta_idx, col_name, coef))



Let's evalaute model on our held-out test data

R-Sqaured: 0.618
Mean squared error: 0.393
Mean absolute error: 0.426



Let's see how parameters were learnt in training.

The intercept / beta_0 is -0.0013446489915461546
The coefficient (beta_1) for BEDROOMS is -0.00836329443126372
The coefficient (beta_2) for BATHROOMS is 0.15453639252168008
The coefficient (beta_3) for GARAGE is 0.04684130987709101
The coefficient (beta_4) for LAND_AREA is 0.07695065698364965
The coefficient (beta_5) for FLOOR_AREA is 0.43798658413468233
The coefficient (beta_6) for BUILD_YEAR is -0.19484704640730555
The coefficient (beta_7) for CBD_DIST is -0.4052183200749838
The coefficient (beta_8) for NEAREST_STN_DIST is 0.16919095850567434
The coefficient (beta_9) for POSTCODE is -0.08871080676044786
The coefficient (beta_10) for LATITUDE is -0.14275479145858389
The coefficient (beta_11) for LONGITUDE is -0.26919590282720396
The coefficient (beta_12) for NEAREST_SCH_DIST is 0.0758737984359762
The coefficient (