In [1]:
from datetime import datetime, date
import warnings
import numpy as np
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')


## Combine and clean data files from the NSW government

The following code joins all of the CSV files from NSW government into a single CSV.

In [2]:
# All data files from the NSW government can be found at http://maps.six.nsw.gov.au/csv/current/suburb/
# All of the 4325 files are quite large and should be updated, so they are not included in the github repo.

# If a new gov dataset is needed, just re-download all the csv files from the website above and run this cell.
# I used chrono download manager's sniffer feature to download them all.

# I know the analysis is based on Sydney house prices but these government data files are for the whole state of NSW Australia.
# I'm sure there's a smart way to get rid of some of these files to reduce the data size, and improve process time,
# but for now I'm just too lazy to do that.

# uncomment the following code if you want to re-combine the csv files.

# extension = 'csv'
# all_filenames = [i for i in glob.glob('./data/gov/*.{}'.format(extension))]

# #combine all files in the list
# combined = pd.concat([pd.read_csv(f) for f in all_filenames ])

# # removing less useful columns
# combined = combined.drop(['MULTI-PROPERTY SALE (Y/N)', 'STRATA/NON STRATA', 'PROPERTY NUMBER',
#                'DEALING NUMBER', 'EXTRACTION DATE'], axis=1)

# # removing duplicates

# combined = combined.drop_duplicates(keep='last')

# # do the hard work now so don't have to do this again each time
# combined['SALE DATE']=pd.to_datetime(combined['SALE DATE'])

# combined.shape

# #export to csv
# combined.to_csv( "./data/gov_combined.csv", index=False, encoding='utf-8-sig')


In [3]:
# check the shape and columns of the data frame
gov1 = pd.read_csv('./data/gov_combined.csv') # if you don't want to run the prvious cell again every time, just comment it out
print(gov1.shape)
gov1.head()


(908210, 4)


Unnamed: 0,ADDRESS,SALE PRICE,SALE DATE,AREA
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,2019-08-30,400100.0
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,2017-08-22,428500.0
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,2019-12-05,398500.0
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,2021-01-20,470800.0
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,2018-10-19,402000.0


In [4]:
# Split street address and suburb

addr = gov1.ADDRESS.str.split(
    ',', n=1, expand=True).apply(lambda x: x.str.strip())
gov2 = gov1.copy()
for i in addr.columns:
    gov2[i] = addr[i]


In [5]:
gov2.head()

Unnamed: 0,ADDRESS,SALE PRICE,SALE DATE,AREA,0,1
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,2019-08-30,400100.0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,2017-08-22,428500.0,2162 AARONS PASS ROAD,AARONS PASS NSW 2850
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,2019-12-05,398500.0,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,2021-01-20,470800.0,159 SUTTERS LANE,AARONS PASS NSW 2850
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,2018-10-19,402000.0,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850


In [6]:
gov2.rename(columns={0:'ADDR', 1:'SUBURB', 'SALE DATE':'DATE','SALE PRICE':'PRICE'},inplace=True)
gov2.head()


Unnamed: 0,ADDRESS,PRICE,DATE,AREA,ADDR,SUBURB
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,2019-08-30,400100.0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,2017-08-22,428500.0,2162 AARONS PASS ROAD,AARONS PASS NSW 2850
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,2019-12-05,398500.0,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,2021-01-20,470800.0,159 SUTTERS LANE,AARONS PASS NSW 2850
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,2018-10-19,402000.0,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850


In [7]:
# Leaving only the useful columns

cols = ['ADDR','SUBURB','PRICE','AREA','DATE']
gov3 = gov2[cols]
gov3.columns = ['Address','Suburb','Price','Size','Date']
gov3.head()

Unnamed: 0,Address,Suburb,Price,Size,Date
0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,435000,400100.0,2019-08-30
1,2162 AARONS PASS ROAD,AARONS PASS NSW 2850,315000,428500.0,2017-08-22
2,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,340000,398500.0,2019-12-05
3,159 SUTTERS LANE,AARONS PASS NSW 2850,365000,470800.0,2021-01-20
4,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,700000,402000.0,2018-10-19


In [8]:
sales_gov = gov3.copy()

## Combine and clean data scraped from RateMyAgent.com.au

In [9]:
# All data files from the RateMyAgent website are scraped with some tool I shall not name.
# The data should be updated once every month.
# uncomment the following code if you want to re-combine the csv files.

# extension = 'csv'
# all_filenames = [i for i in glob.glob('./data/rma/*.{}'.format(extension))]

# #combine all files in the list
# combined = pd.concat([pd.read_csv(f) for f in all_filenames ])
# #export to csv
# combined.to_csv( "./data/rma_combined.csv", index=False, encoding='utf-8-sig')


In [10]:
rma = pd.read_csv('./data/rma_combined.csv')
rma.shape

(28405, 9)

In [11]:
rma.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,Sold by Auction on 02 Nov 2021,"$1,950,000"
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,Sold by Private Sale on 01 Nov 2021,"$960,000"
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,Sold by Private Sale on 01 Nov 2021,"$740,000"


In [12]:
# remove duplicates

rma1 = rma.drop_duplicates(keep='last')
rma1.shape

(28353, 9)

In [13]:
rma1.isnull().sum()

Link           74
Address         1
Suburb          1
Beds           38
Baths          71
Carpark      2159
Type          220
Sale Type       9
Price           1
dtype: int64

In [14]:
# Dropping rows that are missing important values

rma1.dropna(subset=['Price','Beds'],inplace=True)
rma1.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,Sold by Auction on 02 Nov 2021,"$1,950,000"
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,Sold by Private Sale on 01 Nov 2021,"$960,000"
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,Sold by Private Sale on 01 Nov 2021,"$740,000"


In [15]:
# Fill null values in column: "Type". If there's a slash(/) in the address, it's an apartment, else it's a house.

nulls = rma1[rma1['Type'].isnull()]
apartments = nulls[nulls.Address.str.contains('/')]
houses = nulls[~nulls.Address.str.contains('/')]
for i in houses.index:
    rma1.at[i,'Type'] = 'House'
for i in apartments.index:
    rma1.at[i,'Type'] = 'Apartment'


In [16]:
# Fill null values in column: "Carpark". If it's an apartment, there's no parking. If it's a house, there's 2 parking spaces.

nulls = rma1[rma1['Carpark'].isnull()]
apartments = nulls[nulls.Address.str.contains('/')]
houses = nulls[~nulls.Address.str.contains('/')]
for i in houses.index:
    rma1.at[i,'Carpark'] = 1
for i in apartments.index:
    rma1.at[i,'Carpark'] = 0

In [17]:
rma1.isnull().sum()

Link         73
Address       0
Suburb        0
Beds          0
Baths        33
Carpark       0
Type          0
Sale Type     8
Price         0
dtype: int64

In [18]:
rma1.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,Sold by Auction on 02 Nov 2021,"$1,950,000"
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,Sold by Private Sale on 01 Nov 2021,"$960,000"
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,Sold by Private Sale on 01 Nov 2021,"$740,000"


In [19]:
# Split sale type and date

rma1['Sale Type'] = rma['Sale Type'].str.replace('Sold by ','' )

In [20]:
saletype = rma1['Sale Type'].str.split(' on ',expand=True)
for i in saletype.columns:
    rma1[i] = saletype[i]

In [21]:
rma2 = rma1.drop('Sale Type',axis=1)
rma2.rename({0:'By',1:'Date'},axis=1,inplace=True)

# trim price
rma2['Price'] = rma2['Price'].str.replace('$','').str.replace(',','')

# to datetime just in case
rma2['Date']=pd.to_datetime(rma2['Date'])



In [22]:
sales_rma = rma2.copy()
sales_rma

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price,By,Date
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,1950000,Auction,2021-11-02
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,960000,Private Sale,2021-11-01
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Price unavailable,Private Sale,2021-11-01
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Price unavailable,Private Sale,2021-11-01
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,740000,Private Sale,2021-11-01
...,...,...,...,...,...,...,...,...,...,...
28400,https://www.ratemyagent.com.au/real-estate-age...,301/7 Sevier Avenue,Rhodes NSW 2138,2.0,2.0,1.0,Apartment,Price unavailable,Auction,2021-05-29
28401,https://www.ratemyagent.com.au/real-estate-age...,36/132 Killeaton St,St Ives NSW 2075,2.0,2.0,1.0,Apartment,935000,Auction,2021-05-29
28402,https://www.ratemyagent.com.au/real-estate-age...,12/19 Selwyn St,Wollstonecraft NSW 2065,2.0,1.0,1.0,Apartment,Price unavailable,Auction,2021-05-29
28403,https://www.ratemyagent.com.au/real-estate-age...,5/62 Middle Head Road,Mosman NSW 2088,2.0,1.0,1.0,Apartment,Price unavailable,Auction,2021-05-29


## Prepare datasets before joining

Unify Street Names (Street -> St, etc.)

In [23]:
sales_gov.Address = sales_gov.Address.str.title().str.strip()
sales_gov.Suburb = sales_gov.Suburb.str.title().str.strip()
sales_rma.Address = sales_rma.Address.str.title().str.strip()
sales_rma.Suburb = sales_rma.Suburb.str.title().str.strip()


In [24]:
road_names = sales_rma.Address.str.split(' ', expand = True)


In [25]:
rd_names = road_names[2].unique()

filtered = []
for n in rd_names:
    if n == None:
        pass
    elif len(n)<3:
            filtered.append(n)

filtered


['St',
 'Rd',
 'Pl',
 'Cl',
 'Ln',
 'Dr',
 'Sp',
 'La',
 '16',
 'Ct',
 'N',
 'On',
 '&',
 'De',
 'Av',
 'Of',
 'Vw']

In [26]:
road_names[road_names[2]=='Cl']



Unnamed: 0,0,1,2,3,4,5,6,7,8
237,6,Goodlet,Cl,,,,,,
599,2,William,Cl,,,,,,
1308,7,Roberts,Cl,,,,,,
1508,12/3,Packard,Cl,,,,,,
2317,39/3,Ramu,Cl,,,,,,
...,...,...,...,...,...,...,...,...,...
23765,37/2,Belair,Cl,,,,,,
25893,11/6,Fairway,Cl,,,,,,
26523,23/5,Belair,Cl,,,,,,
27429,124/2,Dolphin,Cl,,,,,,


In [27]:
rma2.loc[237]

Link       https://www.ratemyagent.com.au/real-estate-age...
Address                                         6 Goodlet Cl
Suburb                              Lane Cove North NSW 2066
Beds                                                     3.0
Baths                                                    2.0
Carpark                                                  2.0
Type                                               Townhouse
Price                                      Price unavailable
By                                             Private Sale 
Date                                     2021-10-21 00:00:00
Name: 237, dtype: object

In [28]:
abbr = [
    [' Street',' St'],
    [' Road',' Rd'],
    [' Avenue',' Ave'],
    [' Place',' Pl'],
    [' Close',' Cl'],
    [' Lane',' Ln'],
    [' Drive',' Dr'],
    [' Highway',' Hwy'],
    [' Parade',' Pde'],
    [' Square',' Sq'],
    [' Court',' Ct'],
    [' Glade',' Gld'],

    ['Parkway','Pkwy'],
    ['Boulevard','Blvd'],
    ['Circuit','Cct'],
    
    [' st',' St'],
    [' st.',' St'],
    [' ST',' St'],
    [' RD',' Rd'],
    [' ave', 'Ave'],
    [' AVE', 'Ave'],
    [' road',' Rd'],
    [' ROAD',' Rd'],
    ['Crt','Ct'],
    ['Pde.','Pde'],

    [' Crescent',' Cres'],
    [' Glen',' Gln'],
    [' Plaza', 'Plz'],
    [' View', ' Vw'],

    ['Mallard La', 'Mallard Ln']
]

In [29]:
# replace all street address names with abbreviation
for row in abbr:
    sales_gov.Address = sales_gov.Address.str.replace(row[0],row[1])
    sales_rma.Address = sales_rma.Address.str.replace(row[0],row[1])

In [30]:
sales_gov.sample(5)

Unnamed: 0,Address,Suburb,Price,Size,Date
870411,21/46 Cobbett St,Wetherill Park Nsw 2164,680000,0.0,2018-06-19
887162,8 Ibis Pde,Woodberry Nsw 2322,270000,461.6,2020-03-06
592437,13 Nunkeri Pl,North Nowra Nsw 2541,471000,719.4,2018-10-16
813223,15/60 Manning St,Tuncurry Nsw 2428,214500,0.0,2020-03-06
93500,71 Mill Hill Rd,Bondi Junction Nsw 2022,1800000,196.0,2020-07-23


Unifying Suburb Names:

In [31]:
rma_suburbs = pd.DataFrame(sales_rma.Suburb.unique())
rma_suburbs.columns = ['Name']


In [32]:
gov_suburbs = pd.DataFrame(sales_gov.Suburb.unique())
gov_suburbs.columns = ['Name']

In [33]:
rma_suburbs.Name.isin(gov_suburbs.Name).astype(int).unique()

array([1])

Seems every suburb is included in the NSW dataset, which means there's no error in the Naming

## Join Two Datasets

In [34]:
sales = pd.merge(sales_rma,sales_gov, on=['Address','Suburb'])
sales.shape

(21403, 13)

In [35]:
sales

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price_x,By,Date_x,Price_y,Size,Date_y
0,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington Nsw 2115,3.0,2.0,1.0,Townhouse,960000,Private Sale,2021-11-01,960000,0.0,2021-09-18
1,https://www.ratemyagent.com.au/real-estate-age...,13/28A Henry St,Ashfield Nsw 2131,2.0,1.0,1.0,Unit,650000,Auction,2021-11-01,520000,0.0,2019-11-02
2,https://www.ratemyagent.com.au/real-estate-age...,4/29 Norfolk St,Blacktown Nsw 2148,2.0,2.0,1.0,Townhouse,670000,Private Sale,2021-11-01,630000,0.0,2018-06-15
3,https://www.ratemyagent.com.au/real-estate-age...,4/5 Thurston St,Penrith Nsw 2750,2.0,1.0,1.0,Unit,Price unavailable,Private Sale,2021-11-01,376000,0.0,2017-07-15
4,https://www.ratemyagent.com.au/real-estate-age...,2/43 Mackenzie St,Strathfield Nsw 2135,3.0,2.0,2.0,Townhouse,1677000,Auction,2021-11-01,1475000,0.0,2019-05-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21398,https://www.ratemyagent.com.au/real-estate-age...,36/132 Killeaton St,St Ives Nsw 2075,2.0,2.0,1.0,Apartment,935000,Auction,2021-05-29,935000,0.0,2021-05-29
21399,https://www.ratemyagent.com.au/real-estate-age...,12/19 Selwyn St,Wollstonecraft Nsw 2065,2.0,1.0,1.0,Apartment,Price unavailable,Auction,2021-05-29,1500000,0.0,2021-05-29
21400,https://www.ratemyagent.com.au/real-estate-age...,12/19 Selwyn St,Wollstonecraft Nsw 2065,2.0,1.0,1.0,Apartment,Price unavailable,Auction,2021-05-29,1180000,0.0,2017-10-14
21401,https://www.ratemyagent.com.au/real-estate-age...,5/62 Middle Head Rd,Mosman Nsw 2088,2.0,1.0,1.0,Apartment,Price unavailable,Auction,2021-05-29,1130000,0.0,2021-05-29


In [36]:
# sort by time
sales1=sales.copy()
sales1['Date_y'] = pd.to_datetime(sales['Date_y'])
sales1['Date_x'] = pd.to_datetime(sales['Date_x'])

sales1.sort_values(by='Date_y', ascending=False , inplace=True)
# sales1.set_index(keys='Date_y', drop=True, inplace=True)

In [37]:
sales1.sample(10)

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price_x,By,Date_x,Price_y,Size,Date_y
8197,https://www.ratemyagent.com.au/real-estate-age...,160 Coward St,Mascot Nsw 2020,3.0,1.0,3.0,House,2095000,Auction,2021-10-02,2095000,550.1,2021-10-01
19879,https://www.ratemyagent.com.au/real-estate-age...,807/10 Avon Rd,Pymble Nsw 2073,2.0,1.0,1.0,Apartment,Price unavailable,Private Sale,2021-06-25,825000,0.0,2021-06-25
2134,https://www.ratemyagent.com.au/real-estate-age...,3/66 Buller St,North Parramatta Nsw 2151,3.0,2.0,2.0,Townhouse,840000,Private Sale,2021-07-03,770000,0.0,2020-01-15
17741,https://www.ratemyagent.com.au/real-estate-age...,22/40 Park Ave,Waitara Nsw 2077,1.0,1.0,1.0,Apartment,520000,Private Sale,2021-08-05,520000,0.0,2021-07-26
19855,https://www.ratemyagent.com.au/real-estate-age...,310/21 Grosvenor St,Neutral Bay Nsw 2089,1.0,1.0,0.0,Apartment,Price unavailable,Private Sale,2021-06-25,750000,0.0,2021-06-25
16630,https://www.ratemyagent.com.au/real-estate-age...,4/30 Wheeler Pde,Dee Why Nsw 2099,2.0,2.0,2.0,Apartment,1515000,Private Sale,2021-08-27,1515000,0.0,2021-08-26
16188,https://www.ratemyagent.com.au/real-estate-age...,5C/8 Hampden St,Paddington Nsw 2021,1.0,1.0,1.0,Apartment,Price unavailable,Auction,2021-09-04,761000,0.0,2021-09-04
20007,https://www.ratemyagent.com.au/real-estate-age...,3/4 Rothschild Ave,Rosebery Nsw 2018,1.0,1.0,1.0,Apartment,Price unavailable,Private Sale,2021-06-23,832000,0.0,2021-06-23
18041,https://www.ratemyagent.com.au/real-estate-age...,4/29A Innes Rd,Greenwich Nsw 2065,2.0,1.0,1.0,Apartment,825000,Private Sale,2021-07-28,825000,0.0,2021-07-28
18345,https://www.ratemyagent.com.au/real-estate-age...,119/149 Pyrmont St,Pyrmont Nsw 2009,2.0,2.0,1.0,Apartment,1022000,Private Sale,2021-07-23,1022000,0.0,2021-07-22


In [38]:
cols = ['Address','Suburb','Beds','Baths','Carpark','Type','Size','By','Price_x','Price_y', 'Date_x','Date_y']
sales2 = sales1[cols]

In [39]:
sales2.rename({'Price_x':'Price_rma','Price_y':'Price_gov','Date_x':'Date_rma','Date_y':'Date_gov'},axis=1,inplace=True)
sales2.shape

(21403, 12)

In [40]:
sales2 = sales2.drop_duplicates(subset=['Address','Suburb','Price_gov'],keep='last')

In [41]:
sales2.head(10)

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
9423,5 Ryan St,Lilyfield Nsw 2040,5.0,3.0,1.0,House,219.7,Auction,2830000,2830000,2021-09-18,2021-11-17
11131,38 Nightingale Sq,Glossodia Nsw 2756,3.0,1.0,2.0,House,980.1,Private Sale,880000,880000,2021-08-31,2021-11-17
7566,20 Gatley Ct,Wattle Grove Nsw 2173,4.0,2.0,1.0,House,483.0,Private Sale,1130000,1130000,2021-10-14,2021-11-16
15210,2/17 Isabel Ave,Vaucluse Nsw 2030,3.0,2.0,2.0,Apartment,0.0,Auction,Price unavailable,3462000,2021-09-23,2021-11-10
9036,20 Prince St,Canley Heights Nsw 2166,3.0,1.0,1.0,House,442.6,Private Sale,850000,850000,2021-09-22,2021-11-08
8468,81 Coogee Bay Rd,Randwick Nsw 2031,3.0,2.0,1.0,House,297.2,Auction,Price unavailable,3080000,2021-09-29,2021-11-05
13966,913/187 Kent St,Millers Point Nsw 2000,1.0,1.0,0.0,Apartment,0.0,Private Sale,1100000,1100000,2021-11-01,2021-11-01
13085,629 Polding St,Bossley Park Nsw 2176,3.0,1.0,1.0,House,564.6,Auction,940000,940000,2021-08-10,2021-10-28
310,24/75 Broome St,Maroubra Nsw 2035,1.0,1.0,1.0,Unit,0.0,Private Sale,720000,720000,2021-10-07,2021-10-27
7092,7A South Creek Rd,Dee Why Nsw 2099,3.0,1.0,2.0,House,323.7,Private Sale,Price unavailable,1890000,2021-10-26,2021-10-26


## Showing the differences between the two datasets

In [42]:
sales_rma[(sales_rma['Suburb']=='West Ryde Nsw 2114')].sort_values('Date',ascending=False)[0:10].drop(['Link','Type','By','Beds','Baths','Carpark'],axis=1)

Unnamed: 0,Address,Suburb,Price,Date
3,7/12 Union St,West Ryde Nsw 2114,Price unavailable,2021-11-01
8672,23 Falconer St,West Ryde Nsw 2114,2580000,2021-10-30
9425,23 Reserve St,West Ryde Nsw 2114,2461000,2021-10-23
19217,7/15 Riverview St,West Ryde Nsw 2114,475000,2021-10-20
19588,17/20 Herbert St,West Ryde Nsw 2114,Price unavailable,2021-10-14
10770,54 Hermitage Rd,West Ryde Nsw 2114,Price unavailable,2021-10-13
10741,14 Moss St,West Ryde Nsw 2114,2270000,2021-10-13
19810,67/57 West Pde,West Ryde Nsw 2114,482000,2021-10-12
10935,15 Falconer St,West Ryde Nsw 2114,2880000,2021-10-12
689,2B Hermoyne St,West Ryde Nsw 2114,1560000,2021-10-06


In [43]:
sales_gov[(sales_gov['Suburb']=='West Ryde Nsw 2114')].sort_values('Date',ascending=False)[0:10].drop(['Size'],axis=1)

Unnamed: 0,Address,Suburb,Price,Date
865010,7/15 Riverview St,West Ryde Nsw 2114,475000,2021-10-08
865108,6/4 Union St,West Ryde Nsw 2114,580000,2021-10-05
864860,7/1A Macpherson St,West Ryde Nsw 2114,1400000,2021-10-02
865289,32/22 Herbert St,West Ryde Nsw 2114,760000,2021-10-01
864503,4/12 Adelaide St,West Ryde Nsw 2114,12200,2021-09-24
865058,4/84 Station St,West Ryde Nsw 2114,655000,2021-09-22
865203,67/61 West Pde,West Ryde Nsw 2114,482000,2021-09-21
864837,12 Lambert St,West Ryde Nsw 2114,1840000,2021-09-18
864685,9/2 Mulvihill St,West Ryde Nsw 2114,285000,2021-09-16
865204,30/61 West Pde,West Ryde Nsw 2114,660000,2021-09-15


In [44]:
sales2[sales2['Suburb']=='West Ryde Nsw 2114'].sort_values('Date_rma',ascending=False)[0:5]

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
7134,23 Reserve St,West Ryde Nsw 2114,5.0,4.0,2.0,House,809.4,Auction,2461000,2060000,2021-10-23,2017-12-09
14159,7/15 Riverview St,West Ryde Nsw 2114,1.0,1.0,1.0,Apartment,0.0,Private Sale,475000,475000,2021-10-20,2021-10-08
386,4/84 Station St,West Ryde Nsw 2114,2.0,1.0,1.0,Unit,0.0,Private Sale,655000,655000,2021-10-01,2021-09-22
15035,30/61 West Pde,West Ryde Nsw 2114,2.0,1.0,1.0,Apartment,0.0,Private Sale,660000,660000,2021-09-27,2021-09-15
9381,12 Lambert St,West Ryde Nsw 2114,3.0,1.0,2.0,House,742.6,Auction,1840000,1840000,2021-09-18,2021-09-18


1. RMA dataset is newer, Gov dataset ends 24 Oct, 2021.
2. Although RMA's records are newer, sometimes the date stamp comes later than the Gov records.
3. RMA dataset contains some of the sales that's not been submitted to the government. E.g., some of the sales from Sept 2021 are still not in the Gov dataset.
4. RMA dataset is not complete, some of the sales that's in the Gov dataset is not in the RMA dataset. Those properties were probably sold privately or were sold on other platforms.


In [45]:
# This cell exists solely because someone may want to remove the flipped houses (sold again within a few months),
# uncomment the next line if you want one property to have only one sale record, which has the most recent sale record.

# sales3 = sales2.drop_duplicates(subset=['Address','Suburb'],keep='first') 
sales3 = sales2.copy()
sales3.shape

(20937, 12)

## Fill "Price unavailable" from government dataset

The rma data set contains the latest sale prices, but some the prices are listed as "unavailable", either because they are sold else where or because the price was never disclosed by the seller or buyer.

Lucky for us, these transactions have to be disclosed to the government within a few months of selling. So, we can get the missing prices from the government dataset.

** "Last Known Price" Disclaimer

For most of the time, the government dataset lags behind the real time sales, because people need time to settle the deal, sometimes by several months. This is why we can eventually get all of the price data from the government. But for the recent months, we will have to make do with the data from RMA.

One problem caused by filling the price from the government dataset is: if a property was sold again within a few months, the price we fill from the government will be the last known prices, but not the most recent ones.

E.g., 3 month ago, a house was sold by private sale for 1.0 million. Since it's "private", rma didn't get the price (sometimes they do although this time they didn't), so the price of this sale was marked as "unavailable". Then the transaction went through in a few days, and the government got the settled price: "1.0 million" and recorded it. Fast forward to today, the house was sold again for 1.5 million, RMA still didn't get the price(maybe another private sale), so this time it was still marked as "price unavailable". If at this stage we get the data from both rma and government, and fill the price recorded by the government of 1.0 million to this sale, then apparently it's wrong.

In short, this means the data can be misleading, and the house price may not be the actual price, but the "last known price". Fortunately if we update the government dataset constantly in the following months, we will eventually get the accurate prices for recent sales, but of course, the data won't be as fresh by then.

In [46]:
print('There are',sales3[sales3.Price_rma == 'Price unavailable'].shape[0],'records without price.')
sales3[sales3.Price_rma == 'Price unavailable'][0:10]

There are 6330 records without price.


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
15210,2/17 Isabel Ave,Vaucluse Nsw 2030,3.0,2.0,2.0,Apartment,0.0,Auction,Price unavailable,3462000,2021-09-23,2021-11-10
8468,81 Coogee Bay Rd,Randwick Nsw 2031,3.0,2.0,1.0,House,297.2,Auction,Price unavailable,3080000,2021-09-29,2021-11-05
7092,7A South Creek Rd,Dee Why Nsw 2099,3.0,1.0,2.0,House,323.7,Private Sale,Price unavailable,1890000,2021-10-26,2021-10-26
7133,97 Hargrave St,Paddington Nsw 2021,3.0,1.0,1.0,House,94.84,Auction,Price unavailable,2755000,2021-10-23,2021-10-23
7148,6 Coolaroo Rd,Lane Cove North Nsw 2066,3.0,1.0,3.0,House,562.8,Auction,Price unavailable,3450000,2021-10-23,2021-10-23
14107,6/217 Malabar Rd,South Coogee Nsw 2034,3.0,1.0,1.0,Apartment,0.0,Private Sale,Price unavailable,1485000,2021-10-22,2021-10-22
14086,202/127 Pennant St,Parramatta Nsw 2150,2.0,2.0,1.0,Apartment,0.0,Private Sale,Price unavailable,645000,2021-10-22,2021-10-22
14138,10/31 Addison Rd,Manly Nsw 2095,3.0,2.0,2.0,Apartment,0.0,Auction,Price unavailable,3100000,2021-10-21,2021-10-21
7221,5 Glnview Cl,Bella Vista Nsw 2153,5.0,3.0,3.0,House,660.1,Auction,Price unavailable,2950000,2021-10-21,2021-10-21
15318,2/66 Helen St,Lane Cove North Nsw 2066,2.0,1.0,1.0,Apartment,0.0,Auction,Price unavailable,950000,2021-09-21,2021-10-21


In [47]:
# from dateutil.relativedelta import *

for i in sales3.index:
    entry = sales3.loc[i]
    if entry.Price_rma == 'Price unavailable':
        date_rma = entry.Date_rma
        date_gov = entry.Date_gov

        # date_1mon_before = date_gov + relativedelta(months=-2)
        # date_1mon_after = date_gov + relativedelta(months=+2)

        if date_rma<date_gov:
            sales3.at[i, 'Price_rma']=sales3.loc[i].Price_gov




In [48]:
# Removing the entries with no Price

sales4 = sales3[sales3.Price_rma!="Price unavailable"] 

# Clean up

In [49]:
sales4.isnull().sum()[sales4.isnull().sum()!=0]

Baths       9
Size        2
By          1
Date_rma    1
dtype: int64

In [50]:
# check for null values in time columns
sales4[sales4.Date_rma.isnull() | sales4.Date_gov.isnull()]

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
19728,1502/3 Network Pl,North Ryde Nsw 2113,1.0,1.0,1.0,Apartment,0.0,,705000,705000,NaT,2021-06-21


In [51]:
for i in range(0, len(sales4)):
    if pd.isnull(sales4['Date_rma'].iloc[i]):
        sales4['Date_rma'].iloc[i] = sales4['Date_gov'].iloc[i]


In [52]:
# properties without baths
sales4[sales4.Baths.isnull()]


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
14707,70/95 Annandale St,Annandale Nsw 2038,1.0,,0.0,Apartment,0.0,Private Sale,300000,300000,2021-10-04,2021-10-04
9216,102 Broomfield St,Cabramatta Nsw 2166,1.0,,1.0,House,885.2,Auction,2150000,2150000,2021-09-20,2021-09-22
16962,28/1 Dwyer St,Chippendale Nsw 2008,1.0,,0.0,Apartment,0.0,Private Sale,370000,370000,2021-08-20,2021-08-20
12528,63 Amarco Cct,The Ponds Nsw 2769,2.0,,1.0,House,600.3,Private Sale,1530000,1530000,2021-08-16,2021-08-16
17791,38/5 Darley St,Darlinghurst Nsw 2010,1.0,,0.0,Apartment,0.0,Auction,485000,485000,2021-08-03,2021-08-03
18816,109/72 Henrietta St,Waverley Nsw 2024,1.0,,0.0,Apartment,0.0,Auction,500000,500000,2021-07-14,2021-07-14
18091,618/18 Park Ln,Chippendale Nsw 2008,1.0,,0.0,Apartment,0.0,Private Sale,575000,575000,2021-07-27,2021-07-13
19106,1/80 Victoria Rd,Marrickville Nsw 2204,1.0,,0.0,Apartment,0.0,Auction,505000,505000,2021-07-08,2021-07-08
20093,402/355 Kent St,Sydney Nsw 2000,1.0,,0.0,Apartment,0.0,Private Sale,660000,660000,2021-06-22,2021-06-15


In [53]:
# after checking some of the apartment listed here on other platforms such as domain.com.au,
# I found that most of these 1 bed apartments have at lease 1 bathroom, so they are filled with 1.

sales4.loc[(sales4.Baths.isnull()) & (sales4.Type == 'Apartment'), 'Baths'] = 1


In [54]:
sales4[sales4.Baths.isnull()]


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
9216,102 Broomfield St,Cabramatta Nsw 2166,1.0,,1.0,House,885.2,Auction,2150000,2150000,2021-09-20,2021-09-22
12528,63 Amarco Cct,The Ponds Nsw 2769,2.0,,1.0,House,600.3,Private Sale,1530000,1530000,2021-08-16,2021-08-16


In [55]:
# these two etries are wrong after checking, so I simply dropped them. RMA's employees didn't do their job on this two rows.
sales5 = sales4.drop([9216, 12528], axis=0)


In [56]:
sales5[sales5.isnull().any(axis=1)]


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
11123,27B Alfred Rd,Forest Lodge Nsw 2037,2.0,2.0,2.0,House,,Auction,2085000,2085000,2021-08-31,2021-08-31
12613,5 Burnell St,Drummoyne Nsw 2047,4.0,2.0,4.0,House,,Auction,7575000,7575000,2021-08-14,2021-08-14
19728,1502/3 Network Pl,North Ryde Nsw 2113,1.0,1.0,1.0,Apartment,0.0,,705000,705000,2021-06-21,2021-06-21


In [57]:
sales5.loc[sales5.By.isnull() , 'By'] = 'Auction' # Let it be auction, doesn't matter
sales5.loc[11123, 'Size'] = 160
sales5.loc[12613, 'Size'] = 695


In [58]:
if sales5.isnull().sum().sum()==0:
    print("No null values")
else:
    print(sales5.isnull().sum()[sales5.isnull().sum()!=0])


No null values


## Final Trim

In [59]:
recent_sales=sales5.drop(columns=['Price_gov','Date_gov']) # we will use the more recent price column, which was rma's with prices filled from gov's
recent_sales.rename({'Price_rma':'Price','Date_rma':'Date'},axis=1,inplace=True) # we will use only one time column, doesn't matter if it's rma's or gov's
recent_sales.sort_values('Date',ascending=False,inplace=True)

#export to csv
recent_sales.to_csv( "./data/recent_sales.csv", index=False, encoding='utf-8-sig')


In [60]:
recent_sales

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price,Date
1,13/28A Henry St,Ashfield Nsw 2131,2.0,1.0,1.0,Unit,0.00,Auction,650000,2021-11-01
6960,4 Yvette St,Baulkham Hills Nsw 2153,4.0,2.0,1.0,House,695.60,Private Sale,1625000,2021-11-01
6958,9 Nichols Ave,Revesby Nsw 2212,3.0,1.0,1.0,House,557.34,Auction,1350000,2021-11-01
6956,52 Charlotte Rd,Rooty Hill Nsw 2766,3.0,2.0,3.0,House,550.20,Private Sale,895000,2021-11-01
6954,10 Lillypilly St,Colebee Nsw 2761,5.0,3.0,2.0,House,533.40,Private Sale,1437000,2021-11-01
...,...,...,...,...,...,...,...,...,...,...
6941,2/402 Railway Pde,Allawah Nsw 2218,2.0,1.0,2.0,Unit,0.00,Private Sale,620000,2020-11-03
6937,5/22 Ramona St,Quakers Hill Nsw 2763,4.0,2.0,2.0,Townhouse,0.00,Private Sale,750000,2020-11-03
6935,10/10 Marsden Rd,St Marys Nsw 2760,3.0,2.0,2.0,Townhouse,0.00,Private Sale,497500,2020-11-03
6944,3/240 Katoomba St,Katoomba Nsw 2780,3.0,1.0,1.0,Townhouse,0.00,Private Sale,565000,2020-11-03


In [61]:
%store recent_sales

Stored 'recent_sales' (DataFrame)
