In [1]:
from datetime import datetime, date
import warnings
import numpy as np
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')


## Combine and clean data files from the NSW government

The following code joins all of the CSV files from NSW government into a single CSV.

In [2]:
# All data files from the NSW government can be found at http://maps.six.nsw.gov.au/csv/current/suburb/
# All of the 4325 files are quite large and should be updated, so they are not included in the github repo.

# If a new gov dataset is needed, just re-download all the csv files from the website above and run this cell.
# I used chrono download manager's sniffer feature to download them all.

# I know the analysis is based on Sydney house prices but these government data files are for the whole state of NSW Australia.
# I'm sure there's a smart way to get rid of some of these files to reduce the data size, and improve process time,
# but for now I'm just too lazy to do that.

# uncomment the following code if you want to re-combine the csv files.

# extension = 'csv'
# all_filenames = [i for i in glob.glob('./data/gov/*.{}'.format(extension))]

# #combine all files in the list
# combined = pd.concat([pd.read_csv(f) for f in all_filenames ])

# # removing less useful columns
# combined = combined.drop(['MULTI-PROPERTY SALE (Y/N)', 'STRATA/NON STRATA', 'PROPERTY NUMBER',
#                'DEALING NUMBER', 'EXTRACTION DATE'], axis=1)

# # removing duplicates

# combined = combined.drop_duplicates(keep='last')

# # do the hard work now so don't have to do this again each time
# combined['SALE DATE']=pd.to_datetime(combined['SALE DATE'])

# combined.shape

# #export to csv
# combined.to_csv( "./data/gov_combined.csv", index=False, encoding='utf-8-sig')


In [3]:
# check the shape and columns of the data frame
gov1 = pd.read_csv('./data/gov_combined.csv') # if you don't want to run the prvious cell again every time, just comment it out
print(gov1.shape)
gov1.head()


(908210, 4)


Unnamed: 0,ADDRESS,SALE PRICE,SALE DATE,AREA
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,2019-08-30,400100.0
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,2017-08-22,428500.0
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,2019-12-05,398500.0
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,2021-01-20,470800.0
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,2018-10-19,402000.0


In [4]:
# Split street address and suburb

addr = gov1.ADDRESS.str.split(
    ',', n=1, expand=True).apply(lambda x: x.str.strip())
gov2 = gov1.copy()
for i in addr.columns:
    gov2[i] = addr[i]


In [5]:
gov2.head()

Unnamed: 0,ADDRESS,SALE PRICE,SALE DATE,AREA,0,1
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,2019-08-30,400100.0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,2017-08-22,428500.0,2162 AARONS PASS ROAD,AARONS PASS NSW 2850
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,2019-12-05,398500.0,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,2021-01-20,470800.0,159 SUTTERS LANE,AARONS PASS NSW 2850
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,2018-10-19,402000.0,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850


In [6]:
gov2.rename(columns={0:'ADDR', 1:'SUBURB', 'SALE DATE':'DATE','SALE PRICE':'PRICE'},inplace=True)
gov2.head()


Unnamed: 0,ADDRESS,PRICE,DATE,AREA,ADDR,SUBURB
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,2019-08-30,400100.0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,2017-08-22,428500.0,2162 AARONS PASS ROAD,AARONS PASS NSW 2850
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,2019-12-05,398500.0,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,2021-01-20,470800.0,159 SUTTERS LANE,AARONS PASS NSW 2850
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,2018-10-19,402000.0,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850


In [7]:
# Leaving only the useful columns

cols = ['ADDR','SUBURB','PRICE','AREA','DATE']
gov3 = gov2[cols]
gov3.columns = ['Address','Suburb','Price','Size','Date']
gov3.head()

Unnamed: 0,Address,Suburb,Price,Size,Date
0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,435000,400100.0,2019-08-30
1,2162 AARONS PASS ROAD,AARONS PASS NSW 2850,315000,428500.0,2017-08-22
2,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,340000,398500.0,2019-12-05
3,159 SUTTERS LANE,AARONS PASS NSW 2850,365000,470800.0,2021-01-20
4,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,700000,402000.0,2018-10-19


In [8]:
sales_gov = gov3.copy()

## Combine and clean data scraped from RateMyAgent.com.au

In [9]:
# All data files from the RateMyAgent website are scraped with some tool I shall not name.
# The data should be updated once every month.
# This cell is only needed when combining new data with existing data.
# uncomment the following code if you want to re-combine the csv files.

# extension = 'csv'
# all_filenames = [i for i in glob.glob('./data/rma/*.{}'.format(extension))]

# #combine all files in the list
# combined = pd.concat([pd.read_csv(f) for f in all_filenames ])
# #export to csv
# combined.to_csv( "./data/rma_combined.csv", index=False, encoding='utf-8-sig')


In [10]:
rma = pd.read_csv('./data/rma_combined.csv')
rma.shape

(58261, 9)

In [11]:
rma.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,239 Cope St,Waterloo NSW 2017,4.0,2.0,1.0,Townhouse,Sold by Auction on 06 Dec 2021,"$1,495,000"
1,https://www.ratemyagent.com.au/real-estate-age...,6/7 Bank Street,Meadowbank NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 06 Dec 2021,"$540,000"
2,https://www.ratemyagent.com.au/real-estate-age...,12/1 Ozone Street,Cronulla NSW 2230,2.0,1.0,1.0,Unit,Sold by Auction on 06 Dec 2021,"$1,040,000"
3,https://www.ratemyagent.com.au/real-estate-age...,2/61 Cornelia St,Wiley Park NSW 2195,2.0,1.0,1.0,Unit,Sold by Private Sale on 06 Dec 2021,"$380,000"
4,https://www.ratemyagent.com.au/real-estate-age...,28/570 President Ave,Sutherland NSW 2232,3.0,2.0,2.0,Unit,Sold by Private Sale on 06 Dec 2021,Price unavailable


In [12]:
# remove duplicates

rma1 = rma.drop_duplicates(keep='last')
rma1.shape

(40965, 9)

In [13]:
rma1.isnull().sum()

Link          108
Address         1
Suburb          1
Beds           58
Baths         122
Carpark      3843
Type         1501
Sale Type      17
Price           1
dtype: int64

In [14]:
# Dropping rows that are missing important values

rma1.dropna(subset=['Price','Beds'],inplace=True)
rma1.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,239 Cope St,Waterloo NSW 2017,4.0,2.0,1.0,Townhouse,Sold by Auction on 06 Dec 2021,"$1,495,000"
1,https://www.ratemyagent.com.au/real-estate-age...,6/7 Bank Street,Meadowbank NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 06 Dec 2021,"$540,000"
2,https://www.ratemyagent.com.au/real-estate-age...,12/1 Ozone Street,Cronulla NSW 2230,2.0,1.0,1.0,Unit,Sold by Auction on 06 Dec 2021,"$1,040,000"
3,https://www.ratemyagent.com.au/real-estate-age...,2/61 Cornelia St,Wiley Park NSW 2195,2.0,1.0,1.0,Unit,Sold by Private Sale on 06 Dec 2021,"$380,000"
4,https://www.ratemyagent.com.au/real-estate-age...,28/570 President Ave,Sutherland NSW 2232,3.0,2.0,2.0,Unit,Sold by Private Sale on 06 Dec 2021,Price unavailable


In [15]:
# Fill null values in column: "Type". If there's a slash(/) in the address, it's an apartment, else it's a house.

nulls = rma1[rma1['Type'].isnull()]
apartments = nulls[nulls.Address.str.contains('/')]
houses = nulls[~nulls.Address.str.contains('/')]
for i in houses.index:
    rma1.at[i,'Type'] = 'House'
for i in apartments.index:
    rma1.at[i,'Type'] = 'Apartment'


In [16]:
# Fill null values in column: "Carpark". If it's an apartment, there's no parking. If it's a house, there's 2 parking spaces.

nulls = rma1[rma1['Carpark'].isnull()]
apartments = nulls[nulls.Address.str.contains('/')]
houses = nulls[~nulls.Address.str.contains('/')]
for i in houses.index:
    rma1.at[i,'Carpark'] = 1
for i in apartments.index:
    rma1.at[i,'Carpark'] = 0

In [17]:
rma1.isnull().sum()

Link         107
Address        0
Suburb         0
Beds           0
Baths         64
Carpark        0
Type           0
Sale Type     16
Price          0
dtype: int64

In [18]:
rma1.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,239 Cope St,Waterloo NSW 2017,4.0,2.0,1.0,Townhouse,Sold by Auction on 06 Dec 2021,"$1,495,000"
1,https://www.ratemyagent.com.au/real-estate-age...,6/7 Bank Street,Meadowbank NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 06 Dec 2021,"$540,000"
2,https://www.ratemyagent.com.au/real-estate-age...,12/1 Ozone Street,Cronulla NSW 2230,2.0,1.0,1.0,Unit,Sold by Auction on 06 Dec 2021,"$1,040,000"
3,https://www.ratemyagent.com.au/real-estate-age...,2/61 Cornelia St,Wiley Park NSW 2195,2.0,1.0,1.0,Unit,Sold by Private Sale on 06 Dec 2021,"$380,000"
4,https://www.ratemyagent.com.au/real-estate-age...,28/570 President Ave,Sutherland NSW 2232,3.0,2.0,2.0,Unit,Sold by Private Sale on 06 Dec 2021,Price unavailable


In [19]:
# Split sale type and date

rma1['Sale Type'] = rma['Sale Type'].str.replace('Sold by ','' )

In [20]:
saletype = rma1['Sale Type'].str.split(' on ',expand=True)
for i in saletype.columns:
    rma1[i] = saletype[i]

In [21]:
rma2 = rma1.drop('Sale Type',axis=1)
rma2.rename({0:'By',1:'Date'},axis=1,inplace=True)

# trim price
rma2['Price'] = rma2['Price'].str.replace('$','').str.replace(',','')

# to datetime just in case
rma2['Date']=pd.to_datetime(rma2['Date'])



In [22]:
sales_rma = rma2.copy()
sales_rma

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price,By,Date
0,https://www.ratemyagent.com.au/real-estate-age...,239 Cope St,Waterloo NSW 2017,4.0,2.0,1.0,Townhouse,1495000,Auction,2021-12-06
1,https://www.ratemyagent.com.au/real-estate-age...,6/7 Bank Street,Meadowbank NSW 2114,2.0,1.0,1.0,Unit,540000,Private Sale,2021-12-06
2,https://www.ratemyagent.com.au/real-estate-age...,12/1 Ozone Street,Cronulla NSW 2230,2.0,1.0,1.0,Unit,1040000,Auction,2021-12-06
3,https://www.ratemyagent.com.au/real-estate-age...,2/61 Cornelia St,Wiley Park NSW 2195,2.0,1.0,1.0,Unit,380000,Private Sale,2021-12-06
4,https://www.ratemyagent.com.au/real-estate-age...,28/570 President Ave,Sutherland NSW 2232,3.0,2.0,2.0,Unit,Price unavailable,Private Sale,2021-12-06
...,...,...,...,...,...,...,...,...,...,...
58256,https://www.ratemyagent.com.au/real-estate-age...,B508/26 Cambridge Street,Epping NSW 2121,2.0,2.0,1.0,Apartment,Price unavailable,Private Sale,2021-07-09
58257,https://www.ratemyagent.com.au/real-estate-age...,205/628 Canterbury Road,Belmore NSW 2192,2.0,2.0,1.0,Apartment,625000,Private Sale,2021-07-09
58258,https://www.ratemyagent.com.au/real-estate-age...,4.09/655 King Street,Newtown NSW 2042,2.0,2.0,1.0,Apartment,1050000,Private Sale,2021-07-08
58259,https://www.ratemyagent.com.au/real-estate-age...,2/12 Bentley St,Balgowlah NSW 2093,2.0,1.0,1.0,Apartment,1227000,Private Sale,2021-07-08


## Prepare datasets before joining

Unify Street Names (Street -> St, etc.)

In [23]:
sales_gov.Address = sales_gov.Address.str.title().str.strip()
sales_gov.Suburb = sales_gov.Suburb.str.title().str.strip()
sales_rma.Address = sales_rma.Address.str.title().str.strip()
sales_rma.Suburb = sales_rma.Suburb.str.title().str.strip()


In [24]:
road_names = sales_rma.Address.str.split(' ', expand = True)


In [25]:
rd_names = road_names[2].unique()

filtered = []
for n in rd_names:
    if n == None:
        pass
    elif len(n)<3:
            filtered.append(n)

filtered


['St',
 'Rd',
 'Dr',
 'Pl',
 'Ln',
 'Cl',
 '4',
 'Sp',
 'La',
 '16',
 'Ct',
 'N',
 'On',
 '&',
 'De',
 'Vw',
 'Of',
 'Cr',
 'Is',
 'Av']

In [26]:
road_names[road_names[2]=='Cl']



Unnamed: 0,0,1,2,3,4,5,6,7,8
306,8/1,Ferndale,Cl,,,,,,
793,15,Bandicoot,Cl,,,,,,
8942,20,Lisa,Cl,,,,,,
8956,2/23,Alex,Cl,,,,,,
9016,1/14,Johnson,Cl,,,,,,
...,...,...,...,...,...,...,...,...,...
50504,7/5,Fairway,Cl,,,,,,
54826,114/6,Nile,Cl,,,,,,
55075,444/1,Searay,Cl,,,,,,
56278,5/11,Fairway,Cl,,,,,,


In [27]:
rma2.loc[237]

Link       https://www.ratemyagent.com.au/real-estate-age...
Address                                  4/61 Portico Parade
Suburb                                   Toongabbie NSW 2146
Beds                                                     4.0
Baths                                                    2.0
Carpark                                                  2.0
Type                                               Townhouse
Price                                                 856000
By                                             Private Sale 
Date                                     2021-11-26 00:00:00
Name: 237, dtype: object

In [28]:
abbr = [
    [' Street',' St'],
    [' Road',' Rd'],
    [' Avenue',' Ave'],
    [' Place',' Pl'],
    [' Close',' Cl'],
    [' Lane',' Ln'],
    [' Drive',' Dr'],
    [' Highway',' Hwy'],
    [' Parade',' Pde'],
    [' Square',' Sq'],
    [' Court',' Ct'],
    [' Glade',' Gld'],

    ['Parkway','Pkwy'],
    ['Boulevard','Blvd'],
    ['Circuit','Cct'],
    
    [' st',' St'],
    [' st.',' St'],
    [' ST',' St'],
    [' RD',' Rd'],
    [' ave', 'Ave'],
    [' AVE', 'Ave'],
    [' road',' Rd'],
    [' ROAD',' Rd'],
    ['Crt','Ct'],
    ['Pde.','Pde'],

    [' Crescent',' Cres'],
    [' Glen',' Gln'],
    [' Plaza', 'Plz'],
    [' View', ' Vw'],

    ['Mallard La', 'Mallard Ln']
]

In [29]:
# replace all street address names with abbreviation
for row in abbr:
    sales_gov.Address = sales_gov.Address.str.replace(row[0],row[1])
    sales_rma.Address = sales_rma.Address.str.replace(row[0],row[1])

In [30]:
sales_gov.sample(5)

Unnamed: 0,Address,Suburb,Price,Size,Date
713508,26B St George Cres,Sandy Point Nsw 2172,1120000,1081.7,2017-08-28
219737,85 Plantation Rd,Cudgen Nsw 2487,1250000,2285.0,2019-04-16
678911,2031 Emmaville Rd,Reddestone Nsw 2370,105000,400500.0,2017-07-04
166006,1463 Cells Rd,Cells River Nsw 2424,150000,161900.0,2020-09-10
255641,4/58 Forrest Rd,East Hills Nsw 2213,851000,0.0,2017-11-29


Unifying Suburb Names:

In [31]:
rma_suburbs = pd.DataFrame(sales_rma.Suburb.unique())
rma_suburbs.columns = ['Name']


In [32]:
gov_suburbs = pd.DataFrame(sales_gov.Suburb.unique())
gov_suburbs.columns = ['Name']

In [33]:
rma_suburbs.Name.isin(gov_suburbs.Name).astype(int).unique()

array([1, 0])

Seems every suburb is included in the NSW dataset, which means there's no error in the Naming

## Join Two Datasets

In [34]:
sales = pd.merge(sales_rma,sales_gov, on=['Address','Suburb'])
sales.shape

(25694, 13)

In [35]:
sales

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price_x,By,Date_x,Price_y,Size,Date_y
0,https://www.ratemyagent.com.au/real-estate-age...,2/61 Cornelia St,Wiley Park Nsw 2195,2.0,1.0,1.0,Unit,380000,Private Sale,2021-12-06,345000,0.0,2020-10-17
1,https://www.ratemyagent.com.au/real-estate-age...,28/570 President Ave,Sutherland Nsw 2232,3.0,2.0,2.0,Unit,Price unavailable,Private Sale,2021-12-06,55000,0.0,2021-07-01
2,https://www.ratemyagent.com.au/real-estate-age...,6/5 Nielsen Ave,Carlton Nsw 2218,2.0,1.0,1.0,Unit,740000,Auction,2021-12-04,598000,0.0,2017-03-17
3,https://www.ratemyagent.com.au/real-estate-age...,6/190 Park Rd,Auburn Nsw 2144,3.0,2.0,2.0,Unit,Price unavailable,Private Sale,2021-12-03,680000,0.0,2021-06-19
4,https://www.ratemyagent.com.au/real-estate-age...,18/2A Killara Ave,Killara Nsw 2071,3.0,2.0,2.0,Unit,Price unavailable,Auction,2021-12-03,1725000,0.0,2017-07-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25689,https://www.ratemyagent.com.au/real-estate-age...,205/628 Canterbury Rd,Belmore Nsw 2192,2.0,2.0,1.0,Apartment,625000,Private Sale,2021-07-09,625000,0.0,2021-06-17
25690,https://www.ratemyagent.com.au/real-estate-age...,205/628 Canterbury Rd,Belmore Nsw 2192,2.0,2.0,1.0,Apartment,625000,Private Sale,2021-07-09,600000,0.0,2017-05-23
25691,https://www.ratemyagent.com.au/real-estate-age...,2/12 Bentley St,Balgowlah Nsw 2093,2.0,1.0,1.0,Apartment,1227000,Private Sale,2021-07-08,1227000,0.0,2021-07-08
25692,https://www.ratemyagent.com.au/real-estate-age...,2/12 Bentley St,Balgowlah Nsw 2093,2.0,1.0,1.0,Apartment,1227000,Private Sale,2021-07-08,919000,0.0,2019-03-27


In [36]:
# sort by time
sales1=sales.copy()
sales1['Date_y'] = pd.to_datetime(sales['Date_y'])
sales1['Date_x'] = pd.to_datetime(sales['Date_x'])

sales1.sort_values(by='Date_y', ascending=False , inplace=True)
# sales1.set_index(keys='Date_y', drop=True, inplace=True)

In [37]:
sales1.sample(10)

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price_x,By,Date_x,Price_y,Size,Date_y
3340,https://www.ratemyagent.com.au/real-estate-age...,1/11 Myahgah Rd,Mosman Nsw 2088,2.0,1.0,0.0,Apartment,Price unavailable,Private Sale,2021-08-10,1150000,0.0,2021-08-10
5183,https://www.ratemyagent.com.au/real-estate-age...,11/15 Bryant St,Padstow Nsw 2211,3.0,1.0,1.0,Townhouse,720000,Private Sale,2021-05-12,720000,0.0,2021-04-28
994,https://www.ratemyagent.com.au/real-estate-age...,1102/65 Manning St,Kiama Nsw 2533,2.0,2.0,1.0,Unit,1000000,Private Sale,2021-11-30,615000,0.0,2016-12-19
1781,https://www.ratemyagent.com.au/real-estate-age...,92/15 Lorraine Ave,Berkeley Vale Nsw 2261,2.0,1.0,1.0,Unit,405000,Private Sale,2021-06-25,370000,0.0,2018-06-16
2216,https://www.ratemyagent.com.au/real-estate-age...,3/46 Solander St,Monterey Nsw 2217,2.0,1.0,1.0,Unit,800000,Private Sale,2021-10-13,685000,0.0,2019-11-12
18927,https://www.ratemyagent.com.au/real-estate-age...,158 Gannons Rd,Caringbah South Nsw 2229,3.0,3.0,4.0,House,2800000,Auction,2021-11-02,1525000,745.38,2018-11-22
24911,https://www.ratemyagent.com.au/real-estate-age...,902/5 Waterways St,Wentworth Point Nsw 2127,1.0,1.0,1.0,Apartment,585000,Private Sale,2021-07-26,585000,0.0,2021-07-07
8136,https://www.ratemyagent.com.au/real-estate-age...,11/283 Maroubra Rd,Maroubra Nsw 2035,2.0,1.0,1.0,Unit,842500,Auction,2020-11-24,842500,0.0,2020-11-24
19948,https://www.ratemyagent.com.au/real-estate-age...,17 Patrick Ave,Castle Hill Nsw 2154,5.0,1.0,2.0,House,2200000,Private Sale,2021-10-06,2200000,929.3,2021-10-05
13271,https://www.ratemyagent.com.au/real-estate-age...,35 Spark St,Earlwood Nsw 2206,3.0,1.0,3.0,House,1525000,Auction,2021-08-07,1525000,373.1,2021-08-07


In [38]:
cols = ['Address','Suburb','Beds','Baths','Carpark','Type','Size','By','Price_x','Price_y', 'Date_x','Date_y']
sales2 = sales1[cols]

In [39]:
sales2.rename({'Price_x':'Price_rma','Price_y':'Price_gov','Date_x':'Date_rma','Date_y':'Date_gov'},axis=1,inplace=True)
sales2.shape

(25694, 12)

In [40]:
sales2 = sales2.drop_duplicates(subset=['Address','Suburb','Price_gov'],keep='last')

In [41]:
sales2.head(10)

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
11129,38 Nightingale Sq,Glossodia Nsw 2756,3.0,1.0,2.0,House,980.1,Private Sale,880000,880000,2021-08-31,2021-11-17
9419,5 Ryan St,Lilyfield Nsw 2040,5.0,3.0,1.0,House,219.7,Auction,2830000,2830000,2021-09-18,2021-11-17
8745,20 Gatley Ct,Wattle Grove Nsw 2173,4.0,2.0,1.0,House,483.0,Private Sale,1130000,1130000,2021-10-19,2021-11-16
18484,12 John St,Concord Nsw 2137,2.0,1.0,2.0,House,392.0,Private Sale,Price unavailable,1750000,2021-11-19,2021-11-10
22489,2/17 Isabel Ave,Vaucluse Nsw 2030,3.0,2.0,2.0,Apartment,0.0,Auction,Price unavailable,3462000,2021-09-23,2021-11-10
20932,20 Prince St,Canley Heights Nsw 2166,3.0,1.0,1.0,House,442.6,Private Sale,850000,850000,2021-09-22,2021-11-08
18797,9 Flinders Cres,Hinchinbrook Nsw 2168,4.0,2.0,2.0,House,654.7,Auction,1110000,1110000,2021-11-07,2021-11-07
20407,81 Coogee Bay Rd,Randwick Nsw 2031,3.0,2.0,1.0,House,297.2,Auction,Price unavailable,3080000,2021-09-29,2021-11-05
1033,304/123 King St,Newcastle Nsw 2300,1.0,1.0,0.0,Apartment,0.0,Private Sale,Price unavailable,580000,2021-11-04,2021-11-04
21361,121/361 Kent St,Sydney Nsw 2000,2.0,2.0,1.0,Apartment,0.0,Auction,920000,920000,2021-11-02,2021-11-02


## Showing the differences between the two datasets

In [42]:
sales_rma[(sales_rma['Suburb']=='West Ryde Nsw 2114')].sort_values('Date',ascending=False)[0:10].drop(['Link','Type','By','Beds','Baths','Carpark'],axis=1)

Unnamed: 0,Address,Suburb,Price,Date
38414,45 Bank St,West Ryde Nsw 2114,1850000,2021-12-06
38444,68A Brush Rd,West Ryde Nsw 2114,2388000,2021-12-05
38498,21 Moss St,West Ryde Nsw 2114,2630000,2021-12-04
38588,21 Moss St,West Ryde Nsw 2114,2630000,2021-12-04
38663,11 Griffiths Ave,West Ryde Nsw 2114,Price unavailable,2021-12-04
48434,11/4 Union St,West Ryde Nsw 2114,665000,2021-12-03
39073,71 Moss St,West Ryde Nsw 2114,2805000,2021-12-01
39191,1073 Victoria Rd,West Ryde Nsw 2114,1600000,2021-11-30
39532,15 Hermitage Rd,West Ryde Nsw 2114,2310000,2021-11-27
39487,89 Winbourne St,West Ryde Nsw 2114,2350000,2021-11-27


In [43]:
sales_gov[(sales_gov['Suburb']=='West Ryde Nsw 2114')].sort_values('Date',ascending=False)[0:10].drop(['Size'],axis=1)

Unnamed: 0,Address,Suburb,Price,Date
865010,7/15 Riverview St,West Ryde Nsw 2114,475000,2021-10-08
865108,6/4 Union St,West Ryde Nsw 2114,580000,2021-10-05
864860,7/1A Macpherson St,West Ryde Nsw 2114,1400000,2021-10-02
865289,32/22 Herbert St,West Ryde Nsw 2114,760000,2021-10-01
864503,4/12 Adelaide St,West Ryde Nsw 2114,12200,2021-09-24
865058,4/84 Station St,West Ryde Nsw 2114,655000,2021-09-22
865203,67/61 West Pde,West Ryde Nsw 2114,482000,2021-09-21
864837,12 Lambert St,West Ryde Nsw 2114,1840000,2021-09-18
864685,9/2 Mulvihill St,West Ryde Nsw 2114,285000,2021-09-16
865204,30/61 West Pde,West Ryde Nsw 2114,660000,2021-09-15


In [44]:
sales2[sales2['Suburb']=='West Ryde Nsw 2114'].sort_values('Date_rma',ascending=False)[0:5]

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
20951,11/4 Union St,West Ryde Nsw 2114,2.0,1.0,1.0,Apartment,0.0,Auction,665000,650000,2021-12-03,2018-04-24
21090,6/4 Union St,West Ryde Nsw 2114,2.0,1.0,1.0,Apartment,0.0,Private Sale,Price unavailable,580000,2021-11-22,2021-10-05
18439,27 Linton Ave,West Ryde Nsw 2114,5.0,2.0,2.0,House,646.56,Auction,2880000,1810000,2021-11-20,2017-02-25
18638,17 Falconer St,West Ryde Nsw 2114,3.0,2.0,1.0,House,687.63,Private Sale,2520000,1281000,2021-11-13,2019-06-06
19163,23 Reserve St,West Ryde Nsw 2114,5.0,4.0,2.0,House,809.4,Auction,2461000,2060000,2021-10-23,2017-12-09


1. RMA dataset is newer, Gov dataset ends 24 Oct, 2021.
2. Although RMA's records are newer, sometimes the date stamp comes later than the Gov records.
3. RMA dataset contains some of the sales that's not been submitted to the government. E.g., some of the sales from Sept 2021 are still not in the Gov dataset.
4. RMA dataset is not complete, some of the sales that's in the Gov dataset is not in the RMA dataset. Those properties were probably sold privately or were sold on other platforms.


In [45]:
# This cell exists solely because someone may want to remove the flipped houses (sold again within a few months),
# uncomment the next line if you want one property to have only one sale record, which has the most recent sale record.

# sales3 = sales2.drop_duplicates(subset=['Address','Suburb'],keep='first') 
sales3 = sales2.copy()
sales3.shape

(23685, 12)

## Fill "Price unavailable" from government dataset

The rma data set contains the latest sale prices, but some the prices are listed as "unavailable", either because they are sold else where or because the price was never disclosed by the seller or buyer.

Lucky for us, these transactions have to be disclosed to the government within a few months of selling. So, we can get the missing prices from the government dataset.

** "Last Known Price" Disclaimer

For most of the time, the government dataset lags behind the real time sales, because people need time to settle the deal, sometimes by several months. This is why we can eventually get all of the price data from the government. But for the recent months, we will have to make do with the data from RMA.

One problem caused by filling the price from the government dataset is: if a property was sold again within a few months, the price we fill from the government will be the last known prices, but not the most recent ones.

E.g., 3 month ago, a house was sold by private sale for 1.0 million. Since it's "private", rma didn't get the price (sometimes they do although this time they didn't), so the price of this sale was marked as "unavailable". Then the transaction went through in a few days, and the government got the settled price: "1.0 million" and recorded it. Fast forward to today, the house was sold again for 1.5 million, RMA still didn't get the price(maybe another private sale), so this time it was still marked as "price unavailable". If at this stage we get the data from both rma and government, and fill the price recorded by the government of 1.0 million to this sale, then apparently it's wrong.

In short, this means the data can be misleading, and the house price may not be the actual price, but the "last known price". Fortunately if we update the government dataset constantly in the following months, we will eventually get the accurate prices for recent sales, but of course, the data won't be as fresh by then.

In [46]:
print('There are',sales3[sales3.Price_rma == 'Price unavailable'].shape[0],'records without price.')
sales3[sales3.Price_rma == 'Price unavailable'][0:10]

There are 7092 records without price.


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
18484,12 John St,Concord Nsw 2137,2.0,1.0,2.0,House,392.0,Private Sale,Price unavailable,1750000,2021-11-19,2021-11-10
22489,2/17 Isabel Ave,Vaucluse Nsw 2030,3.0,2.0,2.0,Apartment,0.0,Auction,Price unavailable,3462000,2021-09-23,2021-11-10
20407,81 Coogee Bay Rd,Randwick Nsw 2031,3.0,2.0,1.0,House,297.2,Auction,Price unavailable,3080000,2021-09-29,2021-11-05
1033,304/123 King St,Newcastle Nsw 2300,1.0,1.0,0.0,Apartment,0.0,Private Sale,Price unavailable,580000,2021-11-04,2021-11-04
153,4/22 Hillcrest Rd,Quakers Hill Nsw 2763,3.0,2.0,2.0,Townhouse,0.0,Private Sale,Price unavailable,725000,2021-11-06,2021-10-31
228,3/38 Simpson St,Dundas Valley Nsw 2117,3.0,1.0,2.0,Townhouse,0.0,Private Sale,Price unavailable,1065000,2021-10-20,2021-10-28
21377,61/105 Church St,Ryde Nsw 2112,2.0,2.0,1.0,Apartment,0.0,Private Sale,Price unavailable,750000,2021-11-01,2021-10-26
19120,7A South Creek Rd,Dee Why Nsw 2099,3.0,1.0,2.0,House,323.7,Private Sale,Price unavailable,1890000,2021-10-26,2021-10-26
19178,6 Coolaroo Rd,Lane Cove North Nsw 2066,3.0,1.0,3.0,House,562.8,Auction,Price unavailable,3450000,2021-10-23,2021-10-23
19162,97 Hargrave St,Paddington Nsw 2021,3.0,1.0,1.0,House,94.84,Auction,Price unavailable,2755000,2021-10-23,2021-10-23


In [47]:
# from dateutil.relativedelta import *

for i in sales3.index:
    entry = sales3.loc[i]
    if entry.Price_rma == 'Price unavailable':
        date_rma = entry.Date_rma
        date_gov = entry.Date_gov

        # date_1mon_before = date_gov + relativedelta(months=-2)
        # date_1mon_after = date_gov + relativedelta(months=+2)

        if date_rma<date_gov:
            sales3.at[i, 'Price_rma']=sales3.loc[i].Price_gov




In [48]:
# Removing the entries with no Price

sales4 = sales3[sales3.Price_rma!="Price unavailable"] 

# Clean up

In [49]:
sales4.isnull().sum()[sales4.isnull().sum()!=0]

Baths       10
Size         3
By           1
Date_rma     1
dtype: int64

In [50]:
# check for null values in time columns
sales4[sales4.Date_rma.isnull() | sales4.Date_gov.isnull()]

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
16449,1502/3 Network Pl,North Ryde Nsw 2113,1.0,1.0,1.0,Apartment,0.0,,705000,705000,NaT,2021-06-21


In [51]:
for i in range(0, len(sales4)):
    if pd.isnull(sales4['Date_rma'].iloc[i]):
        sales4['Date_rma'].iloc[i] = sales4['Date_gov'].iloc[i]


In [52]:
# properties without baths
sales4[sales4.Baths.isnull()]


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
14195,70/95 Annandale St,Annandale Nsw 2038,1.0,,0.0,Apartment,0.0,Private Sale,300000,300000,2021-10-04,2021-10-04
9210,102 Broomfield St,Cabramatta Nsw 2166,1.0,,1.0,House,885.2,Auction,2150000,2150000,2021-09-20,2021-09-22
15012,28/1 Dwyer St,Chippendale Nsw 2008,1.0,,0.0,Apartment,0.0,Private Sale,370000,370000,2021-08-20,2021-08-20
12528,63 Amarco Cct,The Ponds Nsw 2769,2.0,,1.0,House,600.3,Private Sale,1530000,1530000,2021-08-16,2021-08-16
15416,38/5 Darley St,Darlinghurst Nsw 2010,1.0,,0.0,Apartment,0.0,Auction,485000,485000,2021-08-03,2021-08-03
15736,109/72 Henrietta St,Waverley Nsw 2024,1.0,,0.0,Apartment,0.0,Auction,500000,500000,2021-07-14,2021-07-14
15514,618/18 Park Ln,Chippendale Nsw 2008,1.0,,0.0,Apartment,0.0,Private Sale,575000,575000,2021-07-27,2021-07-13
15827,1/80 Victoria Rd,Marrickville Nsw 2204,1.0,,0.0,Apartment,0.0,Auction,505000,505000,2021-07-08,2021-07-08
16814,402/355 Kent St,Sydney Nsw 2000,1.0,,0.0,Apartment,0.0,Private Sale,660000,660000,2021-06-22,2021-06-15
21033,30/14 Ward Ave,Rushcutters Bay Nsw 2011,1.0,,0.0,Apartment,0.0,Private Sale,470000,435000,2021-11-26,2019-03-11


In [53]:
# after checking some of the apartment listed here on other platforms such as domain.com.au,
# I found that most of these 1 bed apartments have at lease 1 bathroom, so they are filled with 1.

sales4.loc[(sales4.Baths.isnull()) & (sales4.Type == 'Apartment'), 'Baths'] = 1


In [54]:
sales4[sales4.isnull().any(axis=1)]


Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
9210,102 Broomfield St,Cabramatta Nsw 2166,1.0,,1.0,House,885.2,Auction,2150000,2150000,2021-09-20,2021-09-22
11121,27B Alfred Rd,Forest Lodge Nsw 2037,2.0,2.0,2.0,House,,Auction,2085000,2085000,2021-08-31,2021-08-31
12528,63 Amarco Cct,The Ponds Nsw 2769,2.0,,1.0,House,600.3,Private Sale,1530000,1530000,2021-08-16,2021-08-16
12613,5 Burnell St,Drummoyne Nsw 2047,4.0,2.0,4.0,House,,Auction,7575000,7575000,2021-08-14,2021-08-14
16449,1502/3 Network Pl,North Ryde Nsw 2113,1.0,1.0,1.0,Apartment,0.0,,705000,705000,2021-06-21,2021-06-21
19358,13 Hinton St,Spring Farm Nsw 2570,3.0,2.0,1.0,House,,Private Sale,761000,589950,2021-10-18,2018-08-13


In [55]:
sales4.shape

(16765, 12)

In [56]:
# these two etries are wrong after checking, so I simply dropped them. RMA's employees didn't do their job on this two rows.
sales5 = sales4.dropna()


In [57]:
sales5.shape


(16759, 12)

## Final Trim

In [58]:
recent_sales=sales5.drop(columns=['Price_gov','Date_gov']) # we will use the more recent price column, which was rma's with prices filled from gov's
recent_sales.rename({'Price_rma':'Price','Date_rma':'Date'},axis=1,inplace=True) # we will use only one time column, doesn't matter if it's rma's or gov's
recent_sales.sort_values('Date',ascending=False,inplace=True)

#export to csv
recent_sales.to_csv( "./data/recent_sales.csv", index=False, encoding='utf-8-sig')


In [59]:
recent_sales

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price,Date
18140,17 Moon Cres,Schofields Nsw 2762,5.0,3.0,2.0,House,363.2,Private Sale,1300000,2021-12-06
18142,96 Kenthurst Rd,Kenthurst Nsw 2156,4.0,5.0,5.0,House,8944.0,Auction,3300000,2021-12-06
18141,9 High St,Cabramatta West Nsw 2166,6.0,4.0,2.0,House,329.7,Private Sale,1150000,2021-12-06
18136,77 Park Rd,Kogarah Bay Nsw 2217,3.0,2.0,2.0,House,449.0,Private Sale,1775000,2021-12-06
985,18/2 Hillview Cres,Tuggerah Nsw 2259,2.0,1.0,1.0,Townhouse,0.0,Private Sale,615000,2021-12-06
...,...,...,...,...,...,...,...,...,...,...
8632,68/107 Pacific Hwy,Hornsby Nsw 2077,3.0,2.0,2.0,Unit,0.0,Private Sale,965000,2020-11-03
8631,3/240 Katoomba St,Katoomba Nsw 2780,3.0,1.0,1.0,Townhouse,0.0,Private Sale,565000,2020-11-03
8630,3/240 Katoomba St,Katoomba Nsw 2780,3.0,1.0,1.0,Townhouse,0.0,Private Sale,565000,2020-11-03
8616,43/344 West Botany St,Brighton-Le-Sands Nsw 2216,4.0,3.0,2.0,Townhouse,0.0,Private Sale,1420000,2020-11-03


In [60]:
%store recent_sales

Stored 'recent_sales' (DataFrame)
