In [1]:
from datetime import datetime, date
import warnings
import numpy as np
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')


## Combine and clean data files from the NSW government

The following code joins all of the CSV files from the NSW government into a single CSV file.

In [2]:
# All data files from the NSW government can be found at http://maps.six.nsw.gov.au/csv/current/suburb/
# All of the 4325 files are quite large and should be updated, so they are not included in the GitHub repo.
# Re-download all the CSV files from the website above and run this cell if a new gov dataset is needed.
# I used Chrono Download Manager's sniffer feature to download them all.

extension = 'csv'
# Sort the paths: glob returns them in arbitrary, OS-dependent order, which would
# otherwise make the row order of the combined CSV differ from run to run.
all_filenames = sorted(glob.glob('./data/gov/*.{}'.format(extension)))
if not all_filenames:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ./data/gov/ - download them first.')

# combine all files in the list
combined = pd.concat([pd.read_csv(f) for f in all_filenames])

# removing less useful columns
combined = combined.drop(columns=['MULTI-PROPERTY SALE (Y/N)', 'STRATA/NON STRATA', 'PROPERTY NUMBER',
                                  'DEALING NUMBER', 'EXTRACTION DATE'])

# removing duplicates (rows that are identical across every remaining column)
combined = combined.drop_duplicates(keep='last')

# export to csv
combined.to_csv("./data/gov_combined.csv", index=False, encoding='utf-8-sig')

# last expression of the cell, so the combined shape is actually displayed
combined.shape


In [3]:
# Reload the combined government CSV from disk and check the shape and columns of the data frame
gov1 = pd.read_csv('./data/gov_combined.csv') # if you don't want to run the previous cell again every time, just comment it out
print(gov1.shape)
gov1.head()


(908210, 4)


Unnamed: 0,ADDRESS,SALE PRICE,SALE DATE,AREA
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,30 August 2019,400100.0
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,22 August 2017,428500.0
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,5 December 2019,398500.0
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,20 January 2021,470800.0
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,19 October 2018,402000.0


In [4]:
# Break ADDRESS at its first comma: the left piece is the street address,
# the right piece is the suburb / state / postcode. Both pieces are whitespace-trimmed.
pieces = gov1['ADDRESS'].str.split(',', n=1, expand=True)
addr = pieces.apply(lambda col: col.str.strip())

# Append the pieces (still carrying their integer column labels) to a copy of the raw frame.
gov2 = pd.concat([gov1, addr], axis=1)


In [5]:
gov2.head()

Unnamed: 0,ADDRESS,SALE PRICE,SALE DATE,AREA,0,1
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,30 August 2019,400100.0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,22 August 2017,428500.0,2162 AARONS PASS ROAD,AARONS PASS NSW 2850
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,5 December 2019,398500.0,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,20 January 2021,470800.0,159 SUTTERS LANE,AARONS PASS NSW 2850
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,19 October 2018,402000.0,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850


In [6]:
# Give the split-out pieces and the sale columns short, consistent names.
column_renames = {0: 'ADDR', 1: 'SUBURB', 'SALE DATE': 'DATE', 'SALE PRICE': 'PRICE'}
gov2 = gov2.rename(columns=column_renames)
gov2.head()


Unnamed: 0,ADDRESS,PRICE,DATE,AREA,ADDR,SUBURB
0,"7685 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",435000,30 August 2019,400100.0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
1,"2162 AARONS PASS ROAD, AARONS PASS NSW 2850",315000,22 August 2017,428500.0,2162 AARONS PASS ROAD,AARONS PASS NSW 2850
2,"7749 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",340000,5 December 2019,398500.0,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850
3,"159 SUTTERS LANE, AARONS PASS NSW 2850",365000,20 January 2021,470800.0,159 SUTTERS LANE,AARONS PASS NSW 2850
4,"8248 CASTLEREAGH HIGHWAY, AARONS PASS NSW 2850",700000,19 October 2018,402000.0,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850


In [7]:
# Keep only the columns used downstream and relabel them to match the RateMyAgent naming.
cols = ['ADDR', 'SUBURB', 'PRICE', 'AREA', 'DATE']
friendly_names = dict(zip(cols, ['Address', 'Suburb', 'Price', 'Size', 'Date']))
gov3 = gov2[cols].rename(columns=friendly_names)
gov3.head()

Unnamed: 0,Address,Suburb,Price,Size,Date
0,7685 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,435000,400100.0,30 August 2019
1,2162 AARONS PASS ROAD,AARONS PASS NSW 2850,315000,428500.0,22 August 2017
2,7749 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,340000,398500.0,5 December 2019
3,159 SUTTERS LANE,AARONS PASS NSW 2850,365000,470800.0,20 January 2021
4,8248 CASTLEREAGH HIGHWAY,AARONS PASS NSW 2850,700000,402000.0,19 October 2018


In [8]:
sales_gov = gov3.copy()

## Combine and clean data scraped from RateMyAgent.com.au

In [9]:
# All data files from the RateMyAgent website are scraped with a tool.
# The data should be updated once every month.

extension = 'csv'
# Sort the paths: glob returns them in arbitrary, OS-dependent order, which would
# otherwise make the row order of the combined CSV differ from run to run.
all_filenames = sorted(glob.glob('./data/rma/*.{}'.format(extension)))
if not all_filenames:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in ./data/rma/ - scrape or copy them first.')

# combine all files in the list
combined = pd.concat([pd.read_csv(f) for f in all_filenames])
# export to csv
combined.to_csv("./data/rma_combined.csv", index=False, encoding='utf-8-sig')


In [10]:
rma = pd.read_csv('./data/rma_combined.csv')
rma.shape

(28405, 9)

In [11]:
rma.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,Sold by Auction on 02 Nov 2021,"$1,950,000"
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,Sold by Private Sale on 01 Nov 2021,"$960,000"
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,Sold by Private Sale on 01 Nov 2021,"$740,000"


In [12]:
# remove duplicates

rma1 = rma.drop_duplicates(keep='last')
rma1.shape

(28353, 9)

In [13]:
rma1.isnull().sum()

Link           74
Address         1
Suburb          1
Beds           38
Baths          71
Carpark      2159
Type          220
Sale Type       9
Price           1
dtype: int64

In [14]:
# A listing without a price or a bedroom count is unusable, so discard those rows.
required = ['Price', 'Beds']
rma1 = rma1.dropna(subset=required)
rma1.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,Sold by Auction on 02 Nov 2021,"$1,950,000"
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,Sold by Private Sale on 01 Nov 2021,"$960,000"
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,Sold by Private Sale on 01 Nov 2021,"$740,000"


In [15]:
# Fill null values in column "Type". If there's a slash (/) in the address, it's an apartment, else it's a house.
# Done with boolean masks instead of a per-row loop; only rows whose Type is null are touched.

type_missing = rma1['Type'].isnull()
# na=False: a missing address counts as "no slash" rather than breaking the boolean mask.
has_unit_number = rma1['Address'].str.contains('/', regex=False, na=False)
rma1.loc[type_missing & ~has_unit_number, 'Type'] = 'House'
rma1.loc[type_missing & has_unit_number, 'Type'] = 'Apartment'


In [16]:
# Fill null values in column "Carpark". If it's an apartment (slash in the address), assume no parking.
# If it's a house, assume 1 parking space.
# NOTE(review): the previous comment said houses get 2 spaces while the code assigned 1;
# the value the code actually used (1) is kept here - confirm which one is intended.

carpark_missing = rma1['Carpark'].isnull()
# na=False: a missing address counts as "no slash" rather than breaking the boolean mask.
has_unit_number = rma1['Address'].str.contains('/', regex=False, na=False)
rma1.loc[carpark_missing & ~has_unit_number, 'Carpark'] = 1
rma1.loc[carpark_missing & has_unit_number, 'Carpark'] = 0

In [17]:
rma1.isnull().sum()

Link         73
Address       0
Suburb        0
Beds          0
Baths        33
Carpark       0
Type          0
Sale Type     8
Price         0
dtype: int64

In [18]:
rma1.head()

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Sale Type,Price
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,Sold by Auction on 02 Nov 2021,"$1,950,000"
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,Sold by Private Sale on 01 Nov 2021,"$960,000"
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Sold by Private Sale on 01 Nov 2021,Price unavailable
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,Sold by Private Sale on 01 Nov 2021,"$740,000"


In [19]:
# Split sale type and date

# Remove the constant "Sold by " prefix. The source is rma1 (the cleaned frame being built up),
# not the raw `rma` frame, so the result no longer relies on index alignment between the two.
# regex=False: the prefix is plain text, not a pattern.
rma1['Sale Type'] = rma1['Sale Type'].str.replace('Sold by ', '', regex=False)

In [20]:
# "Auction on 02 Nov 2021" -> column 0 = sale method, column 1 = sale date.
# n=1 caps the result at two columns even if " on " occurred again in the text, and the
# strip removes stray whitespace left around the pieces, so "Private Sale " and
# "Private Sale" do not end up as two different sale methods.
saletype = (
    rma1['Sale Type']
    .str.split(' on ', n=1, expand=True)
    .apply(lambda col: col.str.strip())
)
for i in saletype.columns:
    rma1[i] = saletype[i]

In [21]:
# Drop the combined "Sale Type" column now that it has been split, and name the two pieces.
rma2 = rma1.drop(columns='Sale Type').rename(columns={0: 'By', 1: 'Date'})

# trim price: "$1,950,000" -> "1950000" (values stay strings; "Price unavailable" is left as is).
# regex=False makes "$" a literal character. As a regular expression "$" is the end-of-string
# anchor, and pandas' default for `regex` differs between major versions.
rma2['Price'] = (
    rma2['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)



In [22]:
sales_rma = rma2.copy()
sales_rma

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price,By,Date
0,https://www.ratemyagent.com.au/real-estate-age...,7A Australia Street,Camperdown NSW 2050,2.0,2.0,1.0,Townhouse,1950000,Auction,02 Nov 2021
1,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington NSW 2115,3.0,2.0,1.0,Townhouse,960000,Private Sale,01 Nov 2021
2,https://www.ratemyagent.com.au/real-estate-age...,39 William Street,Granville NSW 2142,1.0,1.0,1.0,Unit,Price unavailable,Private Sale,01 Nov 2021
3,https://www.ratemyagent.com.au/real-estate-age...,7/12 Union Street,West Ryde NSW 2114,2.0,1.0,1.0,Unit,Price unavailable,Private Sale,01 Nov 2021
4,https://www.ratemyagent.com.au/real-estate-age...,404/2A Cooks Avenue,Canterbury NSW 2193,2.0,2.0,2.0,Unit,740000,Private Sale,01 Nov 2021
...,...,...,...,...,...,...,...,...,...,...
28400,https://www.ratemyagent.com.au/real-estate-age...,301/7 Sevier Avenue,Rhodes NSW 2138,2.0,2.0,1.0,Apartment,Price unavailable,Auction,29 May 2021
28401,https://www.ratemyagent.com.au/real-estate-age...,36/132 Killeaton St,St Ives NSW 2075,2.0,2.0,1.0,Apartment,935000,Auction,29 May 2021
28402,https://www.ratemyagent.com.au/real-estate-age...,12/19 Selwyn St,Wollstonecraft NSW 2065,2.0,1.0,1.0,Apartment,Price unavailable,Auction,29 May 2021
28403,https://www.ratemyagent.com.au/real-estate-age...,5/62 Middle Head Road,Mosman NSW 2088,2.0,1.0,1.0,Apartment,Price unavailable,Auction,29 May 2021


## Prepare datasets before joining

Unify Street Names (Street -> St, etc.)

In [23]:
# Title-case and trim the two join keys in both datasets so they compare equal.
for frame in (sales_gov, sales_rma):
    for key in ('Address', 'Suburb'):
        frame[key] = frame[key].str.title().str.strip()


In [24]:
road_names = sales_rma.Address.str.split(' ', expand = True)


In [25]:
rd_names = road_names[2].unique()

# Distinct third address tokens shorter than three characters - candidates for
# street-type abbreviations that need unifying. The isinstance() check skips the
# missing-value marker for addresses with fewer than three tokens, whether it is
# None or NaN (len() would raise on NaN).
filtered = [n for n in rd_names if isinstance(n, str) and len(n) < 3]

filtered


['St',
 'Rd',
 'Pl',
 'Cl',
 'Ln',
 'Dr',
 'Sp',
 'La',
 '16',
 'Ct',
 'N',
 'On',
 '&',
 'De',
 'Av',
 'Of',
 'Vw']

In [26]:
road_names[road_names[2]=='Cl']



Unnamed: 0,0,1,2,3,4,5,6,7,8
237,6,Goodlet,Cl,,,,,,
599,2,William,Cl,,,,,,
1308,7,Roberts,Cl,,,,,,
1508,12/3,Packard,Cl,,,,,,
2317,39/3,Ramu,Cl,,,,,,
...,...,...,...,...,...,...,...,...,...
23765,37/2,Belair,Cl,,,,,,
25893,11/6,Fairway,Cl,,,,,,
26523,23/5,Belair,Cl,,,,,,
27429,124/2,Dolphin,Cl,,,,,,


In [27]:
rma2.loc[237]

Link       https://www.ratemyagent.com.au/real-estate-age...
Address                                         6 Goodlet Cl
Suburb                              Lane Cove North NSW 2066
Beds                                                     3.0
Baths                                                    2.0
Carpark                                                  2.0
Type                                               Townhouse
Price                                      Price unavailable
By                                             Private Sale 
Date                                             21 Oct 2021
Name: 237, dtype: object

In [28]:
# Replacement table applied, in order, to street addresses: [text to find, replacement].
# Entries that start with a space must also start their replacement with a space,
# otherwise the abbreviation is glued onto the street name ("Smith Plaza" -> "SmithPlz").
abbr = [
    # full street type -> abbreviation
    [' Street',' St'],
    [' Road',' Rd'],
    [' Avenue',' Ave'],
    [' Place',' Pl'],
    [' Close',' Cl'],
    [' Lane',' Ln'],
    [' Drive',' Dr'],
    [' Highway',' Hwy'],
    [' Parade',' Pde'],
    [' Square',' Sq'],
    [' Court',' Ct'],
    [' Glade',' Gld'],

    ['Parkway','Pkwy'],
    ['Boulevard','Blvd'],
    ['Circuit','Cct'],

    # case and punctuation variants of existing abbreviations
    [' st',' St'],
    [' st.',' St'],
    [' ST',' St'],
    [' RD',' Rd'],
    [' ave', ' Ave'],
    [' AVE', ' Ave'],
    [' road',' Rd'],
    [' ROAD',' Rd'],
    ['Crt','Ct'],
    ['Pde.','Pde'],

    [' Crescent',' Cres'],
    [' Glen',' Gln'],
    [' Plaza', ' Plz'],
    [' View', ' Vw'],

    # one-off fix for a specific street
    ['Mallard La', 'Mallard Ln']
]

In [29]:
# replace all street address names with abbreviation
# regex=False: every entry is literal text. Some entries contain "." (' st.', 'Pde.'), which
# would match any character if treated as a regular expression, and pandas' default for
# `regex` differs between major versions.
# NOTE(review): matching is by substring, so ' Glen' also rewrites names that merely start
# with "Glen". Both datasets are rewritten identically, so the join keys still agree.
for old, new in abbr:
    sales_gov.Address = sales_gov.Address.str.replace(old, new, regex=False)
    sales_rma.Address = sales_rma.Address.str.replace(old, new, regex=False)

In [30]:
sales_gov.sample(5)

Unnamed: 0,Address,Suburb,Price,Size,Date
8992,17/110 Bourke Rd,Alexandria Nsw 2015,1100000,0.0,16 July 2018
295078,17 Sabre Cl,Fletcher Nsw 2287,980000,1056.0,21 December 2018
428622,304/8 Village Pl,Kirrawee Nsw 2232,765000,0.0,6 August 2017
55123,307/41 Yattenden Cres,Baulkham Hills Nsw 2153,712500,0.0,18 October 2019
410590,8 Singleton Ave,Kellyville Ridge Nsw 2155,1055000,581.4,24 April 2019


Unifying Suburb Names:

In [31]:
# One row per distinct suburb string found in the RateMyAgent data.
rma_suburbs = pd.DataFrame({'Name': sales_rma.Suburb.unique()})


In [32]:
# One row per distinct suburb string found in the government data.
gov_suburbs = pd.DataFrame({'Name': sales_gov.Suburb.unique()})

In [33]:
rma_suburbs.Name.isin(gov_suburbs.Name).astype(int).unique()

array([1])

It seems every suburb in the RateMyAgent data also appears in the NSW government dataset, which means there are no mismatches in the suburb naming.

## Join Two Datasets

In [34]:
# Inner join on the two normalised keys; columns present in both frames get _x (RMA) / _y (gov) suffixes.
join_keys = ['Address', 'Suburb']
sales = sales_rma.merge(sales_gov, on=join_keys)
sales.shape

(21403, 13)

In [35]:
sales

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price_x,By,Date_x,Price_y,Size,Date_y
0,https://www.ratemyagent.com.au/real-estate-age...,9/46 Stewart St,Ermington Nsw 2115,3.0,2.0,1.0,Townhouse,960000,Private Sale,01 Nov 2021,960000,0.0,18 September 2021
1,https://www.ratemyagent.com.au/real-estate-age...,13/28A Henry St,Ashfield Nsw 2131,2.0,1.0,1.0,Unit,650000,Auction,01 Nov 2021,520000,0.0,2 November 2019
2,https://www.ratemyagent.com.au/real-estate-age...,4/29 Norfolk St,Blacktown Nsw 2148,2.0,2.0,1.0,Townhouse,670000,Private Sale,01 Nov 2021,630000,0.0,15 June 2018
3,https://www.ratemyagent.com.au/real-estate-age...,4/5 Thurston St,Penrith Nsw 2750,2.0,1.0,1.0,Unit,Price unavailable,Private Sale,01 Nov 2021,376000,0.0,15 July 2017
4,https://www.ratemyagent.com.au/real-estate-age...,2/43 Mackenzie St,Strathfield Nsw 2135,3.0,2.0,2.0,Townhouse,1677000,Auction,01 Nov 2021,1475000,0.0,30 May 2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21398,https://www.ratemyagent.com.au/real-estate-age...,36/132 Killeaton St,St Ives Nsw 2075,2.0,2.0,1.0,Apartment,935000,Auction,29 May 2021,935000,0.0,29 May 2021
21399,https://www.ratemyagent.com.au/real-estate-age...,12/19 Selwyn St,Wollstonecraft Nsw 2065,2.0,1.0,1.0,Apartment,Price unavailable,Auction,29 May 2021,1500000,0.0,29 May 2021
21400,https://www.ratemyagent.com.au/real-estate-age...,12/19 Selwyn St,Wollstonecraft Nsw 2065,2.0,1.0,1.0,Apartment,Price unavailable,Auction,29 May 2021,1180000,0.0,14 October 2017
21401,https://www.ratemyagent.com.au/real-estate-age...,5/62 Middle Head Rd,Mosman Nsw 2088,2.0,1.0,1.0,Apartment,Price unavailable,Auction,29 May 2021,1130000,0.0,29 May 2021


In [36]:
# Parse both date columns, then order the joined rows newest-first by the government sale date.
sales1 = sales.copy()
for date_col in ('Date_y', 'Date_x'):
    sales1[date_col] = pd.to_datetime(sales[date_col])

sales1 = sales1.sort_values(by='Date_y', ascending=False)

In [37]:
sales1.sample(10)

Unnamed: 0,Link,Address,Suburb,Beds,Baths,Carpark,Type,Price_x,By,Date_x,Price_y,Size,Date_y
17988,https://www.ratemyagent.com.au/real-estate-age...,303/36 Bertram St,Chatswood Nsw 2067,1.0,1.0,1.0,Apartment,1038000,Auction,2021-07-29,1038000,0.0,2021-07-08
20154,https://www.ratemyagent.com.au/real-estate-age...,3/96 Coogee Bay Rd,Coogee Nsw 2034,2.0,1.0,0.0,Apartment,975000,Private Sale,2021-06-22,820000,0.0,2019-10-10
6419,https://www.ratemyagent.com.au/real-estate-age...,11/28 Kent St,Epping Nsw 2121,3.0,2.0,2.0,Townhouse,Price unavailable,Auction,2020-11-25,1490000,0.0,2020-11-21
9257,https://www.ratemyagent.com.au/real-estate-age...,210A Polding St,Smithfield Nsw 2164,4.0,2.0,2.0,House,840000,Private Sale,2021-09-20,840000,556.4,2021-09-13
20020,https://www.ratemyagent.com.au/real-estate-age...,2/20 Military Rd,North Bondi Nsw 2026,2.0,2.0,1.0,Apartment,1820000,Auction,2021-06-23,1820000,0.0,2021-06-23
12988,https://www.ratemyagent.com.au/real-estate-age...,21 Joey Cres,Denham Court Nsw 2565,4.0,2.0,1.0,House,1100000,Private Sale,2021-08-11,400000,300.0,2018-04-05
21158,https://www.ratemyagent.com.au/real-estate-age...,9/202 Victoria Rd,Punchbowl Nsw 2196,2.0,1.0,1.0,Apartment,386000,Auction,2021-06-03,390000,0.0,2021-05-20
16142,https://www.ratemyagent.com.au/real-estate-age...,4/43 Bond St,Maroubra Nsw 2035,2.0,1.0,0.0,Apartment,990000,Private Sale,2021-09-06,990000,0.0,2021-09-06
6713,https://www.ratemyagent.com.au/real-estate-age...,305/581 Kingsway,Miranda Nsw 2228,2.0,2.0,1.0,Unit,765000,Private Sale,2020-11-13,765000,0.0,2020-10-30
13881,https://www.ratemyagent.com.au/real-estate-age...,107 Falcon Cct,Green Valley Nsw 2168,3.0,1.0,1.0,House,813500,Auction,2021-07-31,813500,482.6,2021-07-31


In [38]:
# Drop the Link column and put the columns in reading order: property details, then prices, then dates.
cols = [
    'Address', 'Suburb', 'Beds', 'Baths', 'Carpark', 'Type', 'Size', 'By',
    'Price_x', 'Price_y',
    'Date_x', 'Date_y',
]
sales2 = sales1.loc[:, cols]

In [39]:
# Replace the merge suffixes with the name of the dataset each column came from.
source_labels = {
    'Price_x': 'Price_rma',
    'Price_y': 'Price_gov',
    'Date_x': 'Date_rma',
    'Date_y': 'Date_gov',
}
sales2 = sales2.rename(columns=source_labels)
sales2.shape

(21403, 12)

In [40]:
# Rows sharing address, suburb and government price count as the same sale; keep the last one in the current order.
dedupe_keys = ['Address', 'Suburb', 'Price_gov']
sales2 = sales2.drop_duplicates(subset=dedupe_keys, keep='last')

In [41]:
sales2.head(10)

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
9423,5 Ryan St,Lilyfield Nsw 2040,5.0,3.0,1.0,House,219.7,Auction,2830000,2830000,2021-09-18,2021-11-17
11131,38 Nightingale Sq,Glossodia Nsw 2756,3.0,1.0,2.0,House,980.1,Private Sale,880000,880000,2021-08-31,2021-11-17
7566,20 Gatley Ct,Wattle Grove Nsw 2173,4.0,2.0,1.0,House,483.0,Private Sale,1130000,1130000,2021-10-14,2021-11-16
15210,2/17 Isabel Ave,Vaucluse Nsw 2030,3.0,2.0,2.0,Apartment,0.0,Auction,Price unavailable,3462000,2021-09-23,2021-11-10
9036,20 Prince St,Canley Heights Nsw 2166,3.0,1.0,1.0,House,442.6,Private Sale,850000,850000,2021-09-22,2021-11-08
8468,81 Coogee Bay Rd,Randwick Nsw 2031,3.0,2.0,1.0,House,297.2,Auction,Price unavailable,3080000,2021-09-29,2021-11-05
13966,913/187 Kent St,Millers Point Nsw 2000,1.0,1.0,0.0,Apartment,0.0,Private Sale,1100000,1100000,2021-11-01,2021-11-01
13085,629 Polding St,Bossley Park Nsw 2176,3.0,1.0,1.0,House,564.6,Auction,940000,940000,2021-08-10,2021-10-28
310,24/75 Broome St,Maroubra Nsw 2035,1.0,1.0,1.0,Unit,0.0,Private Sale,720000,720000,2021-10-07,2021-10-27
7092,7A South Creek Rd,Dee Why Nsw 2099,3.0,1.0,2.0,House,323.7,Private Sale,Price unavailable,1890000,2021-10-26,2021-10-26


## Showing the differences between the two datasets
1. The RMA dataset is newer; the Gov dataset ends 24 Oct, 2021.
2. The RMA dataset contains some sales that have not yet been submitted to the government. E.g., some of the sales from Sept 2021 are still not in the Gov dataset.
3. The RMA dataset is not complete: some of the sales that are in the Gov dataset are not in the RMA dataset. Those properties were probably sold privately or were sold on other platforms.

In [42]:
# Parse the RMA sale dates, then show the ten most recent West Ryde sales with the detail columns hidden.
sales_rma['Date'] = pd.to_datetime(sales_rma['Date'])

west_ryde_rma = sales_rma[sales_rma['Suburb'] == 'West Ryde Nsw 2114']
hidden = ['Link', 'Type', 'By', 'Beds', 'Baths', 'Carpark']
west_ryde_rma.sort_values('Date', ascending=False).iloc[0:10].drop(columns=hidden)

Unnamed: 0,Address,Suburb,Price,Date
3,7/12 Union St,West Ryde Nsw 2114,Price unavailable,2021-11-01
8672,23 Falconer St,West Ryde Nsw 2114,2580000,2021-10-30
9425,23 Reserve St,West Ryde Nsw 2114,2461000,2021-10-23
19217,7/15 Riverview St,West Ryde Nsw 2114,475000,2021-10-20
19588,17/20 Herbert St,West Ryde Nsw 2114,Price unavailable,2021-10-14
10770,54 Hermitage Rd,West Ryde Nsw 2114,Price unavailable,2021-10-13
10741,14 Moss St,West Ryde Nsw 2114,2270000,2021-10-13
19810,67/57 West Pde,West Ryde Nsw 2114,482000,2021-10-12
10935,15 Falconer St,West Ryde Nsw 2114,2880000,2021-10-12
689,2B Hermoyne St,West Ryde Nsw 2114,1560000,2021-10-06


In [43]:
sales_gov['Date']=pd.to_datetime(sales_gov['Date'])

In [44]:
sales_gov[(sales_gov['Suburb']=='West Ryde Nsw 2114')].sort_values('Date',ascending=False)[0:10].drop(['Size'],axis=1)

Unnamed: 0,Address,Suburb,Price,Date
867349,7/15 Riverview St,West Ryde Nsw 2114,475000,2021-10-08
867447,6/4 Union St,West Ryde Nsw 2114,580000,2021-10-05
867199,7/1A Macpherson St,West Ryde Nsw 2114,1400000,2021-10-02
867628,32/22 Herbert St,West Ryde Nsw 2114,760000,2021-10-01
866842,4/12 Adelaide St,West Ryde Nsw 2114,12200,2021-09-24
867397,4/84 Station St,West Ryde Nsw 2114,655000,2021-09-22
867542,67/61 West Pde,West Ryde Nsw 2114,482000,2021-09-21
867176,12 Lambert St,West Ryde Nsw 2114,1840000,2021-09-18
867024,9/2 Mulvihill St,West Ryde Nsw 2114,285000,2021-09-16
867543,30/61 West Pde,West Ryde Nsw 2114,660000,2021-09-15


In [45]:
sales2[sales2['Suburb']=='West Ryde Nsw 2114'].sort_values('Date_rma',ascending=False)[0:5]

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price_rma,Price_gov,Date_rma,Date_gov
7134,23 Reserve St,West Ryde Nsw 2114,5.0,4.0,2.0,House,809.4,Auction,2461000,2060000,2021-10-23,2017-12-09
14159,7/15 Riverview St,West Ryde Nsw 2114,1.0,1.0,1.0,Apartment,0.0,Private Sale,475000,475000,2021-10-20,2021-10-08
386,4/84 Station St,West Ryde Nsw 2114,2.0,1.0,1.0,Unit,0.0,Private Sale,655000,655000,2021-10-01,2021-09-22
15035,30/61 West Pde,West Ryde Nsw 2114,2.0,1.0,1.0,Apartment,0.0,Private Sale,660000,660000,2021-09-27,2021-09-15
9381,12 Lambert St,West Ryde Nsw 2114,3.0,1.0,2.0,House,742.6,Auction,1840000,1840000,2021-09-18,2021-09-18


In [46]:
# sales3 = sales2.drop_duplicates(subset=['Address','Suburb'],keep='first') #if we want to remove the flipped houses
sales3 = sales2.copy()
sales3.shape

(20937, 12)

## Fill "Price unavailable" if two Dates are within +-2 months

Some property prices are listed as "Price unavailable" in the RMA dataset but are disclosed in the government dataset, so we can use the price from the government dataset to fill the price column.

In [47]:
# For every row whose RateMyAgent price is "Price unavailable", borrow the government price
# when the RMA sale date lies strictly within two calendar months of the government sale date.
# Rows with a missing date on either side never match, because comparisons with NaT are False.
two_months = pd.DateOffset(months=2)

price_missing = sales3['Price_rma'] == 'Price unavailable'
dates_agree = (
    (sales3['Date_rma'] > sales3['Date_gov'] - two_months)
    & (sales3['Date_rma'] < sales3['Date_gov'] + two_months)
)
fill = price_missing & dates_agree

# Price_rma holds its prices as strings, so cast the borrowed government prices to str too;
# otherwise the column ends up with a mix of str and int values.
# NOTE(review): assumes Price_gov is an integer column (as displayed above) - confirm.
sales3.loc[fill, 'Price_rma'] = sales3.loc[fill, 'Price_gov'].astype(str)




In [48]:
# Keep only the rows that ended up with a usable price.
has_price = sales3['Price_rma'] != "Price unavailable"
sales4 = sales3[has_price]

In [49]:
# Final table: RMA price and date only, newest sale first, rows without a date removed.
recent_sales = (
    sales4
    .drop(columns=['Price_gov', 'Date_gov'])
    .rename(columns={'Price_rma': 'Price', 'Date_rma': 'Date'})
    .sort_values('Date', ascending=False)
    .dropna(subset=['Date'])
)


In [50]:
recent_sales

Unnamed: 0,Address,Suburb,Beds,Baths,Carpark,Type,Size,By,Price,Date
1,13/28A Henry St,Ashfield Nsw 2131,2.0,1.0,1.0,Unit,0.0,Auction,650000,2021-11-01
5,1201/6 Charles St,Parramatta Nsw 2150,2.0,2.0,1.0,Unit,0.0,Private Sale,605000,2021-11-01
6960,4 Yvette St,Baulkham Hills Nsw 2153,4.0,2.0,1.0,House,695.6,Private Sale,1625000,2021-11-01
0,9/46 Stewart St,Ermington Nsw 2115,3.0,2.0,1.0,Townhouse,0.0,Private Sale,960000,2021-11-01
13963,42/585 Canterbury Rd,Belmore Nsw 2192,2.0,2.0,1.0,Apartment,0.0,Private Sale,602000,2021-11-01
...,...,...,...,...,...,...,...,...,...,...
6952,6/3 Pine St,Rozelle Nsw 2039,3.0,1.0,1.0,Townhouse,0.0,Auction,1285000,2020-11-03
6944,3/240 Katoomba St,Katoomba Nsw 2780,3.0,1.0,1.0,Townhouse,0.0,Private Sale,565000,2020-11-03
6931,43/344 West Botany St,Brighton-Le-Sands Nsw 2216,4.0,3.0,2.0,Townhouse,0.0,Private Sale,1420000,2020-11-03
6933,6/19 Myall Rd,Casula Nsw 2170,3.0,2.0,1.0,Townhouse,0.0,Private Sale,612000,2020-11-03


In [51]:
%store recent_sales

Stored 'recent_sales' (DataFrame)


In [52]:
recent_sales.isnull().sum()

Address     0
Suburb      0
Beds        0
Baths      16
Carpark     0
Type        0
Size        2
By          0
Price       0
Date        0
dtype: int64