In [1]:
import os
import json
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from tqdm import tqdm, trange
from shapely.geometry import shape, Point

In [2]:
cwd = os.getcwd()
folder = os.path.join(cwd, '../final_clean_version_2') # folder to store sales data from 520 suburbs
files = os.listdir(folder)
len(files)

520

In [3]:
# inspect the first dataset

pd.read_csv(os.path.join(folder, files[0])).head(1).transpose()

Unnamed: 0,0
PropID,Abbotsford_VIC_3067_0
Street,7/29 Church Street
PropStreet,7/29 Church Street
Suburb,Abbotsford
PropSuburb,Abbotsford
Region,VIC
PostalCode,3067
Latitude,-37.809156
Longitude,145.000665
Type,APARTMENT


# Merge into one single dataset

In [4]:
# keep only data from 2013-2017 for SOLD HOUSE that have prices recorded; merge into one single dataset

b4, after = [], []
for i, file in enumerate(tqdm(files)):
    
    if i==0:
        data_all = pd.read_csv(os.path.join(folder, file))
        # print(f"b4: {len(data_all)}")
        b4.append(len(data_all))
        data_all = data_all.dropna(subset=['Price']).loc[
            (data_all['SoldOrRent'] == 'SOLD') & (data_all['Type'] == 'HOUSE')
        ]
        after.append(len(data_all))
        # print(f"after: {len(data_all)}")
    else:
        newdata = pd.read_csv(os.path.join(folder, file))
        # print(f"b4: {len(newdata)}")
        b4.append(len(newdata))
        newdata = newdata.dropna(subset=['Price']).loc[
            (newdata['SoldOrRent'] == 'SOLD') & (newdata['Type'] == 'HOUSE')
        ]
        # print(f"after: {len(newdata)}")
        after.append(len(newdata))
        data_all = pd.concat([data_all, newdata])

100%|██████████| 520/520 [00:42<00:00, 12.21it/s]


In [5]:
data_all.shape

(1255109, 24)

# First clean

1. remove redundant columns
2. remove instances having LandSize and BuildingArea both missing
3. remove instances having Bedrms, Bathrms, Cars all missing
4. remove price that is below 100,000

In [6]:
data_clean = data_all.drop(
    columns=['Street', 'PropStreet', 'PropSuburb', 'Region', 'Type', 'SoldOrRent', 'Source']
).dropna(
    subset=['LandSize', 'BuildingArea'], how='all'
).dropna(
    subset=['Bedrms', 'Bathrms', 'Cars'], how='all'
)

data_clean.shape

(1183760, 17)

In [7]:
# this is raw data after simple cleaning, so it may contain some useless columns

# data_clean.reset_index().drop(columns='index').to_csv(os.path.join(cwd, 'data\\ver2_house_sold.csv'), index=False)

  values = values.astype(str)


# Add Quantile and index

In [5]:
# get those features of interest
data_clean = pd.read_csv(os.path.join(os.getcwd(), 'data\\ver2_house_sold.csv'))

In [6]:
data_clean.shape

(1183760, 17)

In [13]:
years = data_clean['Year'].unique()
years.sort()
years

array([1954., 1955., 1959., 1963., 1965., 1966., 1970., 1974., 1975.,
       1976., 1977., 1978., 1979., 1980., 1981., 1982., 1983., 1984.,
       1985., 1986., 1987., 1988., 1989., 1990., 1991., 1992., 1993.,
       1994., 1995., 1996., 1997., 1998., 1999., 2000., 2001., 2002.,
       2003., 2004., 2005., 2006., 2007., 2008., 2009., 2010., 2011.,
       2012., 2013., 2014., 2015., 2016., 2017., 2018.,   nan])

In [24]:
data = pd.DataFrame()
for year in tqdm(years[:-1]):
    temp_data = data_clean[data_clean['Year']==year]
    for month in np.linspace(1.0,12.0,12):
        work_data = temp_data[temp_data['Month']==month]
        if work_data.shape[0] > 0:
            work_data[['Q1','Q2','Q3']] = np.quantile(work_data['Price'], [0.25,0.5,0.75])
            data = pd.concat([data,work_data], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  work_data[['Q1','Q2','Q3']] = np.quantile(work_data['Price'], [0.25,0.5,0.75])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  work_data[['Q1','Q2','Q3']] = np.quantile(work_data['Price'], [0.25,0.5,0.75])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  work_data[['Q1','Q2','Q3']] = np.quantile(work_

In [51]:
data['index'] = data.apply(lambda row:  0.0 if row['Price']<row['Q1'] else 
                        (1.0 if row['Price']<row['Q2'] else 
                        (2.0 if row['Price']<row['Q2'] else 3.0)), axis=1)

In [53]:
data['vQ1'] = data.apply(lambda row: row['Price']/row['Q1'], axis=1)
data['vQ2'] = data.apply(lambda row: row['Price']/row['Q2'], axis=1)
data['vQ3'] = data.apply(lambda row: row['Price']/row['Q3'], axis=1)
data

Unnamed: 0,PropID,Suburb,PostalCode,Latitude,Longitude,Bedrms,Bathrms,Cars,LandSize,BuildingArea,...,Year,AgentName,AgentAddr,Q1,Q2,Q3,index,vQ1,vQ2,vQ3
0,Clayton_South_VIC_3169_245,Clayton South,3169.0,-37.928675,145.113388,2.0,1.0,2.0,683.0,,...,1954.0,-,,3960.0,3960.0,3960.0,3.0,1.000000,1.00000,1.000000
1,Niddrie_VIC_3042_691,Niddrie,3042.0,-37.742809,144.884995,4.0,3.0,1.0,650.0,203.0,...,1955.0,-,,4556.0,4556.0,4556.0,3.0,1.000000,1.00000,1.000000
2,Laverton_VIC_3028_884,Laverton,3028.0,-37.862454,144.774447,3.0,1.0,2.0,700.0,121.0,...,1959.0,-,,7502.0,7502.0,7502.0,3.0,1.000000,1.00000,1.000000
3,Reservoir_VIC_3073.0_5154,Reservoir,3073.0,-37.717350,145.019813,2.0,1.0,1.0,663.0,,...,1963.0,-,,7714.0,7714.0,7714.0,3.0,1.000000,1.00000,1.000000
4,Heidelberg_West_VIC_3081_270,Heidelberg West,3081.0,-37.741441,145.043105,3.0,1.0,1.0,542.0,,...,1965.0,-,,9368.0,9368.0,9368.0,3.0,1.000000,1.00000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183590,Pascoe_Vale_VIC_3044_1710,Pascoe Vale,3044.0,-37.726379,144.922393,4.0,2.0,2.0,556.0,139.0,...,2018.0,Oak Park Real Estate - Oak Park,,634000.0,800000.0,972500.0,3.0,1.522082,1.20625,0.992288
1183591,Rowville_VIC_3178_1331,Rowville,3178.0,-37.930205,145.242418,4.0,2.0,2.0,664.0,220.0,...,2018.0,Noel Jones - Wantirna,,634000.0,800000.0,972500.0,3.0,1.466877,1.16250,0.956298
1183592,Rye_VIC_3941_3694,Rye,3941.0,-38.378650,144.785992,3.0,2.0,1.0,862.0,124.0,...,2018.0,Eview Group - Southern Peninsula,,634000.0,800000.0,972500.0,1.0,1.048896,0.83125,0.683805
1183593,Sunbury_VIC_3429_5116,Sunbury,3429.0,-37.591411,144.696535,3.0,2.0,1.0,330.0,107.0,...,2018.0,Raine & Horne - Sunbury,,634000.0,800000.0,972500.0,0.0,0.670347,0.53125,0.437018


In [58]:
data

Unnamed: 0,PropID,Suburb,PostalCode,Latitude,Longitude,Bedrms,Bathrms,Cars,LandSize,BuildingArea,...,AgentName,AgentAddr,Q1,Q2,Q3,index,vQ1,vQ2,vQ3,Hindex
0,Clayton_South_VIC_3169_245,Clayton South,3169.0,-37.928675,145.113388,2.0,1.0,2.0,683.0,,...,-,,3960.0,3960.0,3960.0,3.0,1.000000,1.00000,1.000000,3.0
1,Niddrie_VIC_3042_691,Niddrie,3042.0,-37.742809,144.884995,4.0,3.0,1.0,650.0,203.0,...,-,,4556.0,4556.0,4556.0,3.0,1.000000,1.00000,1.000000,3.0
2,Laverton_VIC_3028_884,Laverton,3028.0,-37.862454,144.774447,3.0,1.0,2.0,700.0,121.0,...,-,,7502.0,7502.0,7502.0,3.0,1.000000,1.00000,1.000000,3.0
3,Reservoir_VIC_3073.0_5154,Reservoir,3073.0,-37.717350,145.019813,2.0,1.0,1.0,663.0,,...,-,,7714.0,7714.0,7714.0,3.0,1.000000,1.00000,1.000000,3.0
4,Heidelberg_West_VIC_3081_270,Heidelberg West,3081.0,-37.741441,145.043105,3.0,1.0,1.0,542.0,,...,-,,9368.0,9368.0,9368.0,3.0,1.000000,1.00000,1.000000,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183590,Pascoe_Vale_VIC_3044_1710,Pascoe Vale,3044.0,-37.726379,144.922393,4.0,2.0,2.0,556.0,139.0,...,Oak Park Real Estate - Oak Park,,634000.0,800000.0,972500.0,3.0,1.522082,1.20625,0.992288,3.0
1183591,Rowville_VIC_3178_1331,Rowville,3178.0,-37.930205,145.242418,4.0,2.0,2.0,664.0,220.0,...,Noel Jones - Wantirna,,634000.0,800000.0,972500.0,3.0,1.466877,1.16250,0.956298,3.0
1183592,Rye_VIC_3941_3694,Rye,3941.0,-38.378650,144.785992,3.0,2.0,1.0,862.0,124.0,...,Eview Group - Southern Peninsula,,634000.0,800000.0,972500.0,1.0,1.048896,0.83125,0.683805,1.0
1183593,Sunbury_VIC_3429_5116,Sunbury,3429.0,-37.591411,144.696535,3.0,2.0,1.0,330.0,107.0,...,Raine & Horne - Sunbury,,634000.0,800000.0,972500.0,0.0,0.670347,0.53125,0.437018,0.0


In [60]:
data.reset_index().drop(columns='index').to_csv(os.path.join(os.getcwd(), 'data\\ver2_index.csv'), index=False)

KeyError: "['index'] not found in axis"

In [9]:
# num of entries with sold date
data.dropna(subset=['Day']).shape

(356820, 12)

In [17]:
# Houses sold more than once
data['PropID'].value_counts().count()-data['PropID'].value_counts().value_counts()[1]

311475