In [23]:
import math
import pandas as pd
import geopandas as gpd
import folium
import seaborn as sns
import matplotlib.pyplot as plt
import shapely

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

### 1. Load data

In [112]:
# shapefile
sf = gpd.read_file('../data/landing/SA2_2021_AUST_GDA2020.shp')
sf = sf[sf['STE_NAME21'] == 'Victoria']
sf = sf[['SA2_NAME21', 'AREASQKM21', 'geometry']]
sf = sf.iloc[:-2, :]
sf.columns = ['sa2', 'area', 'geometry']
sf['sa2'] = sf['sa2'].apply(lambda x: x.lower())
sf = sf.sort_values(by='sa2').reset_index(drop=True)

In [106]:
# properties
properties = pd.read_csv('../data/curated/curated_domain_properties.csv')
properties.rename(columns={'property_id': 'id', 'price_per_week': 'price'}, inplace=True) 
distance_duration = pd.read_csv('../data/spatial-data/distance_duration.csv')

properties.shape, distance_duration.shape

((8981, 12), (9077, 18))

In [26]:
# spatial data
infrastructure = pd.read_csv('../data/spatial-data/infrastructure.csv')
school = pd.read_csv('../data/curated/curated_school_location.csv')
mapper_matrix = pd.read_csv('../data/spatial-data/mapper_matrix.csv').rename(columns={'Unnamed: 0': 'sa2'})

infrastructure.shape, school.shape, mapper_matrix.shape

((8047, 4), (2301, 6), (522, 3316))

In [27]:
# other feature
crime = pd.read_csv('../data/curated/curated_crime.csv')
crime['count'] = crime['offence_A'] + crime['offence_B'] + crime['offence_C']\
                        + crime['offence_D'] + crime['offence_E'] + crime['offence_F']
                        
income = pd.read_csv('../data/curated/curated_income.csv')
population = pd.read_csv('../data/curated/curated_population.csv')

crime.shape, income.shape, population.shape

((25650, 10), (456, 10), (522, 10))

In [28]:
# past rent
flat_1_bed = pd.read_csv('../data/curated/annual_rent/1_bedroom_flat_annual_rent.csv')
flat_2_bed = pd.read_csv('../data/curated/annual_rent/2_bedroom_flat_annual_rent.csv')
flat_3_bed = pd.read_csv('../data/curated/annual_rent/3_bedroom_flat_annual_rent.csv')
house_2_bed = pd.read_csv('../data/curated/annual_rent/2_bedroom_house_annual_rent.csv')
house_3_bed = pd.read_csv('../data/curated/annual_rent/3_bedroom_house_annual_rent.csv')
house_4_bed = pd.read_csv('../data/curated/annual_rent/4_bedroom_house_annual_rent.csv')
average_rent = pd.read_csv('../data/curated/annual_rent/all_properties_annual_rent.csv')

all_same_region = set(flat_1_bed['suburb']) == set(flat_2_bed['suburb']) == set(flat_3_bed['suburb']) \
                                == set(house_2_bed['suburb']) == set(house_3_bed['suburb'])\
                                == set(house_4_bed['suburb']) == set(average_rent['suburb'])

flat_1_bed.shape, flat_2_bed.shape, flat_3_bed.shape, house_2_bed.shape, \
house_3_bed.shape, house_4_bed.shape, average_rent.shape, all_same_region


((145, 10),
 (145, 10),
 (145, 10),
 (145, 10),
 (145, 10),
 (145, 10),
 (145, 10),
 True)

In [113]:
# check dataframe here
flat_1_bed

Unnamed: 0,suburb,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,armadale,310.000000,315.000000,345.750000,377.00,360.00,360.750000,336.250000,330.000000,360.000000
1,carlton north,322.500000,340.000000,342.500000,349.50,367.50,355.750000,317.500000,317.500000,373.000000
2,carlton-parkville,329.250000,333.250000,358.250000,382.50,381.50,351.500000,273.750000,284.250000,352.000000
3,cbd-st kilda rd,380.000000,382.250000,403.500000,421.25,435.50,406.250000,312.500000,353.750000,430.000000
4,collingwood-abbotsford,366.250000,377.500000,388.750000,402.50,412.50,405.000000,357.500000,378.250000,420.000000
...,...,...,...,...,...,...,...,...,...,...
140,traralgon,164.500000,171.250000,163.750000,173.75,186.25,195.000000,209.750000,243.750000,260.000000
141,wanagaratta,148.750000,157.000000,172.250000,180.75,193.75,193.750000,197.000000,218.250000,220.000000
142,warragul,175.564957,180.487338,185.547729,190.75,190.75,196.098129,201.596206,207.248434,213.059136
143,warrnambool,197.500000,193.250000,188.750000,199.00,212.75,217.500000,242.500000,241.250000,250.000000


### 2. Checking data region

In [30]:
ALL_REGION = set(average_rent['suburb'].to_list()
                            +crime['suburb/town_name'].to_list()
                            +income['sa2 name'].to_list()
                            +population['sa2 name'].to_list()
                            +school['suburb'].to_list())
SA2_REGION = set(sf['sa2'].to_list())
NON_SA2_REGION = ALL_REGION - SA2_REGION

ALL_REGION = sorted(ALL_REGION)
SA2_REGION = sorted(SA2_REGION)
NON_SA2_REGION = sorted(NON_SA2_REGION)

In [31]:
def mapper_weird_region(regions, name):
    """This function is intended to check if a region is not included in mapper_matrix

    Args:
        regions (array): the array containing the region to check for
        name (str): name of the data variable

    Returns:
        None | set: return the set of unseen region if any
    """    
    weird_region = set()
    regions = set(regions)
    mapper_sa2, mapper_non_sa2 = mapper_matrix['sa2'], list(mapper_matrix.columns)[1:]
    
    for r in regions:
        if not any([r in p for p in mapper_non_sa2]) and not any([r in p for p in mapper_sa2]):
            weird_region.add(r)
    
    if len(weird_region) == 0:
        print(f'{name} check ✔')
    else:
        return weird_region

In [32]:
mapper_weird_region(crime['suburb/town_name'], 'Crime')
mapper_weird_region(average_rent['suburb'], 'Rent')
mapper_weird_region(income['sa2 name'], 'Income')
mapper_weird_region(population['sa2 name'], 'Population')
mapper_weird_region(school['suburb'], 'School')

Crime check ✔
Rent check ✔
Income check ✔
Population check ✔
School check ✔


#### 3. Aggregate region into SA2

In [109]:
file stop here

This section calculate the data for each SA2 district using the weight from the mapper matrix and the data from the population, income, ...
<br>`count: sum of offence`

<img src="../markdown-img/Region_to_SA2_demonstration.png" alt="Region to SA2 demonstration">

In [35]:
def to_sa2(data, region_col, col_list):
    """map a dataframe region into SA2 district

    Args:
        data (DataFrame): the data
        region_col (str): column name of the region column in data
        col_list (array): list of column from data to be transformed

    Returns:
        DataFrame: the mapped DataFrame
    """    
    sa2_data = {col: [] for col in col_list}
    sa2_data['sa2'] = SA2_REGION
    data_region = list(data[region_col].unique())

    for col in col_list:
        i = 1
        for sa2 in SA2_REGION:
            # progress check
            print(f'{col}: {i}/{len(SA2_REGION)}')
            i += 1

            # Mr -> row of mapper matrix (weight matrix) | r ϵ  SA2_REGION
            # Di -> row of data | i ϵ ALL_REGION
            # X -> matrix of data in sa2 region
            # Xr = sum(Di . Mr) / sum(Mr) | r ϵ SA2_REGION
            value_list, weight_list = [], []
            for region in data_region:
                    value = data[data[region_col] == region][col].values[0]
                    if region in mapper_matrix.columns:
                        weight = mapper_matrix[mapper_matrix['sa2'] == sa2][region].values[0]
                    else:
                        print(region)
                        weight = 0.005
                        
                    value_list.append(value)
                    weight_list.append(weight)


            # aggreagte by weighted sum
            if len(value_list) > 0:
                approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)
            else:
                approximate = None
            sa2_data[col].append(approximate)

    return sa2_data

 - Crime

In [36]:
crime2023 = to_sa2(crime[crime['year_ending'] == '03/2023'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [57]:
pd.DataFrame(crime2023).to_csv('../data/mapped/crime2023.csv', index=None)

In [39]:
crime2022 = to_sa2(crime[crime['year_ending'] == '03/2022'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [58]:
pd.DataFrame(crime2022).to_csv('../data/mapped/crime2022.csv', index=None)

In [40]:
crime2021 = to_sa2(crime[crime['year_ending'] == '03/2021'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [59]:
pd.DataFrame(crime2021).to_csv('../data/mapped/crime2021.csv', index=None)

In [41]:
crime2020 = to_sa2(crime[crime['year_ending'] == '03/2020'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [60]:
pd.DataFrame(crime2020).to_csv('../data/mapped/crime2020.csv', index=None)

In [42]:
crime2019 = to_sa2(crime[crime['year_ending'] == '03/2019'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [61]:
pd.DataFrame(crime2019).to_csv('../data/mapped/crime2019.csv', index=None)

In [43]:
crime2018 = to_sa2(crime[crime['year_ending'] == '03/2018'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [62]:
pd.DataFrame(crime2018).to_csv('../data/mapped/crime2018.csv', index=None)

In [44]:
crime2017 = to_sa2(crime[crime['year_ending'] == '03/2018'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [63]:
pd.DataFrame(crime2017).to_csv('../data/mapped/crime2017.csv', index=None)

In [45]:
crime2016 = to_sa2(crime[crime['year_ending'] == '03/2016'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [64]:
pd.DataFrame(crime2016).to_csv('../data/mapped/crime2016.csv', index=None)

In [46]:
crime2015 = to_sa2(crime[crime['year_ending'] == '03/2015'], 'suburb/town_name', ['count'])

count: 1/522
count: 2/522
count: 3/522
count: 4/522
count: 5/522
count: 6/522
count: 7/522
count: 8/522
count: 9/522
count: 10/522
count: 11/522
count: 12/522
count: 13/522
count: 14/522
count: 15/522
count: 16/522
count: 17/522
count: 18/522
count: 19/522
count: 20/522
count: 21/522
count: 22/522
count: 23/522
count: 24/522
count: 25/522
count: 26/522
count: 27/522
count: 28/522
count: 29/522
count: 30/522
count: 31/522
count: 32/522
count: 33/522
count: 34/522
count: 35/522
count: 36/522
count: 37/522
count: 38/522
count: 39/522
count: 40/522
count: 41/522
count: 42/522
count: 43/522
count: 44/522
count: 45/522
count: 46/522
count: 47/522
count: 48/522
count: 49/522
count: 50/522
count: 51/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


count: 52/522
count: 53/522
count: 54/522
count: 55/522
count: 56/522
count: 57/522
count: 58/522
count: 59/522
count: 60/522
count: 61/522
count: 62/522
count: 63/522
count: 64/522
count: 65/522
count: 66/522
count: 67/522
count: 68/522
count: 69/522
count: 70/522
count: 71/522
count: 72/522
count: 73/522
count: 74/522
count: 75/522
count: 76/522
count: 77/522
count: 78/522
count: 79/522
count: 80/522
count: 81/522
count: 82/522
count: 83/522
count: 84/522
count: 85/522
count: 86/522
count: 87/522
count: 88/522
count: 89/522
count: 90/522
count: 91/522
count: 92/522
count: 93/522
count: 94/522
count: 95/522
count: 96/522
count: 97/522
count: 98/522
count: 99/522
count: 100/522
count: 101/522
count: 102/522
count: 103/522
count: 104/522
count: 105/522
count: 106/522
count: 107/522
count: 108/522
count: 109/522
count: 110/522
count: 111/522
count: 112/522
count: 113/522
count: 114/522
count: 115/522
count: 116/522
count: 117/522
count: 118/522
count: 119/522
count: 120/522
count: 121/52

In [65]:
pd.DataFrame(crime2015).to_csv('../data/mapped/crime2015.csv', index=None)

 - Rent

In [47]:
rent_f1 = to_sa2(flat_1_bed, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015:

In [66]:
pd.DataFrame(rent_f1).to_csv('../data/mapped/rent_f1.csv', index=None)

In [48]:
rent_f2 = to_sa2(flat_2_bed, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015:

In [69]:
pd.DataFrame(rent_f2).to_csv('../data/mapped/rent_f2.csv', index=None)

In [70]:
rent_f3 = to_sa2(flat_3_bed, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015:

In [71]:
pd.DataFrame(rent_f3).to_csv('../data/mapped/rent_f3.csv', index=None)

In [50]:
rent_h2 = to_sa2(house_2_bed, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015:

In [72]:
pd.DataFrame(rent_h2).to_csv('../data/mapped/rent_h2.csv', index=None)

In [51]:
rent_h3 = to_sa2(house_3_bed, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015:

In [73]:
pd.DataFrame(rent_h3).to_csv('../data/mapped/rent_h3.csv', index=None)

In [52]:
rent_h4 = to_sa2(house_4_bed, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015:

In [74]:
pd.DataFrame(rent_h4).to_csv('../data/mapped/rent_h4.csv', index=None)

In [78]:
rent_av = to_sa2(average_rent, 'suburb', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 3/522
2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 

In [79]:
pd.DataFrame(rent_av).to_csv('../data/mapped/rent_av.csv', index=None)

 - Income

In [54]:
mapped_income = to_sa2(income, 'sa2 name', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522
2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522


  approximate = sum([v * w for v, w in zip(value_list, weight_list)]) / sum(weight_list)


2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78/522
2015: 79/522
2015: 80/522
2015: 81/522
2015: 82/522
2015: 83/522
2015: 84/522
2

In [76]:
pd.DataFrame(mapped_income).to_csv('../data/mapped/income.csv', index=None)

 - Population

In [55]:
mapped_population = to_sa2(population, 'sa2 name', ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'])

2015: 1/522
2015: 2/522
2015: 3/522
2015: 4/522
2015: 5/522
2015: 6/522
2015: 7/522
2015: 8/522
2015: 9/522
2015: 10/522
2015: 11/522
2015: 12/522
2015: 13/522
2015: 14/522
2015: 15/522
2015: 16/522
2015: 17/522
2015: 18/522
2015: 19/522
2015: 20/522
2015: 21/522
2015: 22/522
2015: 23/522
2015: 24/522
2015: 25/522
2015: 26/522
2015: 27/522
2015: 28/522
2015: 29/522
2015: 30/522
2015: 31/522
2015: 32/522
2015: 33/522
2015: 34/522
2015: 35/522
2015: 36/522
2015: 37/522
2015: 38/522
2015: 39/522
2015: 40/522
2015: 41/522
2015: 42/522
2015: 43/522
2015: 44/522
2015: 45/522
2015: 46/522
2015: 47/522
2015: 48/522
2015: 49/522
2015: 50/522
2015: 51/522
2015: 52/522
2015: 53/522
2015: 54/522
2015: 55/522
2015: 56/522
2015: 57/522
2015: 58/522
2015: 59/522
2015: 60/522
2015: 61/522
2015: 62/522
2015: 63/522
2015: 64/522
2015: 65/522
2015: 66/522
2015: 67/522
2015: 68/522
2015: 69/522
2015: 70/522
2015: 71/522
2015: 72/522
2015: 73/522
2015: 74/522
2015: 75/522
2015: 76/522
2015: 77/522
2015: 78

In [77]:
pd.DataFrame(mapped_population).to_csv('../data/mapped/population.csv', index=None)

### 4. Fill SA2 missing data

##### The original data might not cover all SA2 district. This section fill in the missing data of those SA2

In [116]:
# read in the mapped data
crime2015 = pd.read_csv('../data/mapped/crime2015.csv')
crime2016 = pd.read_csv('../data/mapped/crime2016.csv')
crime2017 = pd.read_csv('../data/mapped/crime2017.csv')
crime2018 = pd.read_csv('../data/mapped/crime2018.csv')
crime2019 = pd.read_csv('../data/mapped/crime2019.csv')
crime2020 = pd.read_csv('../data/mapped/crime2020.csv')
crime2021 = pd.read_csv('../data/mapped/crime2021.csv')
crime2022 = pd.read_csv('../data/mapped/crime2022.csv')
crime2023 = pd.read_csv('../data/mapped/crime2023.csv')

income = pd.read_csv('../data/mapped/income.csv')
population = pd.read_csv('../data/mapped/population.csv')

rent_f1 = pd.read_csv('../data/mapped/rent_f1.csv')
rent_f2 = pd.read_csv('../data/mapped/rent_f2.csv')
rent_f3 = pd.read_csv('../data/mapped/rent_f3.csv')
rent_h2 = pd.read_csv('../data/mapped/rent_h2.csv')
rent_h3 = pd.read_csv('../data/mapped/rent_h3.csv')
rent_h4 = pd.read_csv('../data/mapped/rent_h4.csv')
rent_av = pd.read_csv('../data/mapped/rent_av.csv')

In [117]:
# get centroid
gdf = sf.to_crs(epsg=2263)
gdf['centroid'] = gdf.centroid
gdf = gdf.to_crs(epsg=7844)
gdf["centroid"] = gdf["centroid"].to_crs(epsg=7844)
gdf = gdf.sort_values(by='sa2').reset_index(drop=True)
gdf

Unnamed: 0,sa2,area,geometry,centroid
0,abbotsford,1.7405,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8...",POINT (144.99977 -37.80459)
1,airport west,3.7194,"POLYGON ((144.86706 -37.72471, 144.86798 -37.7...",POINT (144.88130 -37.72381)
2,albert park,4.6747,"POLYGON ((144.96767 -37.83737, 144.96789 -37.8...",POINT (144.96405 -37.84562)
3,alexandra,2118.9554,"POLYGON ((145.59015 -37.22477, 145.58638 -37.2...",POINT (145.79514 -37.32004)
4,alfredton,52.7109,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5...",POINT (143.74934 -37.54174)
...,...,...,...,...
517,yarram,1931.8628,"MULTIPOLYGON (((146.62190 -38.75021, 146.62185...",POINT (146.75104 -38.49471)
518,yarraville,5.6200,"POLYGON ((144.85915 -37.81764, 144.85984 -37.8...",POINT (144.88198 -37.81796)
519,yarrawonga,94.6968,"POLYGON ((146.00051 -36.00877, 146.00128 -36.0...",POINT (146.00270 -36.03763)
520,yarriambiack,7139.5816,"POLYGON ((142.23900 -35.99787, 142.23898 -35.9...",POINT (142.42469 -35.99791)


In [118]:
R = 6378

def dist_in_km(point1, point2):
    """This function calculate the distance of two point in coordinate and return the distance in km.
        This is used instead of geopandas and shaply because we were testing some stuff
        
    Args:
        lat1 (float): latitude of point 1
        lon1 (float): longitude of point 1
        lat2 (float): latitude of point 2
        lon2 (float): longitude of point 2

    Returns:
        float: distance in km
    """   
    # Convert latitude and longitude from degrees to radians
    lat1 = math.radians(point1.y)
    lon1 = math.radians(point1.x)
    lat2 = math.radians(point2.y)
    lon2 = math.radians(point2.x)
    
    # distance
    a = math.sin((lat2 - lat1) / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    
    return distance

In [119]:
def fill_closest(col_name, data, n=3):
    """Fill in region with nan with data from the n closest region that isn't nan
        
    Args:
        col_name (str): the column name to check for missing data
        data (DataFrame): the dataframe containning the all the region to be considered from
        n (int, optional): the number of closet region to take average from. Defaults to 3.

    Returns:
        DataFrame: the nan filled input DataFrame
    """    
    non_nan_region = gdf[data[col_name].notna()]
    nan_region = gdf[data[col_name].isna()]
    filled_data = []
    
    for i in range(len(nan_region)):
        r = nan_region.iloc[i]
        # closet n region
        non_nan_region['dist'] = non_nan_region.apply(lambda x: dist_in_km(r['centroid'], x['centroid']), 
                                                                                    axis=1)
        top_n = non_nan_region.sort_values(by='dist')\
                                               .iloc[:n, 0]\
                                               .to_list()
        # fill
        top_n_data = data.loc[data['sa2'].isin(top_n)].mean()    
        filled_data.append(top_n_data) 
    
    result = pd.DataFrame(filled_data)
    nan_region = nan_region.reset_index(drop=True)
    result['sa2'] = nan_region['sa2']
    
    non_nan_data = data[data[col_name].notna()]
    return pd.concat([non_nan_data, result]).sort_values(by='sa2').reset_index(drop=True)

##### Fill in data

In [129]:
crime2015_filled = fill_closest('count', crime2015)
crime2016_filled = fill_closest('count', crime2016)
crime2017_filled = fill_closest('count', crime2017)
crime2018_filled = fill_closest('count', crime2018)
crime2019_filled = fill_closest('count', crime2019)
crime2020_filled = fill_closest('count', crime2020)
crime2021_filled = fill_closest('count', crime2021)
crime2022_filled = fill_closest('count', crime2022)
crime2023_filled = fill_closest('count', crime2023)

crime2023_filled['year'] = 2023
crime2022_filled['year'] = 2022
crime2021_filled['year'] = 2021
crime2020_filled['year'] = 2020
crime2019_filled['year'] = 2019
crime2018_filled['year'] = 2018
crime2017_filled['year'] = 2017
crime2016_filled['year'] = 2016
crime2015_filled['year'] = 2015

crime_filled = pd.concat([crime2015_filled, crime2016_filled, crime2017_filled, crime2018_filled, crime2019_filled, 
                                        crime2020_filled, crime2021_filled, crime2022_filled, crime2023_filled])\
                                        .reset_index(drop=True)

In [86]:
income_filled = fill_closest('2015', income)
population_filled = fill_closest('2015', population)

In [88]:
rent_f1_filled = fill_closest('2015', rent_f1)
rent_f2_filled = fill_closest('2015', rent_f2)
rent_f3_filled = fill_closest('2015', rent_f3)
rent_h2_filled = fill_closest('2015', rent_h2)
rent_h3_filled = fill_closest('2015', rent_h3)
rent_h4_filled = fill_closest('2015', rent_h4)
rent_av_filled = fill_closest('2015', rent_av)

### 5. Merge SA2 data

In [156]:
def sa2_data_by_year(year):
    """merge population, income, ... data to one DataFrame by year

    Args:
        year (int): the year to select from each DataFrame

    Returns:
        DataFrame: the merged DataFrame
    """    
    gdf_year = gdf[['sa2']]
    gdf_year['year'] = year
    gdf_year['crime_density'] = crime_filled[crime_filled['year'] == year]['count']\
                                                                   .reset_index(drop=True) / gdf['area']
    gdf_year['population_density'] = population_filled[str(year)] / gdf['area']
    gdf_year['income'] = income_filled[str(year)]
    gdf_year['rent'] = rent_av_filled[str(year)]
    gdf_year = gdf_year[['sa2', 'year', 'income', 'population_density', 'crime_density', 'rent']]
    return gdf_year

In [157]:
# merging data by year
gdf2023 = sa2_data_by_year(2023)
gdf2022 = sa2_data_by_year(2022)
gdf2021 = sa2_data_by_year(2021)
gdf2020 = sa2_data_by_year(2020)
gdf2019 = sa2_data_by_year(2019)
gdf2018 = sa2_data_by_year(2018)
gdf2017 = sa2_data_by_year(2017)
gdf2016 = sa2_data_by_year(2016)
gdf2015 = sa2_data_by_year(2015)

In [None]:
gdf_all = pd.concat([gdf2015, gdf2016, gdf2017, gdf2018, gdf2019,
                                   gdf2020, gdf2021, gdf2022, gdf2023])
gdf_all

Unnamed: 0,sa2,year,income,population_density,crime_density,rent
0,abbotsford,2015,57501.00000,4641.195059,668.198793,430.000000
1,airport west,2015,52329.00000,2068.613217,280.886871,356.026523
2,albert park,2015,67627.00000,3434.017156,84.604360,440.000000
3,alexandra,2015,35244.00000,3.022244,0.008853,343.333333
4,alfredton,2015,49385.00000,209.425375,3.403327,280.000000
...,...,...,...,...,...,...
517,yarram,2023,44127.00000,2.915321,0.007494,385.000000
518,yarraville,2023,82822.00000,2877.046263,174.021352,463.333333
519,yarrawonga,2023,50407.00000,93.762408,1.908644,390.000000
520,yarriambiack,2023,60316.93264,1.476967,0.008769,350.000000


##### Save all SA2 data

In [None]:
gdf_all.to_csv('../data/curated/sa2_data_by_year.csv', index=None)

##### SA2 data of 2023

In [167]:
sa2_2023 = gpd.GeoDataFrame(gdf2023)
sa2_2023['geometry'] = gdf['geometry']
sa2_2023['area'] = gdf['area']
sa2_2023

Unnamed: 0,sa2,year,income,population_density,crime_density,rent,geometry,area
0,abbotsford,2023,78606.000000,5847.744901,782.533755,520.000000,"POLYGON ((144.99255 -37.80249, 144.99266 -37.8...",1.7405
1,airport west,2023,69567.000000,2321.342152,190.808067,450.546861,"POLYGON ((144.86706 -37.72471, 144.86798 -37.7...",3.7194
2,albert park,2023,77185.000000,3592.316084,71.234518,500.000000,"POLYGON ((144.96767 -37.83737, 144.96789 -37.8...",4.6747
3,alexandra,2023,46318.000000,3.256793,0.007993,465.333333,"POLYGON ((145.59015 -37.22477, 145.58638 -37.2...",2118.9554
4,alfredton,2023,63334.000000,349.567167,4.915357,400.000000,"POLYGON ((143.78282 -37.56666, 143.75558 -37.5...",52.7109
...,...,...,...,...,...,...,...,...
517,yarram,2023,44127.000000,2.915321,0.007494,385.000000,"MULTIPOLYGON (((146.62190 -38.75021, 146.62185...",1931.8628
518,yarraville,2023,82822.000000,2877.046263,174.021352,463.333333,"POLYGON ((144.85915 -37.81764, 144.85984 -37.8...",5.6200
519,yarrawonga,2023,50407.000000,93.762408,1.908644,390.000000,"POLYGON ((146.00051 -36.00877, 146.00128 -36.0...",94.6968
520,yarriambiack,2023,60316.932639,1.476967,0.008769,350.000000,"POLYGON ((142.23900 -35.99787, 142.23898 -35.9...",7139.5816


In [168]:
sa2_2023.to_file('../data/curated/sa2_data_2023.json', driver='GeoJSON')

### 6. Merge property data

In [179]:
prop_feature = properties.merge(distance_duration, on='id', how='inner')
prop_feature = prop_feature.merge(gdf2023, on='sa2', how='inner')
prop_feature = prop_feature[['id', 'postcode', 'type', 'price', 'bond', 'bed', 'bath',
                                                'parking', 'is_furnished', 'latitude', 'longitude', 'sa2', 'dist_CBD',
                                                'dist_public_transport', 'dist_hospital', 'dist_police_station',
                                                'dist_supermarket', 'dist_market', 'dist_shopping_center',
                                                'dist_school', 'dur_CBD', 'dur_public_transport', 'dur_hospital',
                                                'dur_police_station', 'dur_supermarket', 'dur_market',
                                                'dur_shopping_center', 'dur_school', 'income',
                                                'population_density', 'crime_density', 'rent']]
prop_feature

Unnamed: 0,id,postcode,type,price,bond,bed,bath,parking,is_furnished,latitude,...,dur_hospital,dur_police_station,dur_supermarket,dur_market,dur_shopping_center,dur_school,income,population_density,crime_density,rent
0,12,3223,House,4700.0,1000.0,4.0,3.0,0.0,0,-38.143992,...,4347,434,222,228,942,264,46819.0,138.848105,1.314240,458.333333
1,51,3223,House,2500.0,1000.0,4.0,2.0,2.0,0,-38.179202,...,6577,738,166,166,712,150,46819.0,138.848105,1.314240,458.333333
2,261,3223,Apartment / Unit / Flat,1750.0,1000.0,2.0,1.0,0.0,0,-38.120924,...,4027,96,112,128,589,99,46819.0,138.848105,1.314240,458.333333
3,260,3223,Apartment / Unit / Flat,1750.0,1000.0,2.0,1.0,0.0,0,-38.120924,...,4027,96,112,128,589,99,46819.0,138.848105,1.314240,458.333333
4,259,3223,Apartment / Unit / Flat,1750.0,1000.0,2.0,1.0,0.0,0,-38.120924,...,4027,96,112,128,589,99,46819.0,138.848105,1.314240,458.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8969,8663,3465,House,300.0,1200.0,3.0,1.0,3.0,0,-36.942684,...,1334,872,795,809,1307,9,45051.0,2.070306,0.003232,386.666667
8970,8656,3737,House,300.0,1200.0,2.0,1.0,0.0,0,-36.588283,...,454,378,347,413,1612,475,50464.0,8.563985,0.052397,395.000000
8971,8750,3885,House,290.0,1160.0,2.0,1.0,0.0,1,-37.332661,...,4308,1443,1413,1453,3794,1418,43892.0,0.589176,0.001010,400.000000
8972,8889,3477,House,250.0,1000.0,2.0,1.0,1.0,0,-36.843227,...,1429,829,299,1261,1311,350,50809.0,1.140739,0.002494,364.415998


#### Save merged property data

In [180]:
prop_feature.to_csv('../data/curated/prop_feature.csv', index=None)