In [2]:
import numpy as np
import pandas as pd

In [3]:
import glob
import os
# csv_files = glob.glob(os.path.join("../data/raw/domain/suburbs", "*.csv"))
# joined_suburbs = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True).sort_values(by='listing_id').reset_index(drop=True)

# print(len(joined_suburbs))
# joined_suburbs.head()

### Data reading

In [4]:
# Load the scraped rental listings, ordered by listing_id and re-indexed 0..n-1.
all_suburbs = (
    pd.read_csv("../data/raw/domain/vic_rentals_all.csv")
    .sort_values(by='listing_id')
    .reset_index(drop=True)
)
# Row count as a quick sanity check on the load.
print(len(all_suburbs))
all_suburbs.head()

12717


Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,floorplans_count,virtual_tour,primary_type,secondary_type,agency,agency_id,agent_names,structured_features,url,land_area
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,0.0,False,House,House,Keyhole Property Investments,8749.0,"Rentals, Reception","Air conditioning, Bath, Built in wardrobes, Cl...",https://www.domain.com.au/68-francis-street-as...,
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,0.0,False,Apartment,Apartment / Unit / Flat,Raine & Horne Melton,22328.0,,,https://www.domain.com.au/10-glenville-drive-m...,
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,0.0,False,Apartment,Studio,Match Property Group,8668.0,Lisbeth Rosborg-Winter,"Built in wardrobes, Furnished, Broadband inter...",https://www.domain.com.au/32-546-flinders-stre...,
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,0.0,False,Apartment,Apartment / Unit / Flat,Simone Bullen,7896.0,Ebonnie Reid,Ground floor,https://www.domain.com.au/7-64-holmes-road-moo...,
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,0.0,False,Apartment,Apartment / Unit / Flat,Prime Property Partners Australia,2231.0,Maia Weinberg,,https://www.domain.com.au/1-60-the-avenue-prah...,


### Initial feature selection

In [5]:
# List every column of the loaded listings table before choosing which ones to drop.
all_suburbs.columns

Index(['listing_id', 'suburb', 'postcode', 'weekly_rent', 'bond',
       'available_date', 'date_listed', 'days_listed', 'bedrooms', 'bathrooms',
       'carspaces', 'property_type', 'address', 'lat', 'lon', 'scraped_date',
       'domain_page_id', 'property_id', 'photo_count', 'video_count',
       'floorplans_count', 'virtual_tour', 'primary_type', 'secondary_type',
       'agency', 'agency_id', 'agent_names', 'structured_features', 'url',
       'land_area'],
      dtype='object')

In [6]:
# Columns removed from the listings table by the drop() call in the next cell.
# NOTE(review): these look like scrape metadata / identifiers rather than
# modelling features — confirm none of them is needed downstream.
DROPPED_COLUMNS = [
    "scraped_date",
    "domain_page_id",
    "property_id",
    "agency_id",
    "structured_features",
    "url",
]

In [7]:
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
all_suburbs = all_suburbs.drop(columns=DROPPED_COLUMNS, errors="ignore")
all_suburbs.head()

Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,lon,photo_count,video_count,floorplans_count,virtual_tour,primary_type,secondary_type,agency,agent_names,land_area
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,144.9182,12.0,0.0,0.0,False,House,House,Keyhole Property Investments,"Rentals, Reception",
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,144.59305,6.0,0.0,0.0,False,Apartment,Apartment / Unit / Flat,Raine & Horne Melton,,
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,144.95618,1.0,0.0,0.0,False,Apartment,Studio,Match Property Group,Lisbeth Rosborg-Winter,
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,144.91553,7.0,0.0,0.0,False,Apartment,Apartment / Unit / Flat,Simone Bullen,Ebonnie Reid,
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,144.9986,12.0,0.0,0.0,False,Apartment,Apartment / Unit / Flat,Prime Property Partners Australia,Maia Weinberg,


### Feature Engineering

#### Merging with transport data

In [8]:
def find_num_stops_within_radius(property_lat, property_lon, stops_df, radius_km):
    """
    Count the transport stops lying within a given radius (in km) of a property.

    Distances are great-circle distances computed with the haversine formula
    on a sphere of radius 6371 km.

    Parameters:
    property_lat (float): Latitude of the property, in degrees
    property_lon (float): Longitude of the property, in degrees
    stops_df (DataFrame): DataFrame containing transport stops with 'Latitude' and 'Longitude' columns (degrees)
    radius_km (float): Radius in kilometers; a stop exactly on the boundary is counted

    Returns:
    int: Number of stops within the specified radius. An empty stops_df gives 0,
    and so does a property with NaN coordinates, because NaN distances never
    satisfy the comparison.
    """
    earth_radius_km = 6371

    # Haversine formula
    lat_diff = np.radians(stops_df['Latitude'] - property_lat)
    lon_diff = np.radians(stops_df['Longitude'] - property_lon)

    a = np.sin(lat_diff / 2)**2 + np.cos(np.radians(property_lat)) * \
        np.cos(np.radians(stops_df['Latitude'])) * np.sin(lon_diff / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    distances_km = earth_radius_km * c

    # Only the count is needed, so the boolean mask is summed directly rather
    # than materialising the matching rows on every call.
    within_radius = distances_km <= radius_km

    # Convert to a plain int (np.sum yields numpy.int64) to match the documented return type.
    return int(np.sum(within_radius))

def return_stop_insights_per_type(property_lat, property_lon, stops_df, radius):
    """
    Count the stops of each type within `radius` (km) of a property.

    Parameters:
    property_lat (float): Latitude of the property
    property_lon (float): Longitude of the property
    stops_df (DataFrame): Transport stops with 'StopType', 'Latitude' and 'Longitude' columns
    radius (float): Radius in kilometers, forwarded to find_num_stops_within_radius

    Returns:
    dict: One count per stop type, keyed "NumMetroBusStops", "NumMetroTramStops",
    "NumMetroTrainStops", "NumRegionalTrainStops" and "NumRegionalBusStops".
    """
    # Output key -> value of the 'StopType' column it counts.
    key_to_stop_type = {
        "NumMetroBusStops": "Metro Bus",
        "NumMetroTramStops": "Metro Tram",
        "NumMetroTrainStops": "Metro Train",
        "NumRegionalTrainStops": "Regional Train",
        "NumRegionalBusStops": "Regional Bus",
    }
    return {
        key: find_num_stops_within_radius(
            property_lat, property_lon, stops_df[stops_df["StopType"] == stop_type], radius)
        for key, stop_type in key_to_stop_type.items()
    }

def return_stop_insights_metro_bus(property_lat, property_lon, stops_df, radius):
    """Count the "Metro Bus" stops in stops_df within `radius` (km) of the property."""
    is_metro_bus = stops_df["StopType"] == "Metro Bus"
    metro_bus_stops = stops_df[is_metro_bus]
    return find_num_stops_within_radius(property_lat, property_lon, metro_bus_stops, radius)

def return_stop_insights_metro_tram(property_lat, property_lon, stops_df, radius):
    """Count the "Metro Tram" stops in stops_df within `radius` (km) of the property."""
    is_metro_tram = stops_df["StopType"] == "Metro Tram"
    metro_tram_stops = stops_df[is_metro_tram]
    return find_num_stops_within_radius(property_lat, property_lon, metro_tram_stops, radius)

def return_stop_insights_metro_train(property_lat, property_lon, stops_df, radius):
    """Count the "Metro Train" stops in stops_df within `radius` (km) of the property."""
    is_metro_train = stops_df["StopType"] == "Metro Train"
    metro_train_stops = stops_df[is_metro_train]
    return find_num_stops_within_radius(property_lat, property_lon, metro_train_stops, radius)

def return_stop_insights_regional_train(property_lat, property_lon, stops_df, radius):
    """Count the "Regional Train" stops in stops_df within `radius` (km) of the property."""
    is_regional_train = stops_df["StopType"] == "Regional Train"
    regional_train_stops = stops_df[is_regional_train]
    return find_num_stops_within_radius(property_lat, property_lon, regional_train_stops, radius)

def return_stop_insights_regional_bus(property_lat, property_lon, stops_df, radius):
    """Count the "Regional Bus" stops in stops_df within `radius` (km) of the property."""
    is_regional_bus = stops_df["StopType"] == "Regional Bus"
    regional_bus_stops = stops_df[is_regional_bus]
    return find_num_stops_within_radius(property_lat, property_lon, regional_bus_stops, radius)

In [9]:
# Search radius, in km, passed to the stop-counting helpers for every listing.
RADIUS = 2 # 2km
# Transport stop table; the helpers read its StopType, Latitude and Longitude columns.
transport_data = pd.read_csv("../data/processed/transport/transport_stops.csv")
transport_data.head()

Unnamed: 0,StopName,StopType,Latitude,Longitude
0,10 Jarrah Dr,Metro Bus,-38.002837,145.110716
1,10 Oban Rd,Metro Bus,-37.796342,145.252047
2,10 Queens Pde,Metro Bus,-37.719582,144.971255
3,100 South Gippsland Hwy,Metro Bus,-38.008999,145.229229
4,1000 Steps/Mount Dandenong Tourist Rd,Metro Bus,-37.889758,145.318343


In [10]:
# Slow cell: one haversine pass over the stops table per listing and stop type.
# Each stop type is filtered out of transport_data once, outside the row-wise
# apply, so the per-listing work is only the distance computation.

# New column name -> value of the 'StopType' column it counts.
STOP_COUNT_COLUMNS = {
    "num_metro_bus_stops": "Metro Bus",
    "num_metro_tram_stops": "Metro Tram",
    "num_metro_train_stops": "Metro Train",
    "num_regional_bus_stops": "Regional Bus",
    "num_regional_train_stops": "Regional Train",
}

for column_name, stop_type in STOP_COUNT_COLUMNS.items():
    stops_of_type = transport_data[transport_data["StopType"] == stop_type]
    all_suburbs[column_name] = all_suburbs.apply(
        # Bind the filtered table as a default argument so the lambda keeps this stop type.
        lambda row, stops=stops_of_type: find_num_stops_within_radius(
            row["lat"], row["lon"], stops, RADIUS),
        axis=1,
    )
all_suburbs.head()

Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,primary_type,secondary_type,agency,agent_names,land_area,num_metro_bus_stops,num_metro_tram_stops,num_metro_train_stops,num_regional_bus_stops,num_regional_train_stops
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,House,House,Keyhole Property Investments,"Rentals, Reception",,83,38,4,0,0
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,Apartment,Apartment / Unit / Flat,Raine & Horne Melton,,,35,0,0,0,0
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,Apartment,Studio,Match Property Group,Lisbeth Rosborg-Winter,,85,108,6,0,3
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,Apartment,Apartment / Unit / Flat,Simone Bullen,Ebonnie Reid,,129,40,3,0,1
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,Apartment,Apartment / Unit / Flat,Prime Property Partners Australia,Maia Weinberg,,85,100,7,0,0


#### Merging with school location data


In [11]:
# Load processed school location data.
# No preview here: only a cell's final expression is rendered, so a bare
# school_data.head() in the middle of the cell would be computed and discarded.
school_data = pd.read_csv("../data/processed/schools/school_locations.csv")

# Function to count schools within a radius of each property using haversine formula
from numpy import radians, cos, sin, sqrt, arctan2
def count_schools_within_radius(lat, lon, school_df, radius_km=2):
    """
    Count the schools strictly closer than `radius_km` to a point.

    Distances are haversine great-circle distances on a sphere of radius 6371 km.

    Parameters:
    lat (float): Latitude of the point, in degrees
    lon (float): Longitude of the point, in degrees
    school_df (DataFrame): Schools with 'Latitude' and 'Longitude' columns (degrees)
    radius_km (float): Radius in kilometers (default 2); a school exactly at the radius is not counted

    Returns:
    Number of schools whose distance is below radius_km.
    """
    earth_radius_km = 6371

    # Everything in radians before applying the formula.
    point_lat = radians(lat)
    point_lon = radians(lon)
    school_lats = radians(school_df['Latitude'])
    school_lons = radians(school_df['Longitude'])

    # Haversine term, then the central angle between the point and each school.
    haversine = (
        sin((school_lats - point_lat)/2)**2
        + cos(point_lat) * cos(school_lats) * sin((school_lons - point_lon)/2)**2
    )
    central_angle = 2 * arctan2(sqrt(haversine), sqrt(1-haversine))

    is_close = earth_radius_km * central_angle < radius_km
    return is_close.sum()

# New feature: how many schools lie within 2 km of each listing's coordinates.
all_suburbs['num_schools_2km'] = all_suburbs.apply(
    lambda listing: count_schools_within_radius(
        listing['lat'], listing['lon'], school_data, radius_km=2
    ),
    axis=1,
)
all_suburbs.head()

Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,secondary_type,agency,agent_names,land_area,num_metro_bus_stops,num_metro_tram_stops,num_metro_train_stops,num_regional_bus_stops,num_regional_train_stops,num_schools_2km
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,House,Keyhole Property Investments,"Rentals, Reception",,83,38,4,0,0,13
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,Apartment / Unit / Flat,Raine & Horne Melton,,,35,0,0,0,0,6
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,Studio,Match Property Group,Lisbeth Rosborg-Winter,,85,108,6,0,3,11
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,Apartment / Unit / Flat,Simone Bullen,Ebonnie Reid,,129,40,3,0,1,13
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,Apartment / Unit / Flat,Prime Property Partners Australia,Maia Weinberg,,85,100,7,0,0,21


### Merging with census data

Merging all_suburbs with the census data via a spatial join on the SA2 polygons from the GeoJSON file

In [12]:
import geopandas as gdp
from shapely.geometry import Point
import pandas as pd

# Census polygons whose attributes are attached to each listing.
census = gdp.read_file("../data/processed/demographics/sa2_census.geojson")

# One point per listing, built from (lon, lat) pairs, i.e. x/y order.
geometry = list(map(Point, zip(all_suburbs["lon"], all_suburbs["lat"])))

# Listing coordinates are declared as EPSG:7844, then reprojected to the census CRS.
suburbs_gdf = gdp.GeoDataFrame(
    all_suburbs, geometry=geometry, crs="EPSG:7844"
).to_crs(census.crs)

# Left join: every listing is kept and picks up the attributes of the polygon it falls within.
# NOTE(review): a point matching more than one polygon would come back as several
# rows — presumably the polygons do not overlap; confirm the row count is unchanged.
merged = gdp.sjoin(suburbs_gdf, census, how="left", predicate="within")

# Region identifiers and join artefacts that are not carried forward.
dropped_columns = [
    "SA2_CODE_2021",
    "SA2_NAME21",
    "SA3_CODE21",
    "SA3_NAME21",
    "geometry",
    "index_right",
]

all_suburbs = merged.drop(columns=dropped_columns)

all_suburbs.head()

Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,Mortgage (%),Total rented (%),Other tenure (%),Unemployment,post_gradutae (%),Graduate_diploma_certificate(%),Bachelor (%),Advanced_&_Diploma (%),Certificate_level (%),Total_persons
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,0.327294,0.379467,0.00973,0.046495,0.168135,0.065559,0.382874,0.119857,0.158769,8969.0
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,0.297929,0.346025,0.011022,0.095581,0.030909,0.025455,0.129697,0.143939,0.413333,3300.0
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,0.123647,0.761911,0.018286,0.083853,0.219751,0.025098,0.41604,0.12361,0.081344,12232.0
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,0.28942,0.385893,0.013116,0.038438,0.170695,0.073617,0.393366,0.134493,0.153807,9889.0
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,0.237092,0.54936,0.014147,0.036782,0.164398,0.050746,0.458828,0.118529,0.110812,13735.0


### Merging with population growth

In [13]:
# Population table, joined onto the listings by postcode.
population = pd.read_csv("../data/processed/population/population_final.csv")

# Left merge keeps every listing; the table's "SA2 code" column is discarded afterwards.
# NOTE(review): if population has more than one row per postcode, listings would be
# duplicated here — presumably postcode is unique in that file; confirm.
all_suburbs = (
    all_suburbs
    .merge(population, on="postcode", how="left")
    .drop(columns=["SA2 code"])
)

In [14]:
# Preview the first rows after the population merge.
all_suburbs.head()

Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,Total rented (%),Other tenure (%),Unemployment,post_gradutae (%),Graduate_diploma_certificate(%),Bachelor (%),Advanced_&_Diploma (%),Certificate_level (%),Total_persons,Population-2023
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,0.379467,0.00973,0.046495,0.168135,0.065559,0.382874,0.119857,0.158769,8969.0,13408
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,0.346025,0.011022,0.095581,0.030909,0.025455,0.129697,0.143939,0.413333,3300.0,8011
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,0.761911,0.018286,0.083853,0.219751,0.025098,0.41604,0.12361,0.081344,12232.0,18017
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,0.385893,0.013116,0.038438,0.170695,0.073617,0.393366,0.134493,0.153807,9889.0,17203
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,0.54936,0.014147,0.036782,0.164398,0.050746,0.458828,0.118529,0.110812,13735.0,20268


### Merging with crime

In [15]:
# suburb_crime = pd.read_csv("../data/processed/crime/suburb_crime_final.csv")
# all_suburbs = all_suburbs.merge(
#     suburb_crime,
#     how="left",
#     left_on="suburb",     # from rental dataset
#     right_on="loc_name"   # from crime dataset
# )

In [16]:
# Display the table as it stands before export (the crime merge above is commented out).
all_suburbs

Unnamed: 0,listing_id,suburb,postcode,weekly_rent,bond,available_date,date_listed,days_listed,bedrooms,bathrooms,...,Total rented (%),Other tenure (%),Unemployment,post_gradutae (%),Graduate_diploma_certificate(%),Bachelor (%),Advanced_&_Diploma (%),Certificate_level (%),Total_persons,Population-2023
0,5470976,ASCOT VALE,3032,660.0,2868.0,"Wednesday, 10 September 2025",2025-07-21,50.0,2.0,1.0,...,0.379467,0.009730,0.046495,0.168135,0.065559,0.382874,0.119857,0.158769,8969.0,13408
1,5604062,MELTON,3337,,150.0,"Tuesday, 15 January 2008",2008-01-15,6446.0,,,...,0.346025,0.011022,0.095581,0.030909,0.025455,0.129697,0.143939,0.413333,3300.0,8011
2,6168570,MELBOURNE,3000,310.0,1347.0,"Monday, 03 July 2023",2025-09-02,7.0,,1.0,...,0.761911,0.018286,0.083853,0.219751,0.025098,0.416040,0.123610,0.081344,12232.0,18017
3,7117948,MOONEE PONDS,3039,500.0,2173.0,"Friday, 19 September 2025",2025-09-06,3.0,2.0,1.0,...,0.385893,0.013116,0.038438,0.170695,0.073617,0.393366,0.134493,0.153807,9889.0,17203
4,7455074,PRAHRAN,3181,,340.0,"Thursday, 22 May 2014",2012-03-02,4938.0,1.0,1.0,...,0.549360,0.014147,0.036782,0.164398,0.050746,0.458828,0.118529,0.110812,13735.0,20268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12712,17754387,CRANBOURNE,3977,460.0,1999.0,"Monday, 29 September 2025",2025-09-09,0.0,2.0,1.0,...,0.338007,0.008190,0.066765,0.059695,0.023620,0.168563,0.170067,0.396929,9314.0,13530
12713,17754389,NORTH MELBOURNE,3051,720.0,3128.0,"Tuesday, 09 September 2025",2025-09-09,0.0,2.0,2.0,...,0.639825,0.018818,0.070373,0.245423,0.048634,0.391513,0.092480,0.087128,10651.0,18017
12714,17754400,SPRINGVALE,3171,650.0,2824.0,"Friday, 10 October 2025",2025-09-09,0.0,3.0,2.0,...,0.393775,0.036960,0.081065,0.121439,0.024136,0.313112,0.162318,0.199609,9198.0,23509
12715,17754418,NORTH MELBOURNE,3051,640.0,2780.0,"Tuesday, 09 September 2025",2025-09-09,0.0,2.0,2.0,...,0.639825,0.018818,0.070373,0.245423,0.048634,0.391513,0.092480,0.087128,10651.0,18017


### Exporting for preprocessing

In [17]:
# Destination of the enriched listings table consumed by the preprocessing step.
ENRICHED_PATH = "../data/processed/real_estate/vic_rentals_all_enriched.csv"

# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(os.path.dirname(ENRICHED_PATH), exist_ok=True)
all_suburbs.to_csv(ENRICHED_PATH, index=False)