In [248]:
import pandas as pd
import numpy as np

In [249]:
data = pd.read_csv('../data/processed/real_estate/vic_rentals_all_enriched.csv')
data.shape

(12717, 57)

In [250]:
data = data.drop(columns=["listing_id", "date_listed", "address", "photo_count", "video_count",
                          "floorplans_count", "virtual_tour", "primary_type", "secondary_type",
                          "agent_names",])
data.shape

(12717, 47)

#### Find how many null values per feature

In [251]:
def find_nans(data):
    missing_list = [(col, data[col].isnull().sum()) for col in data.columns]
    non_nans = [(col, cnt) for col, cnt in missing_list if cnt != 0]
    return sorted(non_nans, key=lambda x: x[1], reverse=True)  # sort by column name
print(find_nans(data))

[('land_area', np.int64(12715)), ('SAL_NAME21', np.int64(3267)), ('incidents_recorded', np.int64(3267)), ('rate_per_100000_population', np.int64(3267)), ('population_est', np.int64(3267)), ('crime_per_person', np.int64(3267)), ('crime_index', np.int64(3267)), ('crime_rank', np.int64(3267)), ('carspaces', np.int64(1803)), ('bond', np.int64(782)), ('weekly_rent', np.int64(279)), ('available_date', np.int64(136)), ('bedrooms', np.int64(125)), ('bathrooms', np.int64(51)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (

#### Drop rows with small number of missing values

In [252]:
# data = data.dropna(subset=['date_listed', 'lat', 'lon', 'bedrooms', 'bathrooms'])

### Imputation

#### Average rent

In [253]:
#Impute average rent for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average weekly rent
rent_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['weekly_rent']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [254]:
rent_lookup

{('ABBOTSFORD', 'Apartment / Unit / Flat', 1.0, 1.0): 552.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 2.0, 1.0): 714.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 2.0, 2.0): 692.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 3.0, 2.0): 958.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 4.0, 4.0): 1225.0,
 ('ABBOTSFORD', 'House', 2.0, 1.0): 775.0,
 ('ABBOTSFORD', 'House', 3.0, 1.0): 870.0,
 ('ABBOTSFORD', 'Townhouse', 2.0, 1.0): 650.0,
 ('ABBOTSFORD', 'Townhouse', 3.0, 2.0): 935.0,
 ('ABERFELDIE', 'Apartment / Unit / Flat', 1.0, 1.0): 420.0,
 ('ABERFELDIE', 'Apartment / Unit / Flat', 2.0, 1.0): 430.0,
 ('ABERFELDIE', 'House', 2.0, 1.0): 550.0,
 ('ABERFELDIE', 'Townhouse', 2.0, 2.0): 580.0,
 ('ABERFELDIE', 'Townhouse', 4.0, 3.0): 1000.0,
 ('AIRPORT WEST', 'Apartment / Unit / Flat', 2.0, 1.0): 458.0,
 ('AIRPORT WEST', 'Apartment / Unit / Flat', 3.0, 1.0): 580.0,
 ('AIRPORT WEST', 'House', 2.0, 1.0): 530.0,
 ('AIRPORT WEST', 'House', 3.0, 1.0): 591.0,
 ('AIRPORT WEST', 'House', 3.0, 2.0): 

In [255]:
#Impute missing weekly_rent values
for idx, row in data.iterrows():
    if pd.isnull(row['weekly_rent']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in rent_lookup:  # only impute if lookup exists
            data.at[idx, 'weekly_rent'] = rent_lookup[key]

In [256]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12715)), ('SAL_NAME21', np.int64(3267)), ('incidents_recorded', np.int64(3267)), ('rate_per_100000_population', np.int64(3267)), ('population_est', np.int64(3267)), ('crime_per_person', np.int64(3267)), ('crime_index', np.int64(3267)), ('crime_rank', np.int64(3267)), ('carspaces', np.int64(1803)), ('bond', np.int64(782)), ('available_date', np.int64(136)), ('bedrooms', np.int64(125)), ('weekly_rent', np.int64(78)), ('bathrooms', np.int64(51)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%

In [257]:
#Impute average rent with relaxed contraints (without bathrooms and suburb) to fill rest of missing values
#Create a lookup dictionary for average weekly rent
rent_lookup = (
    data.groupby(['property_type', 'bedrooms'])['weekly_rent']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [258]:
#Impute missing weekly_rent values
for idx, row in data.iterrows():
    if pd.isnull(row['weekly_rent']):
        key = (row['property_type'], row['bedrooms'])
        if key in rent_lookup:  # only impute if lookup exists
            data.at[idx, 'weekly_rent'] = rent_lookup[key]

In [259]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12715)), ('SAL_NAME21', np.int64(3267)), ('incidents_recorded', np.int64(3267)), ('rate_per_100000_population', np.int64(3267)), ('population_est', np.int64(3267)), ('crime_per_person', np.int64(3267)), ('crime_index', np.int64(3267)), ('crime_rank', np.int64(3267)), ('carspaces', np.int64(1803)), ('bond', np.int64(782)), ('available_date', np.int64(136)), ('bedrooms', np.int64(125)), ('bathrooms', np.int64(51)), ('weekly_rent', np.int64(19)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%

In [260]:
data = data.dropna(subset=['weekly_rent'])

#### Imputing carspaces

In [261]:
#Impute average carspaces for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average carspaces
carspace_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['carspaces']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [262]:
#Impute missing carspaces values
for idx, row in data.iterrows():
    if pd.isnull(row['carspaces']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in carspace_lookup:  # only impute if lookup exists
            data.at[idx, 'carspaces'] = carspace_lookup[key]

In [263]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12696)), ('SAL_NAME21', np.int64(3264)), ('incidents_recorded', np.int64(3264)), ('rate_per_100000_population', np.int64(3264)), ('population_est', np.int64(3264)), ('crime_per_person', np.int64(3264)), ('crime_index', np.int64(3264)), ('crime_rank', np.int64(3264)), ('bond', np.int64(779)), ('carspaces', np.int64(457)), ('available_date', np.int64(130)), ('bedrooms', np.int64(108)), ('bathrooms', np.int64(38)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure

In [264]:
#Impute average carspaces with relaxed contraints (without bathrooms) to fill rest of missing values
# Create a lookup dictionary for average carspaces
carspace_lookup = (
    data.groupby(['property_type', 'bedrooms'])['carspaces']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [265]:
#Impute missing carspaces values
for idx, row in data.iterrows():
    if pd.isnull(row['carspaces']):
        key = ( row['property_type'], row['bedrooms'])
        if key in carspace_lookup:  # only impute if lookup exists
            data.at[idx, 'carspaces'] = carspace_lookup[key]

In [266]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12696)), ('SAL_NAME21', np.int64(3264)), ('incidents_recorded', np.int64(3264)), ('rate_per_100000_population', np.int64(3264)), ('population_est', np.int64(3264)), ('crime_per_person', np.int64(3264)), ('crime_index', np.int64(3264)), ('crime_rank', np.int64(3264)), ('bond', np.int64(779)), ('available_date', np.int64(130)), ('bedrooms', np.int64(108)), ('carspaces', np.int64(73)), ('bathrooms', np.int64(38)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure 

In [267]:
data = data.dropna(subset=['carspaces'])

#### Redundant columns

In [270]:
data[data["SAL_NAME21"].isna()]

Unnamed: 0,suburb,postcode,weekly_rent,bond,available_date,days_listed,bedrooms,bathrooms,carspaces,property_type,...,Certificate_level (%),Total_persons,Population-2023,SAL_NAME21,incidents_recorded,rate_per_100000_population,population_est,crime_per_person,crime_index,crime_rank
5,BRIGHTON,3186,1575.0,,"Saturday, 18 June 2022",4839.0,2.0,2.0,1.0,Apartment / Unit / Flat,...,0.106986,13871.0,18828,,,,,,,
8,ARMADALE,3143,1575.0,,"Saturday, 18 June 2022",4522.0,2.0,2.0,1.0,Apartment / Unit / Flat,...,0.083032,6359.0,20268,,,,,,,
11,CARLTON,3053,350.0,1521.0,"Thursday, 12 December 2024",4455.0,1.0,1.0,2.0,Studio,...,0.075892,11569.0,21376,,,,,,,
13,BURWOOD,3125,320.0,,"Tuesday, 09 December 2014",4455.0,,1.0,8.0,Studio,...,0.127570,7392.0,20600,,,,,,,
14,BURWOOD,3125,300.0,,"Monday, 16 March 2020",553.0,1.0,1.0,20.0,Apartment / Unit / Flat,...,0.127570,7392.0,20600,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12704,RESERVOIR,3073,480.0,2086.0,"Tuesday, 09 September 2025",0.0,3.0,1.0,2.0,Apartment / Unit / Flat,...,0.209614,7676.0,14728,,,,,,,
12705,DONNYBROOK,3064,490.0,2129.0,"Wednesday, 08 October 2025",0.0,3.0,2.0,2.0,House,...,0.391460,5293.0,15185,,,,,,,
12707,DONNYBROOK,3064,530.0,2303.0,"Tuesday, 09 September 2025",0.0,4.0,2.0,2.0,House,...,0.391460,5293.0,15185,,,,,,,
12714,SPRINGVALE,3171,650.0,2824.0,"Friday, 10 October 2025",0.0,3.0,2.0,2.0,Townhouse,...,0.199609,9198.0,23509,,,,,,,


In [None]:
data = data.drop(columns=["land_area", "SAL_NAME21", ])
print(find_nans(data))

[('SAL_NAME21', np.int64(3247)), ('incidents_recorded', np.int64(3247)), ('rate_per_100000_population', np.int64(3247)), ('population_est', np.int64(3247)), ('crime_per_person', np.int64(3247)), ('crime_index', np.int64(3247)), ('crime_rank', np.int64(3247)), ('bond', np.int64(764)), ('available_date', np.int64(130)), ('bedrooms', np.int64(45)), ('bathrooms', np.int64(26)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_grad