In [107]:
import pandas as pd
import numpy as np

In [108]:
# Load the enriched rental listings table and show its (rows, columns) size
rentals_csv = '../data/processed/real_estate/vic_rentals_all_enriched.csv'
data = pd.read_csv(rentals_csv)
data.shape

(12717, 50)

#### Find how many null values per feature

In [109]:
# Count null entries once for the whole frame, then pair each column name
# with its count, preserving the column order of `data`
null_counts = data.isnull().sum()
missing_list = [(name, null_counts[name]) for name in data.columns]
missing_list

[('listing_id', np.int64(0)),
 ('suburb', np.int64(0)),
 ('postcode', np.int64(0)),
 ('weekly_rent', np.int64(279)),
 ('bond', np.int64(782)),
 ('available_date', np.int64(136)),
 ('date_listed', np.int64(4)),
 ('days_listed', np.int64(4)),
 ('bedrooms', np.int64(125)),
 ('bathrooms', np.int64(51)),
 ('carspaces', np.int64(1803)),
 ('property_type', np.int64(0)),
 ('address', np.int64(104)),
 ('lat', np.int64(4)),
 ('lon', np.int64(4)),
 ('photo_count', np.int64(4)),
 ('video_count', np.int64(4)),
 ('floorplans_count', np.int64(4)),
 ('virtual_tour', np.int64(4)),
 ('primary_type', np.int64(4)),
 ('secondary_type', np.int64(4)),
 ('agency', np.int64(5)),
 ('agent_names', np.int64(248)),
 ('land_area', np.int64(12715)),
 ('num_metro_bus_stops', np.int64(0)),
 ('num_metro_tram_stops', np.int64(0)),
 ('num_metro_train_stops', np.int64(0)),
 ('num_regional_bus_stops', np.int64(0)),
 ('num_regional_train_stops', np.int64(0)),
 ('num_schools_2km', np.int64(0)),
 ('Median_age_persons', np.int

#### Drop rows with a small number of missing values

In [110]:
# NOTE(review): this row drop is currently disabled (commented out), so rows with
# missing date_listed / lat / lon / bedrooms / bathrooms remain in `data`.
# data = data.dropna(subset=['date_listed', 'lat', 'lon', 'bedrooms', 'bathrooms'])

In [111]:
# Show (rows, columns) of `data` at this point in the notebook
data.shape

(12717, 50)

### Imputation

In [112]:
# Build a lookup dict: (suburb, property_type, bedrooms, bathrooms) -> mean weekly_rent,
# rounded to 0 decimals. groupby skips rows with a NaN in any key column by default,
# so those rows contribute no entry to the lookup.
rent_group_cols = ['suburb', 'property_type', 'bedrooms', 'bathrooms']
mean_rent_by_group = data.groupby(rent_group_cols)['weekly_rent'].mean()
rent_lookup = mean_rent_by_group.round(0).to_dict()

In [113]:
#Impute missing weekly_rent values from rent_lookup
# Replaces the iterrows()/.at loop: pandas warns against modifying a frame while
# iterating over it, and a Python-level row loop is slow on large frames.
rent_key_cols = ['suburb', 'property_type', 'bedrooms', 'bathrooms']
rent_missing = data['weekly_rent'].isnull()
# One key tuple per row that needs a value, in the same order as the masked index
rent_keys = zip(*(data.loc[rent_missing, col] for col in rent_key_cols))
rent_fill = pd.Series(
    [rent_lookup.get(key, np.nan) for key in rent_keys],  # NaN when no lookup entry exists
    index=data.index[rent_missing],
    dtype='float64',
)
# fillna aligns on the index, so only the originally-missing rows are touched;
# rows without a matching lookup entry stay NaN
data['weekly_rent'] = data['weekly_rent'].fillna(rent_fill)

In [114]:
# Re-count nulls per column to see how many weekly_rent values are still missing
null_counts = data.isnull().sum()
missing_list = [(name, null_counts[name]) for name in data.columns]
print(missing_list)

[('listing_id', np.int64(0)), ('suburb', np.int64(0)), ('postcode', np.int64(0)), ('weekly_rent', np.int64(78)), ('bond', np.int64(782)), ('available_date', np.int64(136)), ('date_listed', np.int64(4)), ('days_listed', np.int64(4)), ('bedrooms', np.int64(125)), ('bathrooms', np.int64(51)), ('carspaces', np.int64(1803)), ('property_type', np.int64(0)), ('address', np.int64(104)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('photo_count', np.int64(4)), ('video_count', np.int64(4)), ('floorplans_count', np.int64(4)), ('virtual_tour', np.int64(4)), ('primary_type', np.int64(4)), ('secondary_type', np.int64(4)), ('agency', np.int64(5)), ('agent_names', np.int64(248)), ('land_area', np.int64(12715)), ('num_metro_bus_stops', np.int64(0)), ('num_metro_tram_stops', np.int64(0)), ('num_metro_train_stops', np.int64(0)), ('num_regional_bus_stops', np.int64(0)), ('num_regional_train_stops', np.int64(0)), ('num_schools_2km', np.int64(0)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay

In [115]:
# Rebuild the rent lookup with relaxed constraints (suburb and bathrooms dropped from
# the key) so the remaining gaps can be filled:
# (property_type, bedrooms) -> mean weekly_rent, rounded to 0 decimals
relaxed_rent_cols = ['property_type', 'bedrooms']
mean_rent_relaxed = data.groupby(relaxed_rent_cols)['weekly_rent'].mean()
rent_lookup = mean_rent_relaxed.round(0).to_dict()

In [116]:
#Impute missing weekly_rent values from the (property_type, bedrooms) rent_lookup
# Replaces the iterrows()/.at loop: pandas warns against modifying a frame while
# iterating over it, and a Python-level row loop is slow on large frames.
rent_key_cols = ['property_type', 'bedrooms']
rent_missing = data['weekly_rent'].isnull()
# One key tuple per row that needs a value, in the same order as the masked index
rent_keys = zip(*(data.loc[rent_missing, col] for col in rent_key_cols))
rent_fill = pd.Series(
    [rent_lookup.get(key, np.nan) for key in rent_keys],  # NaN when no lookup entry exists
    index=data.index[rent_missing],
    dtype='float64',
)
# fillna aligns on the index, so only the originally-missing rows are touched;
# rows without a matching lookup entry stay NaN
data['weekly_rent'] = data['weekly_rent'].fillna(rent_fill)

In [117]:
# Re-count nulls per column after the relaxed weekly_rent imputation
null_counts = data.isnull().sum()
missing_list = [(name, null_counts[name]) for name in data.columns]
print(missing_list)

[('listing_id', np.int64(0)), ('suburb', np.int64(0)), ('postcode', np.int64(0)), ('weekly_rent', np.int64(19)), ('bond', np.int64(782)), ('available_date', np.int64(136)), ('date_listed', np.int64(4)), ('days_listed', np.int64(4)), ('bedrooms', np.int64(125)), ('bathrooms', np.int64(51)), ('carspaces', np.int64(1803)), ('property_type', np.int64(0)), ('address', np.int64(104)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('photo_count', np.int64(4)), ('video_count', np.int64(4)), ('floorplans_count', np.int64(4)), ('virtual_tour', np.int64(4)), ('primary_type', np.int64(4)), ('secondary_type', np.int64(4)), ('agency', np.int64(5)), ('agent_names', np.int64(248)), ('land_area', np.int64(12715)), ('num_metro_bus_stops', np.int64(0)), ('num_metro_tram_stops', np.int64(0)), ('num_metro_train_stops', np.int64(0)), ('num_regional_bus_stops', np.int64(0)), ('num_regional_train_stops', np.int64(0)), ('num_schools_2km', np.int64(0)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay

In [118]:
# Drop the rows whose weekly_rent is still null after both imputation passes
# (the null count printed by the previous cell showed 19 such rows in this run)
data = data.dropna(subset=['weekly_rent'])

In [119]:
# Build a lookup dict: (suburb, property_type, bedrooms, bathrooms) -> mean carspaces,
# rounded to 0 decimals. groupby skips rows with a NaN in any key column by default,
# so those rows contribute no entry to the lookup.
carspace_group_cols = ['suburb', 'property_type', 'bedrooms', 'bathrooms']
mean_carspaces_by_group = data.groupby(carspace_group_cols)['carspaces'].mean()
carspace_lookup = mean_carspaces_by_group.round(0).to_dict()

In [120]:
#Impute missing carspaces values from carspace_lookup
# Replaces the iterrows()/.at loop: pandas warns against modifying a frame while
# iterating over it, and a Python-level row loop is slow on large frames.
carspace_key_cols = ['suburb', 'property_type', 'bedrooms', 'bathrooms']
carspace_missing = data['carspaces'].isnull()
# One key tuple per row that needs a value, in the same order as the masked index
carspace_keys = zip(*(data.loc[carspace_missing, col] for col in carspace_key_cols))
carspace_fill = pd.Series(
    [carspace_lookup.get(key, np.nan) for key in carspace_keys],  # NaN when no lookup entry exists
    index=data.index[carspace_missing],
    dtype='float64',
)
# fillna aligns on the index, so only the originally-missing rows are touched;
# rows without a matching lookup entry stay NaN
data['carspaces'] = data['carspaces'].fillna(carspace_fill)

In [121]:
# Re-count nulls per column to see how many carspaces values are still missing
null_counts = data.isnull().sum()
missing_list = [(name, null_counts[name]) for name in data.columns]
print(missing_list)

[('listing_id', np.int64(0)), ('suburb', np.int64(0)), ('postcode', np.int64(0)), ('weekly_rent', np.int64(0)), ('bond', np.int64(779)), ('available_date', np.int64(130)), ('date_listed', np.int64(4)), ('days_listed', np.int64(4)), ('bedrooms', np.int64(108)), ('bathrooms', np.int64(38)), ('carspaces', np.int64(458)), ('property_type', np.int64(0)), ('address', np.int64(104)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('photo_count', np.int64(4)), ('video_count', np.int64(4)), ('floorplans_count', np.int64(4)), ('virtual_tour', np.int64(4)), ('primary_type', np.int64(4)), ('secondary_type', np.int64(4)), ('agency', np.int64(5)), ('agent_names', np.int64(244)), ('land_area', np.int64(12696)), ('num_metro_bus_stops', np.int64(0)), ('num_metro_tram_stops', np.int64(0)), ('num_metro_train_stops', np.int64(0)), ('num_regional_bus_stops', np.int64(0)), ('num_regional_train_stops', np.int64(0)), ('num_schools_2km', np.int64(0)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_m

In [122]:
# Rebuild the carspaces lookup with relaxed constraints (suburb and bathrooms dropped
# from the key) so the remaining gaps can be filled:
# (property_type, bedrooms) -> mean carspaces, rounded to 0 decimals
relaxed_carspace_cols = ['property_type', 'bedrooms']
mean_carspaces_relaxed = data.groupby(relaxed_carspace_cols)['carspaces'].mean()
carspace_lookup = mean_carspaces_relaxed.round(0).to_dict()

In [123]:
#Impute missing carspaces values from the (property_type, bedrooms) carspace_lookup
# Replaces the iterrows()/.at loop: pandas warns against modifying a frame while
# iterating over it, and a Python-level row loop is slow on large frames.
carspace_key_cols = ['property_type', 'bedrooms']
carspace_missing = data['carspaces'].isnull()
# One key tuple per row that needs a value, in the same order as the masked index
carspace_keys = zip(*(data.loc[carspace_missing, col] for col in carspace_key_cols))
carspace_fill = pd.Series(
    [carspace_lookup.get(key, np.nan) for key in carspace_keys],  # NaN when no lookup entry exists
    index=data.index[carspace_missing],
    dtype='float64',
)
# fillna aligns on the index, so only the originally-missing rows are touched;
# rows without a matching lookup entry stay NaN
data['carspaces'] = data['carspaces'].fillna(carspace_fill)

In [124]:
# Re-count nulls per column after the relaxed carspaces imputation
null_counts = data.isnull().sum()
missing_list = [(name, null_counts[name]) for name in data.columns]
print(missing_list)

[('listing_id', np.int64(0)), ('suburb', np.int64(0)), ('postcode', np.int64(0)), ('weekly_rent', np.int64(0)), ('bond', np.int64(779)), ('available_date', np.int64(130)), ('date_listed', np.int64(4)), ('days_listed', np.int64(4)), ('bedrooms', np.int64(108)), ('bathrooms', np.int64(38)), ('carspaces', np.int64(73)), ('property_type', np.int64(0)), ('address', np.int64(104)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('photo_count', np.int64(4)), ('video_count', np.int64(4)), ('floorplans_count', np.int64(4)), ('virtual_tour', np.int64(4)), ('primary_type', np.int64(4)), ('secondary_type', np.int64(4)), ('agency', np.int64(5)), ('agent_names', np.int64(244)), ('land_area', np.int64(12696)), ('num_metro_bus_stops', np.int64(0)), ('num_metro_tram_stops', np.int64(0)), ('num_metro_train_stops', np.int64(0)), ('num_regional_bus_stops', np.int64(0)), ('num_regional_train_stops', np.int64(0)), ('num_schools_2km', np.int64(0)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_mo

In [125]:
# Drop the rows whose carspaces is still null after both imputation passes
# (the null count printed by the previous cell showed 73 such rows in this run)
data = data.dropna(subset=['carspaces'])