In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../data/processed/real_estate/vic_rentals_all_enriched.csv')
data.shape

(12331, 57)

In [3]:
data = data.drop(columns=["listing_id", "date_listed", "address", "photo_count", "video_count",
                          "floorplans_count", "virtual_tour", "primary_type", "secondary_type",
                          "agent_names",])
data.shape

(12331, 47)

#### Find how many null values per feature

In [4]:
def find_nans(data):
    missing_list = [(col, data[col].isnull().sum()) for col in data.columns]
    non_nans = [(col, cnt) for col, cnt in missing_list if cnt != 0]
    return sorted(non_nans, key=lambda x: x[1], reverse=True)  # sort by column name
print(find_nans(data))

[('land_area', np.int64(12329)), ('carspaces', np.int64(1771)), ('bond', np.int64(763)), ('weekly_rent', np.int64(270)), ('available_date', np.int64(133)), ('bedrooms', np.int64(124)), ('bathrooms', np.int64(51)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_le

#### Drop rows with small number of missing values

In [5]:
# data = data.dropna(subset=['date_listed', 'lat', 'lon', 'bedrooms', 'bathrooms'])

### Imputation

#### Average rent

In [6]:
#Impute average rent for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average weekly rent
rent_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['weekly_rent']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [7]:
rent_lookup

{('ABBOTSFORD', 'Apartment / Unit / Flat', 1.0, 1.0): 552.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 2.0, 1.0): 714.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 2.0, 2.0): 692.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 3.0, 2.0): 958.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 4.0, 4.0): 1225.0,
 ('ABBOTSFORD', 'House', 2.0, 1.0): 775.0,
 ('ABBOTSFORD', 'House', 3.0, 1.0): 870.0,
 ('ABBOTSFORD', 'Townhouse', 2.0, 1.0): 650.0,
 ('ABBOTSFORD', 'Townhouse', 3.0, 2.0): 935.0,
 ('ABERFELDIE', 'Apartment / Unit / Flat', 1.0, 1.0): 420.0,
 ('ABERFELDIE', 'Apartment / Unit / Flat', 2.0, 1.0): 430.0,
 ('ABERFELDIE', 'House', 2.0, 1.0): 550.0,
 ('ABERFELDIE', 'Townhouse', 2.0, 2.0): 580.0,
 ('ABERFELDIE', 'Townhouse', 4.0, 3.0): 1000.0,
 ('AIRPORT WEST', 'Apartment / Unit / Flat', 2.0, 1.0): 458.0,
 ('AIRPORT WEST', 'Apartment / Unit / Flat', 3.0, 1.0): 580.0,
 ('AIRPORT WEST', 'House', 2.0, 1.0): 530.0,
 ('AIRPORT WEST', 'House', 3.0, 1.0): 591.0,
 ('AIRPORT WEST', 'House', 3.0, 2.0): 

In [8]:
#Impute missing weekly_rent values
for idx, row in data.iterrows():
    if pd.isnull(row['weekly_rent']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in rent_lookup:  # only impute if lookup exists
            data.at[idx, 'weekly_rent'] = rent_lookup[key]

In [9]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12329)), ('carspaces', np.int64(1771)), ('bond', np.int64(763)), ('available_date', np.int64(133)), ('bedrooms', np.int64(124)), ('weekly_rent', np.int64(73)), ('bathrooms', np.int64(51)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_lev

In [10]:
#Impute average rent with relaxed contraints (without bathrooms and suburb) to fill rest of missing values
#Create a lookup dictionary for average weekly rent
rent_lookup = (
    data.groupby(['property_type', 'bedrooms'])['weekly_rent']
    .mean()
    .round(0)
    .to_dict()
)

In [11]:
#Impute missing weekly_rent values
for idx, row in data.iterrows():
    if pd.isnull(row['weekly_rent']):
        key = (row['property_type'], row['bedrooms'])
        if key in rent_lookup:  # only impute if lookup exists
            data.at[idx, 'weekly_rent'] = rent_lookup[key]

In [12]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12329)), ('carspaces', np.int64(1771)), ('bond', np.int64(763)), ('available_date', np.int64(133)), ('bedrooms', np.int64(124)), ('bathrooms', np.int64(51)), ('weekly_rent', np.int64(18)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_lev

In [13]:
#Drop any remaining rows with missing weekly_rent values
data = data.dropna(subset=['weekly_rent'])

#### Imputing carspaces

In [14]:
#Impute average carspaces for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average carspaces
carspace_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['carspaces']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [15]:
#Impute missing carspaces values
for idx, row in data.iterrows():
    if pd.isnull(row['carspaces']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in carspace_lookup:  # only impute if lookup exists
            data.at[idx, 'carspaces'] = carspace_lookup[key]

In [16]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12311)), ('bond', np.int64(760)), ('carspaces', np.int64(451)), ('available_date', np.int64(127)), ('bedrooms', np.int64(108)), ('bathrooms', np.int64(38)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_level (%)', np.int64(4)), ('Total_p

In [17]:
#Impute average carspaces with relaxed contraints (without bathrooms) to fill rest of missing values
# Create a lookup dictionary for average carspaces
carspace_lookup = (
    data.groupby(['property_type', 'bedrooms'])['carspaces']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [18]:
#Impute missing carspaces values
for idx, row in data.iterrows():
    if pd.isnull(row['carspaces']):
        key = ( row['property_type'], row['bedrooms'])
        if key in carspace_lookup:  # only impute if lookup exists
            data.at[idx, 'carspaces'] = carspace_lookup[key]

In [19]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12311)), ('bond', np.int64(760)), ('available_date', np.int64(127)), ('bedrooms', np.int64(108)), ('carspaces', np.int64(73)), ('bathrooms', np.int64(38)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_level (%)', np.int64(4)), ('Total_pe

In [20]:
#Drop any remaining rows with missing carspaces values
data = data.dropna(subset=['carspaces'])

#### Imputing bonds

In [21]:
#Impute average bond for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average bond
bond_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['bond']
    .mean()
    .round(0)
    .to_dict()
)

#Impute missing bond values
for idx, row in data.iterrows():
    if pd.isnull(row['bond']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in bond_lookup:  # only impute if lookup exists
            data.at[idx, 'bond'] = bond_lookup[key]

In [22]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12238)), ('available_date', np.int64(127)), ('bond', np.int64(126)), ('bedrooms', np.int64(45)), ('bathrooms', np.int64(26)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_level (%)', np.int64(4)), ('Total_persons', np.int64(4))]


In [23]:
#Impute average bond with relaxed contraints (without bathrooms) to fill rest of missing values
# Create a lookup dictionary for average bond
bond_lookup = (
    data.groupby(['property_type', 'bedrooms'])['bond']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

#Impute missing bond values
for idx, row in data.iterrows():
    if pd.isnull(row['bond']):
        key = (row['property_type'], row['bedrooms'])
        if key in bond_lookup:  # only impute if lookup exists
            data.at[idx, 'bond'] = bond_lookup[key]

In [24]:
#Check for missing values again
print(find_nans(data))

[('land_area', np.int64(12238)), ('available_date', np.int64(127)), ('bedrooms', np.int64(45)), ('bathrooms', np.int64(26)), ('bond', np.int64(6)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_level (%)', np.int64(4)), ('Total_persons', np.int64(4))]


#### Redundant columns

In [25]:
data[data["SAL_NAME21"].isna()]

Unnamed: 0,suburb,postcode,weekly_rent,bond,available_date,days_listed,bedrooms,bathrooms,carspaces,property_type,...,Certificate_level (%),Total_persons,Population-2023,SAL_NAME21,incidents_recorded,rate_per_100000_population,population_est,crime_per_person,crime_index,crime_rank


In [26]:
data = data.drop(columns=["land_area", "SAL_NAME21", 'suburb', 'bond'])
print(find_nans(data))

[('available_date', np.int64(127)), ('bedrooms', np.int64(45)), ('bathrooms', np.int64(26)), ('agency', np.int64(5)), ('days_listed', np.int64(4)), ('lat', np.int64(4)), ('lon', np.int64(4)), ('Median_age_persons', np.int64(4)), ('Median_mortgage_repay_monthly', np.int64(4)), ('Median_tot_prsnl_inc_weekly', np.int64(4)), ('Median_rent_weekly', np.int64(4)), ('Median_tot_fam_inc_weekly', np.int64(4)), ('Average_num_psns_per_bedroom', np.int64(4)), ('Median_tot_hhd_inc_weekly', np.int64(4)), ('Average_household_size', np.int64(4)), ('Owner occupied (%)', np.int64(4)), ('Mortgage (%)', np.int64(4)), ('Total rented (%)', np.int64(4)), ('Other tenure (%)', np.int64(4)), ('Unemployment', np.int64(4)), ('post_gradutae (%)', np.int64(4)), ('Graduate_diploma_certificate(%)', np.int64(4)), ('Bachelor (%)', np.int64(4)), ('Advanced_&_Diploma (%)', np.int64(4)), ('Certificate_level (%)', np.int64(4)), ('Total_persons', np.int64(4))]


In [27]:
#Shape of data after imputation
data.shape

(12240, 43)

In [28]:
#Drop any remaining rows with missing values
data = data.dropna()

In [29]:
data.shape

(12059, 43)

#### Outlier preprocess

In [30]:
#Looking at numerical variables
data[['weekly_rent', 'bedrooms', 'bathrooms', 'carspaces', 'num_metro_bus_stops', 'num_metro_tram_stops', 'num_schools_2km', 'incidents_recorded']].describe()

Unnamed: 0,weekly_rent,bedrooms,bathrooms,carspaces,num_metro_bus_stops,num_metro_tram_stops,num_schools_2km,incidents_recorded
count,12059.0,12059.0,12059.0,12059.0,12059.0,12059.0,12059.0,12059.0
mean,744.445642,2.727589,1.590513,1.628742,61.783813,20.992205,8.068331,13261.477726
std,9138.23482,1.16659,0.637208,0.952726,43.211624,35.03203,4.798146,5826.993856
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,77.0
25%,490.0,2.0,1.0,1.0,22.0,0.0,4.0,9525.0
50%,560.0,3.0,2.0,1.0,66.0,0.0,8.0,13140.5
75%,685.0,4.0,2.0,2.0,96.0,35.0,12.0,17495.333333
max,808500.0,50.0,12.0,22.0,183.0,127.0,23.0,34620.0


In [31]:
#Find how many 0 weekly_rent values there are
zero_rent_count = (data["weekly_rent"] == 0).sum()
print("Zero rent count:", zero_rent_count)

#Find how many high outlier weekly_rent values there are i.e above 3000
highoutlier_rent_count = (data["weekly_rent"] >= 3000).sum()
print("High outlier rent count:", highoutlier_rent_count)

#Find how many data points with 50 or more bedrooms
high_bedroom_count = (data["bedrooms"] >= 50).sum()
print("High bedroom count:", high_bedroom_count)

Zero rent count: 16
High outlier rent count: 27
High bedroom count: 1


In [32]:
#Remove outliers rows
data = data[(data["weekly_rent"] > 0) & (data["weekly_rent"] <= 3000) & (data["bedrooms"] < 50)]
#Looking at numerical variables
data[['weekly_rent', 'bedrooms', 'bathrooms', 'carspaces', 'num_metro_bus_stops', 'num_metro_tram_stops', 'num_schools_2km', 'incidents_recorded']].describe()

Unnamed: 0,weekly_rent,bedrooms,bathrooms,carspaces,num_metro_bus_stops,num_metro_tram_stops,num_schools_2km,incidents_recorded
count,12018.0,12018.0,12018.0,12018.0,12018.0,12018.0,12018.0,12018.0
mean,621.903977,2.720835,1.587203,1.626061,61.83791,20.94891,8.061075,13259.928384
std,249.176926,1.081993,0.629443,0.946937,43.221829,35.017988,4.798088,5828.544925
min,33.0,1.0,1.0,1.0,0.0,0.0,0.0,77.0
25%,490.0,2.0,1.0,1.0,22.0,0.0,4.0,9525.0
50%,560.0,3.0,2.0,1.0,66.0,0.0,8.0,13140.5
75%,683.0,4.0,2.0,2.0,96.0,35.0,12.0,17495.333333
max,3000.0,11.0,12.0,22.0,183.0,127.0,23.0,34620.0


#### Feature Engineering time data

In [33]:
data['available_date'] = pd.to_datetime(data['available_date'], errors='coerce')
data['available_day'] = data['available_date'].dt.day
data['available_month'] = data['available_date'].dt.month   
data['available_year'] = data['available_date'].dt.year
data = data.drop(columns=['available_date'])

#### Save Cleaned data to folder

In [34]:
data.shape

(12018, 45)

In [35]:
# Use this version of data for EDA as lat and long will be removed for modeling
data.columns

Index(['postcode', 'weekly_rent', 'days_listed', 'bedrooms', 'bathrooms',
       'carspaces', 'property_type', 'lat', 'lon', 'agency',
       'num_metro_bus_stops', 'num_metro_tram_stops', 'num_metro_train_stops',
       'num_regional_bus_stops', 'num_regional_train_stops', 'num_schools_2km',
       'Median_age_persons', 'Median_mortgage_repay_monthly',
       'Median_tot_prsnl_inc_weekly', 'Median_rent_weekly',
       'Median_tot_fam_inc_weekly', 'Average_num_psns_per_bedroom',
       'Median_tot_hhd_inc_weekly', 'Average_household_size',
       'Owner occupied (%)', 'Mortgage (%)', 'Total rented (%)',
       'Other tenure (%)', 'Unemployment', 'post_gradutae (%)',
       'Graduate_diploma_certificate(%)', 'Bachelor (%)',
       'Advanced_&_Diploma (%)', 'Certificate_level (%)', 'Total_persons',
       'Population-2023', 'incidents_recorded', 'rate_per_100000_population',
       'population_est', 'crime_per_person', 'crime_index', 'crime_rank',
       'available_day', 'available_month

In [36]:
data.to_csv("../data/curated/cleaned_real_estate_data.csv", index=False)

#### Encoding

In [37]:
# Encode month cyclically 
data['month_sin'] = np.sin(data['available_month'] / 12 * 2 * np.pi)
data['month_cos'] = np.cos(data['available_month'] / 12 * 2 * np.pi)
data = data.drop(columns=['available_month'])

#Encode day cyclically
data['day_sin'] = np.sin(data['available_day'] / 31 * 2 * np.pi)
data['day_cos'] = np.cos(data['available_day'] / 31 * 2 * np.pi)
data = data.drop(columns=['available_day'])

#Frequency encoding for Non-numericeal columns
post_freq = data['postcode'].value_counts(normalize=True)
data['postcode'] = data['postcode'].map(post_freq)
property_freq = data['property_type'].value_counts(normalize=True)
data['property_type'] = data['property_type'].map(property_freq)
agency_freq = data['agency'].value_counts(normalize=True)
data['agency'] = data['agency'].map(agency_freq)

#Drop lat and long for modeling
data = data.drop(columns=['lat', 'lon'])

#### Modeling

In [38]:
#Train/test split based on available_year
data = data[data['available_year'] == 2025]

y = data['weekly_rent']
X = data.drop(columns=['weekly_rent'])

#### Normalization/Scaling

In [39]:
#Standardize scalar, resacling all data. (can target specific columns if needed)
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [40]:
gxboost = GradientBoostingClassifier()
gxboost.fit(X,y)

rf_model = RandomForestRegressor()
rf_model.fit(X, y)

KeyboardInterrupt: 

##### Feature importance

In [None]:
# Random Forest feature importances
importances = rf_model.feature_importances_
feature_names = X.columns

rf_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

rf_importance_df

NameError: name 'rf_model' is not defined

In [None]:
# GX Boost feature importances
importances = gxboost.feature_importances_
feature_names = X.columns

gxboost_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

gxboost_importance_df

Unnamed: 0,feature,importance
1,bond,0.132008
8,num_metro_bus_stops,0.05741
2,days_listed,0.048011
36,population_est,0.043219
13,num_schools_2km,0.042732
43,day_sin,0.037162
6,property_type,0.036338
9,num_metro_tram_stops,0.03198
33,Population-2023,0.031719
7,agency,0.02845


##### Corrrelation Analysis

In [None]:
# correlation matrix
corr_matrix = data.corr(numeric_only=True)

print(corr_matrix)