In [50]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error, r2_score

In [51]:
data = pd.read_csv('../../data/processed/real_estate/vic_rentals_all_enriched.csv')
data.shape

(12331, 57)

In [52]:
data = data.drop(columns=["listing_id", "date_listed", "address", "photo_count", "video_count",
                          "floorplans_count", "virtual_tour", "primary_type", "secondary_type",
                          "agent_names",])
data.shape

(12331, 47)

#### Find how many null values per feature

In [53]:
def find_nans(data):
    missing_list = [(col, data[col].isnull().sum()) for col in data.columns]
    non_nans = [(col, cnt) for col, cnt in missing_list if cnt != 0]
    return sorted(non_nans, key=lambda x: x[1], reverse=True)  # sort by column name
print(find_nans(data))

[('land_area', 12329), ('carspaces', 1771), ('bond', 763), ('weekly_rent', 270), ('available_date', 133), ('bedrooms', 124), ('bathrooms', 51), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


### Imputation

#### Average rent

In [54]:
#Impute average rent for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average weekly rent
rent_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['weekly_rent']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [55]:
rent_lookup

{('ABBOTSFORD', 'Apartment / Unit / Flat', 1.0, 1.0): 552.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 2.0, 1.0): 714.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 2.0, 2.0): 692.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 3.0, 2.0): 958.0,
 ('ABBOTSFORD', 'Apartment / Unit / Flat', 4.0, 4.0): 1225.0,
 ('ABBOTSFORD', 'House', 2.0, 1.0): 775.0,
 ('ABBOTSFORD', 'House', 3.0, 1.0): 870.0,
 ('ABBOTSFORD', 'Townhouse', 2.0, 1.0): 650.0,
 ('ABBOTSFORD', 'Townhouse', 3.0, 2.0): 935.0,
 ('ABERFELDIE', 'Apartment / Unit / Flat', 1.0, 1.0): 420.0,
 ('ABERFELDIE', 'Apartment / Unit / Flat', 2.0, 1.0): 430.0,
 ('ABERFELDIE', 'House', 2.0, 1.0): 550.0,
 ('ABERFELDIE', 'Townhouse', 2.0, 2.0): 580.0,
 ('ABERFELDIE', 'Townhouse', 4.0, 3.0): 1000.0,
 ('AIRPORT WEST', 'Apartment / Unit / Flat', 2.0, 1.0): 458.0,
 ('AIRPORT WEST', 'Apartment / Unit / Flat', 3.0, 1.0): 580.0,
 ('AIRPORT WEST', 'House', 2.0, 1.0): 530.0,
 ('AIRPORT WEST', 'House', 3.0, 1.0): 591.0,
 ('AIRPORT WEST', 'House', 3.0, 2.0): 

In [56]:
#Impute missing weekly_rent values
for idx, row in data.iterrows():
    if pd.isnull(row['weekly_rent']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in rent_lookup:  # only impute if lookup exists
            data.at[idx, 'weekly_rent'] = rent_lookup[key]

In [57]:
#Check for missing values again
print(find_nans(data))

[('land_area', 12329), ('carspaces', 1771), ('bond', 763), ('available_date', 133), ('bedrooms', 124), ('weekly_rent', 73), ('bathrooms', 51), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


In [58]:
#Impute average rent with relaxed contraints (without bathrooms and suburb) to fill rest of missing values
#Create a lookup dictionary for average weekly rent
rent_lookup = (
    data.groupby(['property_type', 'bedrooms'])['weekly_rent']
    .mean()
    .round(0)
    .to_dict()
)

In [59]:
#Impute missing weekly_rent values
for idx, row in data.iterrows():
    if pd.isnull(row['weekly_rent']):
        key = (row['property_type'], row['bedrooms'])
        if key in rent_lookup:  # only impute if lookup exists
            data.at[idx, 'weekly_rent'] = rent_lookup[key]

In [60]:
#Check for missing values again
print(find_nans(data))

[('land_area', 12329), ('carspaces', 1771), ('bond', 763), ('available_date', 133), ('bedrooms', 124), ('bathrooms', 51), ('weekly_rent', 18), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


In [61]:
#Drop any remaining rows with missing weekly_rent values
data = data.dropna(subset=['weekly_rent'])

#### Imputing carspaces

In [62]:
#Impute average carspaces for each combination of suburb, property_type, bedrooms, and bathrooms
# Create a lookup dictionary for average carspaces
carspace_lookup = (
    data.groupby(['suburb', 'property_type', 'bedrooms', 'bathrooms'])['carspaces']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [63]:
#Impute missing carspaces values
for idx, row in data.iterrows():
    if pd.isnull(row['carspaces']):
        key = (row['suburb'], row['property_type'], row['bedrooms'], row['bathrooms'])
        if key in carspace_lookup:  # only impute if lookup exists
            data.at[idx, 'carspaces'] = carspace_lookup[key]

In [64]:
#Check for missing values again
print(find_nans(data))

[('land_area', 12311), ('bond', 760), ('carspaces', 451), ('available_date', 127), ('bedrooms', 108), ('bathrooms', 38), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


In [65]:
#Impute average carspaces with relaxed contraints (without bathrooms) to fill rest of missing values
# Create a lookup dictionary for average carspaces
carspace_lookup = (
    data.groupby(['property_type', 'bedrooms'])['carspaces']
    .mean()
    .round(0)   # optional: round to 0 decimals
    .to_dict()
)

In [66]:
#Impute missing carspaces values
for idx, row in data.iterrows():
    if pd.isnull(row['carspaces']):
        key = ( row['property_type'], row['bedrooms'])
        if key in carspace_lookup:  # only impute if lookup exists
            data.at[idx, 'carspaces'] = carspace_lookup[key]

In [67]:
#Check for missing values again
print(find_nans(data))

[('land_area', 12311), ('bond', 760), ('available_date', 127), ('bedrooms', 108), ('carspaces', 73), ('bathrooms', 38), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


In [68]:
#Drop any remaining rows with missing carspaces values
data = data.dropna(subset=['carspaces'])

In [69]:
#Check for missing values again
print(find_nans(data))

[('land_area', 12238), ('bond', 745), ('available_date', 127), ('bedrooms', 45), ('bathrooms', 26), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


#### Redundant columns

In [70]:
data[data["SAL_NAME21"].isna()]

Unnamed: 0,suburb,postcode,weekly_rent,bond,available_date,days_listed,bedrooms,bathrooms,carspaces,property_type,...,Certificate_level (%),Total_persons,Population-2023,SAL_NAME21,incidents_recorded,rate_per_100000_population,population_est,crime_per_person,crime_index,crime_rank


In [71]:
data = data.drop(columns=["land_area", "SAL_NAME21", 'suburb', 'bond'])
print(find_nans(data))

[('available_date', 127), ('bedrooms', 45), ('bathrooms', 26), ('agency', 5), ('days_listed', 4), ('lat', 4), ('lon', 4), ('Median_age_persons', 4), ('Median_mortgage_repay_monthly', 4), ('Median_tot_prsnl_inc_weekly', 4), ('Median_rent_weekly', 4), ('Median_tot_fam_inc_weekly', 4), ('Average_num_psns_per_bedroom', 4), ('Median_tot_hhd_inc_weekly', 4), ('Average_household_size', 4), ('Owner occupied (%)', 4), ('Mortgage (%)', 4), ('Total rented (%)', 4), ('Other tenure (%)', 4), ('Unemployment', 4), ('post_gradutae (%)', 4), ('Graduate_diploma_certificate(%)', 4), ('Bachelor (%)', 4), ('Advanced_&_Diploma (%)', 4), ('Certificate_level (%)', 4), ('Total_persons', 4)]


In [72]:
#Shape of data after imputation
data.shape

(12240, 43)

In [73]:
#Drop any remaining rows with missing values
data = data.dropna()

In [74]:
data.shape

(12059, 43)

#### Outlier preprocess

In [75]:
#Looking at numerical variables
data[['weekly_rent', 'bedrooms', 'bathrooms', 'carspaces', 'num_metro_bus_stops', 'num_metro_tram_stops', 'num_schools_2km', 'incidents_recorded']].describe()

Unnamed: 0,weekly_rent,bedrooms,bathrooms,carspaces,num_metro_bus_stops,num_metro_tram_stops,num_schools_2km,incidents_recorded
count,12059.0,12059.0,12059.0,12059.0,12059.0,12059.0,12059.0,12059.0
mean,744.445642,2.727589,1.590513,1.628742,61.783813,20.992205,8.068331,13261.477726
std,9138.23482,1.16659,0.637208,0.952726,43.211624,35.03203,4.798146,5826.993856
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0,77.0
25%,490.0,2.0,1.0,1.0,22.0,0.0,4.0,9525.0
50%,560.0,3.0,2.0,1.0,66.0,0.0,8.0,13140.5
75%,685.0,4.0,2.0,2.0,96.0,35.0,12.0,17495.333333
max,808500.0,50.0,12.0,22.0,183.0,127.0,23.0,34620.0


In [76]:
#Find how many 0 weekly_rent values there are
zero_rent_count = (data["weekly_rent"] == 0).sum()
print("Zero rent count:", zero_rent_count)

#Find how many high outlier weekly_rent values there are i.e above 3000
highoutlier_rent_count = (data["weekly_rent"] >= 3000).sum()
print("High outlier rent count:", highoutlier_rent_count)

#Find how many data points with 50 or more bedrooms
high_bedroom_count = (data["bedrooms"] >= 50).sum()
print("High bedroom count:", high_bedroom_count)

Zero rent count: 16
High outlier rent count: 27
High bedroom count: 1


In [77]:
#Remove outliers rows
data = data[(data["weekly_rent"] > 0) & (data["weekly_rent"] <= 3000) & (data["bedrooms"] < 50)]
#Looking at numerical variables
data[['weekly_rent', 'bedrooms', 'bathrooms', 'carspaces', 'num_metro_bus_stops', 'num_metro_tram_stops', 'num_schools_2km', 'incidents_recorded']].describe()

Unnamed: 0,weekly_rent,bedrooms,bathrooms,carspaces,num_metro_bus_stops,num_metro_tram_stops,num_schools_2km,incidents_recorded
count,12018.0,12018.0,12018.0,12018.0,12018.0,12018.0,12018.0,12018.0
mean,621.903977,2.720835,1.587203,1.626061,61.83791,20.94891,8.061075,13259.928384
std,249.176926,1.081993,0.629443,0.946937,43.221829,35.017988,4.798088,5828.544925
min,33.0,1.0,1.0,1.0,0.0,0.0,0.0,77.0
25%,490.0,2.0,1.0,1.0,22.0,0.0,4.0,9525.0
50%,560.0,3.0,2.0,1.0,66.0,0.0,8.0,13140.5
75%,683.0,4.0,2.0,2.0,96.0,35.0,12.0,17495.333333
max,3000.0,11.0,12.0,22.0,183.0,127.0,23.0,34620.0


#### Feature Engineering time data

In [78]:
data['available_date'] = pd.to_datetime(data['available_date'], errors='coerce')
data['available_day'] = data['available_date'].dt.day
data['available_month'] = data['available_date'].dt.month   
data['available_year'] = data['available_date'].dt.year
data = data.drop(columns=['available_date'])

#### Save Cleaned data to folder

In [79]:
data.shape

(12018, 45)

In [80]:
# Use this version of data for EDA as lat and long will be removed for modeling
data.columns

Index(['postcode', 'weekly_rent', 'days_listed', 'bedrooms', 'bathrooms',
       'carspaces', 'property_type', 'lat', 'lon', 'agency',
       'num_metro_bus_stops', 'num_metro_tram_stops', 'num_metro_train_stops',
       'num_regional_bus_stops', 'num_regional_train_stops', 'num_schools_2km',
       'Median_age_persons', 'Median_mortgage_repay_monthly',
       'Median_tot_prsnl_inc_weekly', 'Median_rent_weekly',
       'Median_tot_fam_inc_weekly', 'Average_num_psns_per_bedroom',
       'Median_tot_hhd_inc_weekly', 'Average_household_size',
       'Owner occupied (%)', 'Mortgage (%)', 'Total rented (%)',
       'Other tenure (%)', 'Unemployment', 'post_gradutae (%)',
       'Graduate_diploma_certificate(%)', 'Bachelor (%)',
       'Advanced_&_Diploma (%)', 'Certificate_level (%)', 'Total_persons',
       'Population-2023', 'incidents_recorded', 'rate_per_100000_population',
       'population_est', 'crime_per_person', 'crime_index', 'crime_rank',
       'available_day', 'available_month

In [81]:
# data.to_csv("../../data/curated/cleaned_real_estate_data.csv", index=False)

#### Encoding

In [82]:
# Encode month cyclically 
data['month_sin'] = np.sin(data['available_month'] / 12 * 2 * np.pi)
data['month_cos'] = np.cos(data['available_month'] / 12 * 2 * np.pi)
data = data.drop(columns=['available_month'])

#Encode day cyclically
data['day_sin'] = np.sin(data['available_day'] / 31 * 2 * np.pi)
data['day_cos'] = np.cos(data['available_day'] / 31 * 2 * np.pi)
data = data.drop(columns=['available_day'])

#Frequency encoding for Non-numericeal columns
post_freq = data['postcode'].value_counts(normalize=True)
data['postcode'] = data['postcode'].map(post_freq)
property_freq = data['property_type'].value_counts(normalize=True)
data['property_type'] = data['property_type'].map(property_freq)
agency_freq = data['agency'].value_counts(normalize=True)
data['agency'] = data['agency'].map(agency_freq)

#Drop lat and long for modeling
data = data.drop(columns=['lat', 'lon', 'Median_rent_weekly', 'Median_mortgage_repay_monthly'])

#### Modeling

In [83]:
#Train/test split based on available_year
data = data[data['available_year'] == 2025]

y = data['weekly_rent']
X = data.drop(columns=['weekly_rent'])


##### Feature Selection

In [84]:
# Compute MI
mi = mutual_info_regression(X, y, discrete_features="auto", random_state=0)
mi_scores = pd.Series(mi, index=X.columns).sort_values(ascending=False)

k = 20
selected_features = mi_scores.head(k).index.tolist()

X_selection = X[selected_features]

In [85]:
X_selection

Unnamed: 0,Bachelor (%),Median_tot_fam_inc_weekly,Median_tot_hhd_inc_weekly,population_est,Certificate_level (%),incidents_recorded,post_gradutae (%),Graduate_diploma_certificate(%),Advanced_&_Diploma (%),Population-2023,Median_tot_prsnl_inc_weekly,Unemployment,crime_index,rate_per_100000_population,crime_per_person,crime_rank,Owner occupied (%),Mortgage (%),Total rented (%),Total_persons
0,0.382874,3024.0,2192.0,152460.693763,0.158769,16262.250000,0.168135,0.065559,0.119857,13408,1080.0,0.046495,1.813199,10220.714480,0.102207,137.750000,0.270928,0.327294,0.379467,8969.0
3,0.393366,2926.0,2111.0,138420.725117,0.153807,10143.000000,0.170695,0.073617,0.134493,17203,1113.0,0.038438,1.365466,7696.915003,0.076969,183.000000,0.299913,0.289420,0.385893,9889.0
38,0.458828,3113.0,2098.0,141511.437067,0.110812,19274.333333,0.164398,0.050746,0.118529,20268,1343.0,0.036782,2.245682,12658.554200,0.126586,25.333333,0.189694,0.237092,0.549360,13735.0
39,0.352016,1630.0,1278.0,190296.130450,0.093904,14568.500000,0.247296,0.020813,0.111111,14178,503.0,0.123011,1.401696,7901.140204,0.079011,189.500000,0.151699,0.176589,0.638076,6102.0
45,0.365364,1914.0,1441.0,185521.681300,0.118189,9282.000000,0.192183,0.043979,0.146189,25286,682.0,0.069917,0.887587,5003.188811,0.050032,467.000000,0.241095,0.238566,0.473175,13893.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12326,0.168563,1685.0,1477.0,417618.665300,0.396929,21416.000000,0.059695,0.023620,0.170067,13530,707.0,0.066765,0.909751,5128.123281,0.051281,439.000000,0.230983,0.404359,0.338007,9314.0
12327,0.391513,2475.0,1718.0,194580.599700,0.087128,34620.000000,0.245423,0.048634,0.092480,18017,953.0,0.070373,3.156398,17792.112910,0.177921,2.000000,0.130270,0.198541,0.639825,10651.0
12328,0.313112,1470.0,1401.0,190296.130450,0.199609,14568.500000,0.121439,0.024136,0.162318,23509,558.0,0.081065,1.401696,7901.140204,0.079011,189.500000,0.316660,0.229401,0.393775,9198.0
12329,0.391513,2475.0,1718.0,194580.599700,0.087128,34620.000000,0.245423,0.048634,0.092480,18017,953.0,0.070373,3.156398,17792.112910,0.177921,2.000000,0.130270,0.198541,0.639825,10651.0


In [86]:
#Split test/train
X_train, X_test, y_train, y_test = train_test_split(X_selection, y, test_size=0.2, random_state=42)

#### Normalization/Scaling

In [87]:
#Standardize scalar, resacling all data. (can target specific columns if needed)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [88]:
xgboost = GradientBoostingRegressor(random_state=42)
xgboost.fit(X_train, y_train)

In [89]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

#### Model performance Metrics

In [90]:
# Gradient Boosting
y_pred_gb = xgboost.predict(X_test)
gb_r2 = r2_score(y_test, y_pred_gb)
gb_mae = mean_absolute_error(y_test, y_pred_gb)

# Random Forest
y_pred_rf = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mae = mean_absolute_error(y_test, y_pred_rf)

print("GradientBoosting -> R^2:", gb_r2, " MAE:", gb_mae)
print("RandomForest     -> R^2:", rf_r2, " MAE:", rf_mae)


GradientBoosting -> R^2: 0.2645192217151513  MAE: 126.02714754576765
RandomForest     -> R^2: 0.25075189877351656  MAE: 127.14904897145918


##### Feature importance

In [91]:
# Random Forest feature importances
importances = rf_model.feature_importances_
feature_names = X_selection.columns

rf_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

rf_importance_df

Unnamed: 0,feature,importance
0,Bachelor (%),0.332057
1,Median_tot_fam_inc_weekly,0.160335
2,Median_tot_hhd_inc_weekly,0.067359
16,Owner occupied (%),0.050626
3,population_est,0.042061
9,Population-2023,0.040331
8,Advanced_&_Diploma (%),0.03834
7,Graduate_diploma_certificate(%),0.037158
5,incidents_recorded,0.027864
11,Unemployment,0.027025


In [92]:
# XG Boost feature importances
importances = xgboost.feature_importances_
feature_names = X_selection.columns

xgboost_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

xgboost_importance_df

Unnamed: 0,feature,importance
0,Bachelor (%),0.33982
1,Median_tot_fam_inc_weekly,0.171085
16,Owner occupied (%),0.074216
4,Certificate_level (%),0.060064
3,population_est,0.050871
2,Median_tot_hhd_inc_weekly,0.044413
9,Population-2023,0.037434
8,Advanced_&_Diploma (%),0.034573
7,Graduate_diploma_certificate(%),0.030458
6,post_gradutae (%),0.028056


#### Results without feature selection

In [93]:
#Split test/train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Standardize scalar, resacling all data. (can target specific columns if needed)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

xgboost = GradientBoostingRegressor(random_state=42)
xgboost.fit(X_train, y_train)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)


In [94]:
# Gradient Boosting
y_pred_gb = xgboost.predict(X_test)
gb_r2 = r2_score(y_test, y_pred_gb)
gb_mae = mean_absolute_error(y_test, y_pred_gb)

# Random Forest
y_pred_rf = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mae = mean_absolute_error(y_test, y_pred_rf)

print("GradientBoosting -> R^2:", gb_r2, " MAE:", gb_mae)
print("RandomForest     -> R^2:", rf_r2, " MAE:", rf_mae)


GradientBoosting -> R^2: 0.7186174325262387  MAE: 73.81170324329315
RandomForest     -> R^2: 0.7292647628876037  MAE: 67.96639191705792


In [95]:
# Random Forest feature importances
importances = rf_model.feature_importances_
feature_names = X.columns

rf_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

rf_importance_df.rename(columns={"importance": "Importance(%) for Prediction"}, inplace=True)
rf_importance_df["Importance(%) for Prediction"] = (rf_importance_df["Importance(%) for Prediction"] * 100).round(2)
rf_importance_df

Unnamed: 0,Feature,Importance(%) for Prediction
3,bathrooms,21.96
26,Bachelor (%),16.2
2,bedrooms,13.21
1,days_listed,3.66
15,Median_tot_fam_inc_weekly,2.91
14,Median_tot_prsnl_inc_weekly,2.73
7,num_metro_bus_stops,2.56
8,num_metro_tram_stops,2.55
6,agency,2.16
0,postcode,2.13


In [99]:
rf_importance_df.to_csv("../../data/outputs/rf_importance_features.csv", index=False)

In [97]:
# XG Boost feature importances
importances = xgboost.feature_importances_
feature_names = X.columns

xgboost_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

xgboost_importance_df.rename(columns={"importance": "Importance(%) for Prediction"}, inplace=True)
xgboost_importance_df["Importance(%) for Prediction"] = (xgboost_importance_df["Importance(%) for Prediction"] * 100).round(2)
xgboost_importance_df

Unnamed: 0,Feature,Importance(%) for Prediction
3,bathrooms,26.63
2,bedrooms,20.99
26,Bachelor (%),14.36
15,Median_tot_fam_inc_weekly,8.52
28,Certificate_level (%),3.06
14,Median_tot_prsnl_inc_weekly,2.65
25,Graduate_diploma_certificate(%),2.46
8,num_metro_tram_stops,2.27
33,population_est,2.18
4,carspaces,1.93


In [100]:
xgboost_importance_df.to_csv("../../data/outputs/xgboost_importance_features.csv", index=False)