In [None]:
df1 = pd.read_excel("../../data/raw/moving-annual-rent-2025.xlsx", sheet_name="1 bedroom flat")
df2 = pd.read_excel("../../data/raw/moving-annual-rent-2025.xlsx", sheet_name="2 bedroom flat")
df3 = pd.read_excel("../../data/raw/moving-annual-rent-2025.xlsx", sheet_name="3 bedroom flat")
df4 = pd.read_excel("../../data/raw/moving-annual-rent-2025.xlsx", sheet_name="2 bedroom house")
df5 = pd.read_excel("../../data/raw/moving-annual-rent-2025.xlsx", sheet_name="3 bedroom house")
df6 = pd.read_excel("../../data/raw/moving-annual-rent-2025.xlsx", sheet_name="4 bedroom house")

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
import pmdarima as pm
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import os

file_path = "../../data/raw/moving-annual-rent-2025.xlsx"
output_dir = "../../data/processed/forecast"
os.makedirs(output_dir, exist_ok=True)

xls = pd.ExcelFile(file_path)

# Get all sheets except "All properties"
sheets = [s for s in xls.sheet_names if s != "All properties"]
print("Sheets included:", sheets)

for sheet in sheets:
    df_raw = pd.read_excel(file_path, sheet_name=sheet, header=None)

    # Suburbs
    suburbs = df_raw.iloc[3:, 1].reset_index(drop=True)
    suburbs.name = "Suburb"

    # Date row and type row
    date_row = df_raw.iloc[1]
    type_row = df_raw.iloc[2]

    # Keep only Median columns
    median_cols = [i for i, t in enumerate(type_row) if t == "Median"]

    data = {}
    data["Suburb"] = suburbs

    for col in median_cols:
        date_str = str(date_row[col]).strip()
        try:
            date = pd.to_datetime(date_str, format="%b %Y")
            date_label = date.strftime("%Y-%m")  # e.g. "2000-03"
        except:
            continue

        data[date_label] = pd.to_numeric(df_raw.iloc[3:, col].reset_index(drop=True), errors="coerce")

    df_wide = pd.DataFrame(data)

    # Drop "Group Total" if present
    df_wide = df_wide[df_wide["Suburb"] != "Group Total"]

    out_file = os.path.join(output_dir, f"{sheet.replace(' ', '_')}.csv")
    df_wide.to_csv(out_file, index=False)
    print(f"Saved {out_file} with shape {df_wide.shape}")

Sheets included: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house']
Saved ../../data/processed/forecast/1_bedroom_flat.csv with shape (146, 102)
Saved ../../data/processed/forecast/2_bedroom_flat.csv with shape (146, 102)
Saved ../../data/processed/forecast/3_bedroom_flat.csv with shape (146, 102)
Saved ../../data/processed/forecast/2_bedroom_house.csv with shape (146, 102)
Saved ../../data/processed/forecast/3_bedroom_house.csv with shape (146, 102)
Saved ../../data/processed/forecast/4_bedroom_house.csv with shape (146, 102)


In [23]:
import pandas as pd
import pmdarima as pm
import os

# Directories
input_dir = "../../data/processed/forecast"
output_dir = "../../data/processed/forecast"
os.makedirs(output_dir, exist_ok=True)

# Collect all property-type CSV files (exclude already-generated forecasts)
files = [f for f in os.listdir(input_dir) if f.endswith(".csv") and "forecast" not in f]
print("Files found:", files)

for file in files:
    input_file = os.path.join(input_dir, file)
    print(f"\nProcessing file: {file}")
    
    historical_data = pd.read_csv(input_file)
    extended_data = {}

    for suburb in historical_data['Suburb'].unique():
        row = historical_data[historical_data['Suburb'] == suburb].drop(columns=['Suburb'])
        row = row.T
        row.index = pd.to_datetime(row.index, format='%Y-%m', errors='coerce')
        row.index = row.index.to_period("Q")   # quarterly index
        row.columns = [suburb]

        ts = row[suburb].astype(float).dropna()

        # Skip if too little history
        if ts.empty or len(ts) < 8:
            continue

        # Ensure quarterly frequency and fill small gaps
        ts = ts.asfreq("Q").interpolate(limit_direction="both")

        # Skip if still too short after cleaning
        if len(ts) < 12:  # less than 3 years of data
            continue

        # Fit ARIMA with fallback
        try:
            model = pm.auto_arima(
                ts,
                seasonal=True,
                m=4,                  # 4 quarters per year
                trace=False,
                error_action='ignore',
                suppress_warnings=True
            )
        except Exception as e:
            print(f"⚠️ {suburb} in {file}: fallback to non-seasonal ARIMA ({e})")
            model = pm.auto_arima(
                ts,
                seasonal=False,
                trace=False,
                error_action='ignore',
                suppress_warnings=True
            )

        # Forecast next 20 quarters (5 years)
        forecast = model.predict(n_periods=20)
        forecast_index = pd.period_range(ts.index[-1] + 1, periods=20, freq='Q')
        forecast_series = pd.Series(forecast, index=forecast_index)

        # Combine history + forecast
        all_series = pd.concat([ts, forecast_series])
        extended_data[suburb] = all_series

    # Build wide DataFrame (suburb × date)
    extended_df = pd.DataFrame(extended_data).T
    extended_df.columns = extended_df.columns.to_timestamp().strftime("%Y-%m")
    extended_df.index.name = "Suburb"

    # Save
    out_file = os.path.join(output_dir, file.replace(".csv", "_forecast.csv"))
    extended_df.to_csv(out_file)
    print(f"✅ Saved extended forecast: {out_file} with shape {extended_df.shape}")

Files found: ['2_bedroom_house.csv', '1_bedroom_flat.csv', '3_bedroom_house.csv', '4_bedroom_house.csv', '3_bedroom_flat.csv', '2_bedroom_flat.csv']

Processing file: 2_bedroom_house.csv
✅ Saved extended forecast: ../../data/processed/forecast/2_bedroom_house_forecast.csv with shape (143, 121)

Processing file: 1_bedroom_flat.csv
✅ Saved extended forecast: ../../data/processed/forecast/1_bedroom_flat_forecast.csv with shape (144, 121)

Processing file: 3_bedroom_house.csv
✅ Saved extended forecast: ../../data/processed/forecast/3_bedroom_house_forecast.csv with shape (143, 121)

Processing file: 4_bedroom_house.csv
✅ Saved extended forecast: ../../data/processed/forecast/4_bedroom_house_forecast.csv with shape (143, 121)

Processing file: 3_bedroom_flat.csv
✅ Saved extended forecast: ../../data/processed/forecast/3_bedroom_flat_forecast.csv with shape (143, 121)

Processing file: 2_bedroom_flat.csv
✅ Saved extended forecast: ../../data/processed/forecast/2_bedroom_flat_forecast.csv wit

In [None]:
import pandas as pd
import pmdarima as pm
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Directories
input_dir = "../../data/processed/forecast"

# Collect property-type CSV files 
files = [f for f in os.listdir(input_dir) if f.endswith(".csv") and "forecast" not in f]
print("Files found:", files)

results = []  

for file in files:
    input_file = os.path.join(input_dir, file)
    print(f"\nEvaluating file: {file}")
    
    historical_data = pd.read_csv(input_file)

    for suburb in historical_data['Suburb'].unique():
        row = historical_data[historical_data['Suburb'] == suburb].drop(columns=['Suburb'])
        row = row.T
        row.index = pd.to_datetime(row.index, format='%Y-%m', errors='coerce')
        row.index = row.index.to_period("Q")
        row.columns = [suburb]

        ts = row[suburb].astype(float).dropna()
        if ts.empty or len(ts) < 40:  # skip very short series (<10 years)
            continue

        # Train/test split
        train = ts[ts.index <= pd.Period("2023Q4", freq="Q")]
        test = ts[ts.index.year == 2024]

        if len(test) == 0:  # no test data available
            continue

        # Fit ARIMA on training data
        try:
            model = pm.auto_arima(
                train,
                seasonal=True,
                m=4,
                trace=False,
                error_action='ignore',
                suppress_warnings=True
            )
        except Exception as e:
            print(f"⚠️ {suburb} in {file}: fallback to non-seasonal ARIMA ({e})")
            model = pm.auto_arima(
                train,
                seasonal=False,
                trace=False,
                error_action='ignore',
                suppress_warnings=True
            )

        # Forecast next 4 quarters (2024)
        forecast = model.predict(n_periods=4)
        forecast_index = pd.period_range("2024Q1", periods=4, freq="Q")
        forecast_series = pd.Series(forecast, index=forecast_index)

        # Align actual vs forecast
        test_aligned = test.reindex(forecast_series.index)

        # Drop NaNs
        df_compare = pd.concat([test_aligned, forecast_series], axis=1)
        df_compare.columns = ["actual", "forecast"]
        df_compare = df_compare.dropna()

        if df_compare.empty:
            continue

        y_true = df_compare["actual"]
        y_pred = df_compare["forecast"]

        # Metrics
        mae = mean_absolute_error(y_true, y_pred)
        rmse = mean_squared_error(y_true, y_pred, squared=False)
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

        results.append({
            "file": file,
            "suburb": suburb,
            "MAE": mae,
            "RMSE": rmse,
            "MAPE": mape
        })

results_df = pd.DataFrame(results)

print("\n✅ Accuracy results (first 20 rows):")
display(results_df.head(20))

# Average metrics per property type
avg_results = results_df.groupby("file")[["MAE","RMSE","MAPE"]].mean().reset_index()
print("\n✅ Average accuracy per property type:")
display(avg_results)


Files found: ['2_bedroom_house.csv', '1_bedroom_flat.csv', '3_bedroom_house.csv', '4_bedroom_house.csv', '3_bedroom_flat.csv', '2_bedroom_flat.csv']

Evaluating file: 2_bedroom_house.csv

Evaluating file: 1_bedroom_flat.csv

Evaluating file: 3_bedroom_house.csv

Evaluating file: 4_bedroom_house.csv

Evaluating file: 3_bedroom_flat.csv

Evaluating file: 2_bedroom_flat.csv

✅ Accuracy results (first 20 rows):


Unnamed: 0,file,suburb,MAE,RMSE,MAPE
0,2_bedroom_house.csv,Albert Park-Middle Park-West St Kilda,15.91423,23.959625,2.015185
1,2_bedroom_house.csv,Armadale,21.450426,26.297084,2.955026
2,2_bedroom_house.csv,Carlton North,20.549487,22.063206,2.927243
3,2_bedroom_house.csv,Carlton-Parkville,13.257924,13.308216,2.039681
4,2_bedroom_house.csv,Collingwood-Abbotsford,18.552632,19.246002,2.650376
5,2_bedroom_house.csv,East St Kilda,12.815789,16.433615,1.766703
6,2_bedroom_house.csv,Elwood,35.187083,44.597214,4.709497
7,2_bedroom_house.csv,Fitzroy,41.002666,41.93102,5.405863
8,2_bedroom_house.csv,Fitzroy North-Clifton Hill,5.263158,6.112405,0.743834
9,2_bedroom_house.csv,Flemington-Kensington,5.131579,6.04977,0.835873



✅ Average accuracy per property type:


Unnamed: 0,file,MAE,RMSE,MAPE
0,1_bedroom_flat.csv,20.404222,22.330621,5.274755
1,2_bedroom_flat.csv,17.462253,19.267265,3.572516
2,2_bedroom_house.csv,25.661007,28.200524,4.8404
3,3_bedroom_flat.csv,36.89522,39.98133,5.353981
4,3_bedroom_house.csv,22.68925,24.424635,3.340882
5,4_bedroom_house.csv,34.612044,38.60333,4.096272
