# CMHC rental market ingest

This notebook reads the CMHC Rental Market Survey workbook and creates metro-level rent and vacancy indicators. The output is `data/processed/cmhc_metro.csv`.

We pull:
- Average two-bedroom rent
- Vacancy rate
- Year-over-year rent growth

These numbers describe current rental market pressure. They do not explain why it is happening.

In [1]:
from __future__ import annotations

import re
from pathlib import Path

import pandas as pd

ROOT = Path.cwd()
if not (ROOT / "data/processed/metros_master.csv").exists():
    ROOT = ROOT.parent

RAW = ROOT / "data/raw"
PROCESSED = ROOT / "data/processed"

RENTAL_FILE = RAW / "rental_market_report_latest.xlsx"
METROS_FILE = PROCESSED / "metros_master.csv"


In [2]:
df_raw = pd.read_excel(RENTAL_FILE, sheet_name="Table 1.0", header=None)
header_top_idx = df_raw.index[
    df_raw.apply(
        lambda row: row.astype(str).str.contains("Vacancy Rates", case=False, na=False).any(),
        axis=1,
    )
][0]
header_sub_idx = header_top_idx + 1

header_top = df_raw.iloc[header_top_idx].fillna("")
header_top = header_top.replace("", pd.NA).ffill().fillna("")
header_sub = df_raw.iloc[header_sub_idx].fillna("")

columns = []
for i, (top, sub) in enumerate(zip(header_top, header_sub)):
    top = str(top).strip()
    sub = str(sub).strip()
    if top and sub:
        columns.append(f"{top} | {sub}")
    elif sub:
        columns.append(sub)
    elif top:
        columns.append(top)
    else:
        columns.append(f"col_{i}")

data = df_raw.iloc[header_sub_idx + 1 :].copy()
data.columns = columns

def parse_year(label: str) -> int | None:
    digits = re.findall(r"\d{2,4}", label)
    if not digits:
        return None
    year = digits[-1]
    year_val = int(year)
    if len(year) == 2:
        year_val += 2000
    return year_val

def pick_latest_column(columns: list[str], keyword: str) -> tuple[str, int]:
    candidates = {}
    for col in columns:
        if keyword not in col:
            continue
        year = parse_year(col)
        if year:
            candidates[col] = year
    if not candidates:
        raise ValueError(f"No columns found for {keyword}")
    latest_col = max(candidates, key=candidates.get)
    return latest_col, candidates[latest_col]

centre_col = next((c for c in data.columns if c.strip().lower() == "centre"), data.columns[0])
vacancy_col, vacancy_year = pick_latest_column(list(data.columns), "Vacancy Rates (%)")
rent_col, rent_year = pick_latest_column(list(data.columns), "Average Rent Two Bedroom")
growth_col, growth_year = pick_latest_column(
    list(data.columns),
    "Percentage Change of Average Rent Two Bedroom",
)


In [3]:
def to_float(value):
    if pd.isna(value):
        return None
    text = str(value).replace(",", "").strip()
    if not text or text in {"-", "—", "–", "**"}:
        return None
    return pd.to_numeric(text, errors="coerce")

def normalize_centre(name: str) -> str:
    name = str(name).strip()
    name = name.replace(" CMA", "").replace(" CA", "")
    name = name.replace(" - ", "-")
    name = name.replace("Greater Sudbury/Grand Sudbury", "Greater Sudbury")
    if "Ottawa-Gatineau" in name:
        name = "Ottawa-Gatineau"
    return name

data = data[[centre_col, vacancy_col, rent_col, growth_col]].copy()
data.rename(
    columns={
        centre_col: "centre",
        vacancy_col: "cmhc_vacancy_rate",
        rent_col: "cmhc_avg_rent_2br",
        growth_col: "cmhc_rent_growth_yoy",
    },
    inplace=True,
)

data["cmhc_vacancy_rate"] = data["cmhc_vacancy_rate"].map(to_float)
data["cmhc_avg_rent_2br"] = data["cmhc_avg_rent_2br"].map(to_float)
data["cmhc_rent_growth_yoy"] = data["cmhc_rent_growth_yoy"].map(to_float)

data = data[data["centre"].notna()]
data = data[~data["centre"].astype(str).str.contains("10,000\+", regex=True)]
data = data[data["cmhc_vacancy_rate"].notna()]

data["metro_name_std"] = data["centre"].map(normalize_centre)

data = (
    data.groupby("metro_name_std", as_index=False)
    [["cmhc_vacancy_rate", "cmhc_avg_rent_2br", "cmhc_rent_growth_yoy"]]
    .mean()
)

metros = pd.read_csv(METROS_FILE)
data = data.merge(
    metros[["metro_id", "metro_name_std", "province"]],
    on="metro_name_std",
    how="left",
)

data["cmhc_rent_year"] = rent_year
data["cmhc_vacancy_year"] = vacancy_year
data["cmhc_rent_growth_year"] = growth_year


In [4]:
output = data[
    [
        "metro_id",
        "metro_name_std",
        "province",
        "cmhc_avg_rent_2br",
        "cmhc_vacancy_rate",
        "cmhc_rent_growth_yoy",
        "cmhc_rent_year",
        "cmhc_vacancy_year",
        "cmhc_rent_growth_year",
    ]
].copy()

unmatched = output[output["metro_id"].isna()][["metro_name_std"]]
output = output[output["metro_id"].notna()].copy()

output.sort_values("metro_name_std", inplace=True)
output_path = PROCESSED / "cmhc_metro.csv"
output.to_csv(output_path, index=False)

{
    "rows": int(len(output)),
    "missing_metro_id": int(len(unmatched)),
    "unmatched": unmatched.sort_values("metro_name_std").head(10).to_dict(orient="records"),
    "rent_year": rent_year,
    "vacancy_year": vacancy_year,
    "growth_year": growth_year,
}


{'rows': 41,
 'missing_metro_id': 2,
 'unmatched': [{'metro_name_std': 'Canadas'},
  {'metro_name_std': 'Charlottetown'}],
 'rent_year': 2025,
 'vacancy_year': 2025,
 'growth_year': 2025}