# StatCan metro ingest

This notebook turns StatCan CSV files into clean, metro-level indicators we can merge later. It expects the raw files listed in `notebooks/00_ingest_plan.ipynb` to already exist in `data/raw/`.

We build three inputs:
- Median after-tax household income
- Unemployment rate
- Population estimates

These are supporting context for housing stress. They are not causal proof on their own.

In [2]:
from __future__ import annotations

import csv
from dataclasses import dataclass
from pathlib import Path
from statistics import mean

# Locate the project root relative to the current working directory. If the
# processed master file is not found under the cwd, the parent directory is
# used instead (presumably to cover launching from a subdirectory such as
# notebooks/ -- the parent itself is not checked).
ROOT = Path.cwd()
if not ROOT.joinpath("data/processed/metros_master.csv").exists():
    ROOT = ROOT.parent

# Input and output directories.
RAW = ROOT.joinpath("data", "raw")
PROCESSED = ROOT.joinpath("data", "processed")

# Raw CSV inputs read by this notebook, plus the metro master list.
INCOME_FILE = RAW.joinpath("median_household_income_11100035.csv")
POP_FILE = RAW.joinpath("population_estimates_17100148.csv")
UNEMP_FILE = RAW.joinpath("unemployment_rate_14100459.csv")
METROS_FILE = PROCESSED.joinpath("metros_master.csv")


def _normalize_fieldnames(fieldnames: list[str]) -> list[str]:
    """Return header names with BOM characters and wrapping quotes removed.

    For each name, every U+FEFF character is deleted first and then any
    double-quote characters at either end are stripped. Other whitespace is
    left untouched.
    """
    cleaned: list[str] = []
    for raw_name in fieldnames:
        # Drop the BOM before stripping quotes: presumably the first header
        # cell can arrive as '\ufeff"NAME"' when a BOM precedes a quoted field.
        without_bom = raw_name.replace("\ufeff", "")
        cleaned.append(without_bom.strip('"'))
    return cleaned


def iter_rows(path: Path):
    """Yield each data row of a CSV file as a dict keyed by header name.

    Args:
        path: CSV file to read as UTF-8. Its first row is treated as the
            header and passed through ``_normalize_fieldnames``.

    Yields:
        One ``dict`` per non-empty row, mapping header name to cell text.
        Header and cells are paired with ``zip``, so cells beyond the header
        length are dropped and a short row lacks the trailing keys.

    A file with no header row (for example a zero-byte file) yields nothing,
    which matches how ``csv.DictReader`` treats an empty file.
    """
    with path.open(newline="", encoding="utf-8") as fh:
        reader = csv.reader(fh)
        header = next(reader, None)
        if header is None:
            # A bare next(reader) would raise StopIteration inside this
            # generator, which Python reports as RuntimeError (PEP 479).
            return
        fieldnames = _normalize_fieldnames(header)
        for row in reader:
            if not row:
                # csv.reader returns [] for blank lines; skip them.
                continue
            yield dict(zip(fieldnames, row))


@dataclass(frozen=True)
class Metro:
    """Immutable record describing one metro area."""

    # Identifier for the metro.
    metro_id: str
    # Standardized metro name; first half of ``geo_label``.
    metro_name_std: str
    # Province name; second half of ``geo_label``.
    province: str
    # DGUID string for the metro (2021 vintage, going by the field name).
    dguid_2021: str

    @property
    def geo_label(self) -> str:
        """Return the label ``"<metro_name_std>, <province>"``."""
        return "{}, {}".format(self.metro_name_std, self.province)

In [3]:
# Load every row of the metro master file as a Metro record.
metros: list[Metro] = []
with METROS_FILE.open(newline="", encoding="utf-8") as fh:
    reader = csv.DictReader(fh)
    metros.extend(
        Metro(
            metro_id=row["metro_id"],
            metro_name_std=row["metro_name_std"],
            province=row["province"],
            dguid_2021=row["dguid_2021"],
        )
        for row in reader
    )

# Lookup tables keyed by "<name>, <province>" label and by DGUID. If two
# metros share a key, the later one in the file wins.
geo_to_metro = dict((metro.geo_label, metro) for metro in metros)
dguid_to_metro = dict((metro.dguid_2021, metro) for metro in metros)

def latest_income_year() -> int:
    """Return the most recent REF_DATE year with an income row for a known metro.

    Only rows whose "Economic family type" is "All family units" and whose
    GEO value is a key of ``geo_to_metro`` are considered. REF_DATE is parsed
    as a plain integer year.

    NOTE(review): rows count even when VALUE is blank, so the returned year
    may have no usable value for some metros -- confirm this is intended.

    Raises:
        ValueError: if no row qualifies (for example when GEO labels in the
            file do not match the metro labels), or if REF_DATE is not an
            integer.
    """
    years = {
        int(row["REF_DATE"])
        for row in iter_rows(INCOME_FILE)
        if row.get("Economic family type") == "All family units"
        and row.get("GEO", "") in geo_to_metro
    }
    if not years:
        # max() on an empty set raises an unhelpful "empty sequence" error;
        # raise the same exception type with the file and filter named.
        raise ValueError(
            f"No 'All family units' rows in {INCOME_FILE} match a known metro GEO label"
        )
    return max(years)


def latest_population_year() -> int:
    """Return the most recent REF_DATE year with a population row for a known metro.

    Only rows with Gender "Total - gender", Age group "All ages" and a DGUID
    that is a key of ``dguid_to_metro`` are considered. REF_DATE is parsed as
    a plain integer year.

    NOTE(review): rows count even when VALUE is blank, so the returned year
    may have no usable value for some metros -- confirm this is intended.

    Raises:
        ValueError: if no row qualifies (for example when DGUIDs in the file
            do not match the metro master list), or if REF_DATE is not an
            integer.
    """
    years = {
        int(row["REF_DATE"])
        for row in iter_rows(POP_FILE)
        if row.get("Gender") == "Total - gender"
        and row.get("Age group") == "All ages"
        and row.get("DGUID", "") in dguid_to_metro
    }
    if not years:
        # max() on an empty set raises an unhelpful "empty sequence" error;
        # raise the same exception type with the file and filter named.
        raise ValueError(
            f"No 'Total - gender' / 'All ages' rows in {POP_FILE} match a known metro DGUID"
        )
    return max(years)


def latest_unemployment_year() -> int:
    """Return the most recent year with an unemployment-rate row for a known metro.

    Only rows with Labour force characteristics "Unemployment rate",
    Statistics "Estimate", Data type "Seasonally adjusted" and a GEO value
    that is a key of ``geo_to_metro`` are considered. The year is the part of
    REF_DATE before the first "-".

    NOTE(review): a year counts as soon as one row for it exists, even with
    a blank VALUE, so the returned year may be only partially covered --
    confirm this is intended.

    Raises:
        ValueError: if no row qualifies (for example when GEO labels in the
            file do not match the metro labels), or if the year part of
            REF_DATE is not an integer.
    """
    years = {
        int(row["REF_DATE"].split("-")[0])
        for row in iter_rows(UNEMP_FILE)
        if row.get("Labour force characteristics") == "Unemployment rate"
        and row.get("Statistics") == "Estimate"
        and row.get("Data type") == "Seasonally adjusted"
        and row.get("GEO", "") in geo_to_metro
    }
    if not years:
        # max() on an empty set raises an unhelpful "empty sequence" error;
        # raise the same exception type with the file and filter named.
        raise ValueError(
            f"No seasonally adjusted 'Unemployment rate' estimate rows in {UNEMP_FILE} "
            "match a known metro GEO label"
        )
    return max(years)


# Latest year found in each source, computed independently per file.
income_year = latest_income_year()
population_year = latest_population_year()
unemployment_year = latest_unemployment_year()

# The shared reference year is the earliest of the three "latest" years.
# NOTE(review): this assumes each source actually has rows for that year for
# every metro; nothing here verifies it.
reference_year = min((income_year, population_year, unemployment_year))
reference_year


2012

In [4]:
# Income value per metro_id for the reference year; None when VALUE is blank.
# NOTE(review): if several rows match the same metro and year, the last one
# read overwrites the earlier ones -- confirm the filter is specific enough.
income_by_metro: dict[str, float | None] = {}
for row in iter_rows(INCOME_FILE):
    if (
        row.get("Economic family type") != "All family units"
        or int(row["REF_DATE"]) != reference_year
    ):
        continue
    geo = row.get("GEO", "")
    metro = geo_to_metro.get(geo)
    if not metro:
        continue
    value = row.get("VALUE", "")
    if value:
        income_by_metro[metro.metro_id] = float(value)
    else:
        income_by_metro[metro.metro_id] = None

# Collect every non-blank unemployment-rate value in the reference year per
# metro. REF_DATE is split on "-" and only its leading year part is compared.
unemployment_values: dict[str, list[float]] = dict((metro.metro_id, []) for metro in metros)
for row in iter_rows(UNEMP_FILE):
    if (
        row.get("Labour force characteristics") != "Unemployment rate"
        or row.get("Statistics") != "Estimate"
        or row.get("Data type") != "Seasonally adjusted"
    ):
        continue
    year = int(row["REF_DATE"].split("-")[0])
    if year != reference_year:
        continue
    geo = row.get("GEO", "")
    metro = geo_to_metro.get(geo)
    if not metro:
        continue
    value = row.get("VALUE", "")
    if value:
        unemployment_values[metro.metro_id].append(float(value))

# Average the collected values per metro; metros with no values get None.
unemployment_by_metro: dict[str, float | None] = dict(
    (metro_id, mean(values) if values else None)
    for metro_id, values in unemployment_values.items()
)

# Population per metro for the reference year and the year before it (the
# previous year feeds the growth calculation). Blank VALUE cells are skipped.
population_by_metro: dict[str, dict[int, float]] = dict((metro.metro_id, {}) for metro in metros)
for row in iter_rows(POP_FILE):
    if row.get("Gender") != "Total - gender" or row.get("Age group") != "All ages":
        continue
    dguid = row.get("DGUID", "")
    metro = dguid_to_metro.get(dguid)
    if not metro:
        continue
    year = int(row["REF_DATE"])
    if not (reference_year - 1 <= year <= reference_year):
        continue
    value = row.get("VALUE", "")
    if value:
        population_by_metro[metro.metro_id][year] = float(value)

# One output record per metro, in master-file order. Indicators that could
# not be found stay None.
output_rows: list[dict[str, object]] = []
for metro in metros:
    pop_current = population_by_metro[metro.metro_id].get(reference_year)
    pop_previous = population_by_metro[metro.metro_id].get(reference_year - 1)
    # Year-over-year growth in percent; left as None when either year is
    # missing or the previous population is zero (avoids division by zero).
    growth_pct = None
    if pop_current is not None and pop_previous not in (None, 0):
        growth_pct = (pop_current - pop_previous) / pop_previous * 100.0
    output_rows.append(
        {
            "metro_id": metro.metro_id,
            "metro_name_std": metro.metro_name_std,
            "province": metro.province,
            "statcan_reference_year": reference_year,
            "statcan_median_income_after_tax": income_by_metro.get(metro.metro_id),
            "statcan_unemployment_rate": unemployment_by_metro.get(metro.metro_id),
            "statcan_population": pop_current,
            "statcan_population_growth_pct": growth_pct,
        }
    )

# Write the per-metro indicators to data/processed/statcan_metro.csv.
output_path = PROCESSED / "statcan_metro.csv"
# Resolve the column list BEFORE opening the file: opening in "w" mode
# truncates it, so an empty output_rows (IndexError here) would otherwise wipe
# the previous output and then crash. list() also gives DictWriter a real
# sequence rather than a dict_keys view.
fieldnames = list(output_rows[0])
with output_path.open("w", newline="", encoding="utf-8") as fh:
    writer = csv.DictWriter(fh, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_rows)

output_path


PosixPath('/Users/andrewharris/Projects/housing-affordability/data/processed/statcan_metro.csv')

In [5]:
# Count how many metros ended up without each indicator (value is None).
missing_income = sum(row["statcan_median_income_after_tax"] is None for row in output_rows)
missing_unemployment = sum(row["statcan_unemployment_rate"] is None for row in output_rows)
missing_population = sum(row["statcan_population"] is None for row in output_rows)

# Summary shown as the cell output.
{
    "metros_total": len(output_rows),
    "missing_income": missing_income,
    "missing_unemployment": missing_unemployment,
    "missing_population": missing_population,
}


{'metros_total': 41,
 'missing_income': 21,
 'missing_unemployment': 2,
 'missing_population': 0}