In [1]:
# Phase A — Data audit & single “master” panel

from pathlib import Path
import pandas as pd
import numpy as np
import warnings
import re

# -------------------------------------------------------------------
# 0) Setup & paths
# -------------------------------------------------------------------
# Optional: silence the "Could not infer format" warnings raised while parsing dates
warnings.filterwarnings("ignore", message="Could not infer format")

# Every data folder hangs off the current working directory, so the notebook
# has to be started from the project root.
PROJ_ROOT = Path.cwd().resolve()
DATA_RAW, DATA_INTERIM, DATA_PROCESSED = (
    PROJ_ROOT / "data" / stage for stage in ("raw", "interim", "processed")
)

# Create the folder tree up front; folders that already exist are left alone.
for p in (DATA_RAW, DATA_INTERIM, DATA_PROCESSED):
    p.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious.
for label, folder in (
    ("Project root:", PROJ_ROOT),
    ("Raw:", DATA_RAW),
    ("Interim:", DATA_INTERIM),
    ("Processed:", DATA_PROCESSED),
):
    print(label, folder)

# -------------------------------------------------------------------
# 1) Helpers: column cleanup, date detection, value detection
# -------------------------------------------------------------------
def _clean_cols(df):
    df = df.copy()
    df.columns = (
        df.columns.astype(str)
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^\w_]+", "", regex=True)
        .str.lower()
    )
    return df

def _parse_year_quarter_like(s):
    """
    Try parsing common Y-Q formats:
      '2020 Q1', '2020Q1', '2020 Quarter 1', 'Q1 2020'
    Returns pandas datetime (end of quarter).
    """
    txt = s.astype(str).str.strip()

    # Normalize various quarter tokens to a uniform form
    txt = (
        txt.str.replace(r"quarter\s*", "Q", flags=re.I, regex=True)
           .str.replace(r"\s+", "", regex=True)
    )

    # Convert 'Q12020' -> '2020Q1'
    txt = txt.str.replace(r"^Q([1-4])(\d{4})$", r"\2Q\1", regex=True)

    # Keep those that look like 'YYYYQq'
    mask = txt.str.match(r"^\d{4}Q[1-4]$", na=False)
    yq = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns]")
    if mask.any():
        per = pd.PeriodIndex(txt[mask], freq="Q")
        yq.loc[mask] = per.to_timestamp(how="end")
    return yq

def _try_parse_year_quarter_cols(df):
    """
    If there are separate 'year' and 'quarter' columns, build a datetime series
    at quarter-end.
    """
    year_cols = [c for c in df.columns if c in {"year", "yr"}]
    q_cols = [c for c in df.columns if c in {"quarter", "qtr", "q"}]
    if year_cols and q_cols:
        yc, qc = year_cols[0], q_cols[0]
        y = pd.to_numeric(df[yc], errors="coerce")
        q = pd.to_numeric(df[qc], errors="coerce")
        ok = y.notna() & q.notna() & q.between(1, 4)
        per = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
        if ok.any():
            idx = ok[ok].index
            per.loc[idx] = pd.PeriodIndex(
                (y.loc[idx].astype(int).astype(str) + "Q" + q.loc[idx].astype(int).astype(str)).values,
                freq="Q"
            ).to_timestamp(how="end")
        return per
    return pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

def _first_datetime_col(df):
    """
    Find/construct a usable datetime column (monthly/quarterly/annual).
    Returns a datetime Series or None.
    """
    # Try explicit date-like columns first
    candidates = ["period", "date", "time", "month"]
    for c in candidates:
        if c in df.columns:
            dt = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
            if dt.notna().sum() >= max(3, int(0.4 * len(df))):
                return dt

    # Try a single 'year' column (assume Dec-31)
    if "year" in df.columns:
        y = pd.to_numeric(df["year"], errors="coerce")
        if y.notna().any():
            return pd.to_datetime(y.astype("Int64").astype(str) + "-12-31", errors="coerce")

    # Try year+quarter columns
    yq_cols = _try_parse_year_quarter_cols(df)
    if yq_cols.notna().sum() >= max(3, int(0.4 * len(df))):
        return yq_cols

    # Try generic text quarter formats in any object col
    for c in df.columns:
        if df[c].dtype == "object":
            yq_free = _parse_year_quarter_like(df[c])
            if yq_free.notna().sum() >= max(3, int(0.4 * len(df))):
                return yq_free

    # Brute force: try every column via to_datetime
    for c in df.columns:
        dt = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
        if dt.notna().sum() >= max(3, int(0.4 * len(df))):
            return dt
    return None

def _numeric_candidate_columns(df):
    exclude = {"period", "date", "time", "month", "quarter", "qtr", "year", "yr", "q"}
    num_cols = []
    for c in df.columns:
        if c in exclude:
            continue
        s = pd.to_numeric(df[c], errors="coerce")
        if s.notna().sum() >= max(3, int(0.4 * len(df))):
            num_cols.append(c)
    return num_cols

def _guess_freq(period_series):
    """
    Roughly guess frequency from median day gaps.
    """
    s = pd.to_datetime(period_series.dropna(), errors="coerce").sort_values().unique()
    if len(s) < 3:
        return "unknown"

    s = pd.Series(s).astype("datetime64[ns]")
    deltas = s.diff().iloc[1:]  # TimedeltaIndex/Series
    days = deltas / np.timedelta64(1, "D")  # float days
    med = float(np.nanmedian(days))

    if 27 <= med <= 35:
        return "monthly"
    if 80 <= med <= 100:
        return "quarterly"
    if 360 <= med <= 370:
        return "annual"
    return "irregular"

# -------------------------------------------------------------------
# 2) Read any file (csv/xlsx/xls) and tidy to long format
# -------------------------------------------------------------------
def read_any(path):
    """Load a CSV or Excel file into a DataFrame.

    CSV files are read with the default encoding first and retried as
    latin-1 on a decoding error. Excel files (.xlsx/.xls) are read from
    their first sheet with warnings muted. Any failure, including an
    unsupported extension, is re-raised as RuntimeError naming the file.
    """
    ext = path.suffix.lower()
    try:
        if ext in {".xlsx", ".xls"}:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # sheet_name=0 by default; override later via per-file logic if needed
                return pd.read_excel(path, sheet_name=0)

        if ext != ".csv":
            raise ValueError(f"Unsupported file type: {ext}")

        try:
            return pd.read_csv(path)
        except UnicodeDecodeError:
            return pd.read_csv(path, encoding="latin-1")
    except Exception as e:
        raise RuntimeError(f"Failed to read {path.name}: {e}") from e

def tidy_any(path):
    """Read one raw file and reshape it to long format.

    Returns a DataFrame with the columns source_file, measure, period and
    value, sorted by measure then period. An empty frame with those columns
    is returned when the file has no data, no usable date column, or no
    numeric measure columns.
    """
    out_cols = ["source_file", "measure", "period", "value"]

    def _empty():
        return pd.DataFrame(columns=out_cols)

    # Load, normalise headers, and discard fully empty columns/rows
    table = _clean_cols(read_any(path))
    table = table.dropna(axis=1, how="all").dropna(axis=0, how="all")
    if table.empty:
        return _empty()

    # Work out the time axis
    period = _first_datetime_col(table)
    if period is None:
        if "year" not in table.columns:
            # cannot structure as time series
            return _empty()
        # Last resort: a 'year' column, even if sparse (assume Dec-31)
        years = pd.to_numeric(table["year"], errors="coerce")
        period = pd.to_datetime(years.astype("Int64").astype(str) + "-12-31", errors="coerce")

    # Pick the columns that carry numbers
    measures = _numeric_candidate_columns(table)
    if not measures:
        return _empty()

    # Period + numeric columns only; rows without a date are dropped
    frame = pd.DataFrame({"period": pd.to_datetime(period, errors="coerce")})
    for name in measures:
        frame[name] = pd.to_numeric(table[name], errors="coerce")
    frame = frame.dropna(subset=["period"])
    if frame.empty:
        return _empty()

    # Wide -> long, tag the origin, tidy measure names, drop missing values
    long = frame.melt(id_vars="period", value_vars=measures, var_name="measure", value_name="value")
    long["source_file"] = path.name
    long["measure"] = long["measure"].str.replace(r"__+", "_", regex=True).str.strip("_")
    long = long.dropna(subset=["value"]).sort_values(["measure", "period"]).reset_index(drop=True)

    return long[out_cols]

# -------------------------------------------------------------------
# 3) Process all raw files -> interim tidy CSVs + audit
# -------------------------------------------------------------------
patterns = ["*.csv", "*.xlsx", "*.xls"]
raw_files = []
for pat in patterns:
    # Office drops "~$<name>.xlsx" lock files beside open workbooks; they match
    # the glob but are not readable spreadsheets, so they are left out.
    raw_files.extend(hit for hit in DATA_RAW.rglob(pat) if not hit.name.startswith("~$"))

if not raw_files:
    print("No files found in /data/raw yet. Place your originals there and re-run.")

# Fixed schema so the audit CSV carries a header even when nothing was processed.
AUDIT_COLUMNS = [
    "source_file", "tidy_rows", "measures", "period_min", "period_max",
    "freq_guess", "missing_values",
]

all_tidy = []
audit_rows = []
used_out_names = set()

for f in sorted(raw_files):
    tidy_df = tidy_any(f)

    # Default name is "<stem>_tidy.csv". When a second input shares that stem
    # (report.csv + report.xlsx, or the same name in two sub-folders), derive
    # the name from the relative path and extension instead of overwriting
    # the first file's interim output.
    out_name = f"{f.stem}_tidy.csv"
    if out_name in used_out_names:
        rel_parts = f.relative_to(DATA_RAW).with_suffix("").parts
        out_name = "_".join(rel_parts) + f"_{f.suffix.lstrip('.').lower()}_tidy.csv"
    used_out_names.add(out_name)

    out_path = DATA_INTERIM / out_name
    tidy_df.to_csv(out_path, index=False)

    # compute audit stats
    if tidy_df.empty:
        audit_rows.append({
            "source_file": f.name,
            "tidy_rows": 0,
            "measures": 0,
            "period_min": pd.NaT,
            "period_max": pd.NaT,
            "freq_guess": "unknown",
            "missing_values": 0
        })
    else:
        freq = _guess_freq(tidy_df["period"])
        audit_rows.append({
            "source_file": f.name,
            "tidy_rows": len(tidy_df),
            "measures": tidy_df["measure"].nunique(),
            "period_min": tidy_df["period"].min(),
            "period_max": tidy_df["period"].max(),
            "freq_guess": freq,
            # tidy_any drops rows with a missing value, so this counts what is
            # left in the tidy frame, not gaps in the raw file.
            "missing_values": tidy_df["value"].isna().sum()
        })
        all_tidy.append(tidy_df)

audit = pd.DataFrame(audit_rows, columns=AUDIT_COLUMNS)
audit_path = DATA_PROCESSED / "audit_summary.csv"
audit.to_csv(audit_path, index=False)
print(f"Audit summary written -> {audit_path}")

# -------------------------------------------------------------------
# 4) Build master panel (concat all tidy files)
# -------------------------------------------------------------------
if all_tidy:
    # Stack every tidy frame, drop exact duplicate rows, and order by series.
    master = pd.concat(all_tidy, ignore_index=True).drop_duplicates()
    master["period"] = pd.to_datetime(master["period"], errors="coerce")
    master = master.dropna(subset=["period"]).sort_values(["measure","period"]).reset_index(drop=True)

    master_path = DATA_PROCESSED / "master_panel.csv"
    master.to_csv(master_path, index=False)
    print(f"Master panel written -> {master_path} ({len(master):,} rows)")
else:
    print("No tidy data produced; master panel not created.")

Project root: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code
Raw: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\raw
Interim: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\interim
Processed: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\processed
No files found in /data/raw yet. Place your originals there and re-run.
Audit summary written -> C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\processed\audit_summary.csv
No tidy data produced; master panel not created.


In [2]:
# Phase A — Data audit & single “master” panel

from pathlib import Path
import pandas as pd
import numpy as np
import warnings
import re

# -------------------------------------------------------------------
# 0) Setup & paths
# -------------------------------------------------------------------
# Optional: silence the "Could not infer format" warnings raised while parsing dates
warnings.filterwarnings("ignore", message="Could not infer format")

# Every data folder hangs off the current working directory, so the notebook
# has to be started from the project root.
PROJ_ROOT = Path.cwd().resolve()
DATA_RAW, DATA_INTERIM, DATA_PROCESSED = (
    PROJ_ROOT / "data" / stage for stage in ("raw", "interim", "processed")
)

# Create the folder tree up front; folders that already exist are left alone.
for p in (DATA_RAW, DATA_INTERIM, DATA_PROCESSED):
    p.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious.
for label, folder in (
    ("Project root:", PROJ_ROOT),
    ("Raw:", DATA_RAW),
    ("Interim:", DATA_INTERIM),
    ("Processed:", DATA_PROCESSED),
):
    print(label, folder)

# -------------------------------------------------------------------
# 1) Helpers: column cleanup, date detection, value detection
# -------------------------------------------------------------------
def _clean_cols(df):
    df = df.copy()
    df.columns = (
        df.columns.astype(str)
        .str.strip()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^\w_]+", "", regex=True)
        .str.lower()
    )
    return df

def _parse_year_quarter_like(s):
    """
    Try parsing common Y-Q formats:
      '2020 Q1', '2020Q1', '2020 Quarter 1', 'Q1 2020'
    Returns pandas datetime (end of quarter).
    """
    txt = s.astype(str).str.strip()

    # Normalize various quarter tokens to a uniform form
    txt = (
        txt.str.replace(r"quarter\s*", "Q", flags=re.I, regex=True)
           .str.replace(r"\s+", "", regex=True)
    )

    # Convert 'Q12020' -> '2020Q1'
    txt = txt.str.replace(r"^Q([1-4])(\d{4})$", r"\2Q\1", regex=True)

    # Keep those that look like 'YYYYQq'
    mask = txt.str.match(r"^\d{4}Q[1-4]$", na=False)
    yq = pd.Series(pd.NaT, index=s.index, dtype="datetime64[ns]")
    if mask.any():
        per = pd.PeriodIndex(txt[mask], freq="Q")
        yq.loc[mask] = per.to_timestamp(how="end")
    return yq

def _try_parse_year_quarter_cols(df):
    """
    If there are separate 'year' and 'quarter' columns, build a datetime series
    at quarter-end.
    """
    year_cols = [c for c in df.columns if c in {"year", "yr"}]
    q_cols = [c for c in df.columns if c in {"quarter", "qtr", "q"}]
    if year_cols and q_cols:
        yc, qc = year_cols[0], q_cols[0]
        y = pd.to_numeric(df[yc], errors="coerce")
        q = pd.to_numeric(df[qc], errors="coerce")
        ok = y.notna() & q.notna() & q.between(1, 4)
        per = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
        if ok.any():
            idx = ok[ok].index
            per.loc[idx] = pd.PeriodIndex(
                (y.loc[idx].astype(int).astype(str) + "Q" + q.loc[idx].astype(int).astype(str)).values,
                freq="Q"
            ).to_timestamp(how="end")
        return per
    return pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

def _first_datetime_col(df):
    """
    Find/construct a usable datetime column (monthly/quarterly/annual).
    Returns a datetime Series or None.
    """
    # Try explicit date-like columns first
    candidates = ["period", "date", "time", "month"]
    for c in candidates:
        if c in df.columns:
            dt = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
            if dt.notna().sum() >= max(3, int(0.4 * len(df))):
                return dt

    # Try a single 'year' column (assume Dec-31)
    if "year" in df.columns:
        y = pd.to_numeric(df["year"], errors="coerce")
        if y.notna().any():
            return pd.to_datetime(y.astype("Int64").astype(str) + "-12-31", errors="coerce")

    # Try year+quarter columns
    yq_cols = _try_parse_year_quarter_cols(df)
    if yq_cols.notna().sum() >= max(3, int(0.4 * len(df))):
        return yq_cols

    # Try generic text quarter formats in any object col
    for c in df.columns:
        if df[c].dtype == "object":
            yq_free = _parse_year_quarter_like(df[c])
            if yq_free.notna().sum() >= max(3, int(0.4 * len(df))):
                return yq_free

    # Brute force: try every column via to_datetime
    for c in df.columns:
        dt = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
        if dt.notna().sum() >= max(3, int(0.4 * len(df))):
            return dt
    return None

def _numeric_candidate_columns(df):
    exclude = {"period", "date", "time", "month", "quarter", "qtr", "year", "yr", "q"}
    num_cols = []
    for c in df.columns:
        if c in exclude:
            continue
        s = pd.to_numeric(df[c], errors="coerce")
        if s.notna().sum() >= max(3, int(0.4 * len(df))):
            num_cols.append(c)
    return num_cols

def _guess_freq(period_series):
    """
    Roughly guess frequency from median day gaps.
    """
    s = pd.to_datetime(period_series.dropna(), errors="coerce").sort_values().unique()
    if len(s) < 3:
        return "unknown"

    s = pd.Series(s).astype("datetime64[ns]")
    deltas = s.diff().iloc[1:]  # TimedeltaIndex/Series
    days = deltas / np.timedelta64(1, "D")  # float days
    med = float(np.nanmedian(days))

    if 27 <= med <= 35:
        return "monthly"
    if 80 <= med <= 100:
        return "quarterly"
    if 360 <= med <= 370:
        return "annual"
    return "irregular"

# -------------------------------------------------------------------
# 2) Read any file (csv/xlsx/xls) and tidy to long format
# -------------------------------------------------------------------
def read_any(path):
    """Load a CSV or Excel file into a DataFrame.

    CSV files are read with the default encoding first and retried as
    latin-1 on a decoding error. Excel files (.xlsx/.xls) are read from
    their first sheet with warnings muted. Any failure, including an
    unsupported extension, is re-raised as RuntimeError naming the file.
    """
    ext = path.suffix.lower()
    try:
        if ext in {".xlsx", ".xls"}:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # sheet_name=0 by default; override later via per-file logic if needed
                return pd.read_excel(path, sheet_name=0)

        if ext != ".csv":
            raise ValueError(f"Unsupported file type: {ext}")

        try:
            return pd.read_csv(path)
        except UnicodeDecodeError:
            return pd.read_csv(path, encoding="latin-1")
    except Exception as e:
        raise RuntimeError(f"Failed to read {path.name}: {e}") from e

def tidy_any(path):
    """Read one raw file and reshape it to long format.

    Returns a DataFrame with the columns source_file, measure, period and
    value, sorted by measure then period. An empty frame with those columns
    is returned when the file has no data, no usable date column, or no
    numeric measure columns.
    """
    out_cols = ["source_file", "measure", "period", "value"]

    def _empty():
        return pd.DataFrame(columns=out_cols)

    # Load, normalise headers, and discard fully empty columns/rows
    table = _clean_cols(read_any(path))
    table = table.dropna(axis=1, how="all").dropna(axis=0, how="all")
    if table.empty:
        return _empty()

    # Work out the time axis
    period = _first_datetime_col(table)
    if period is None:
        if "year" not in table.columns:
            # cannot structure as time series
            return _empty()
        # Last resort: a 'year' column, even if sparse (assume Dec-31)
        years = pd.to_numeric(table["year"], errors="coerce")
        period = pd.to_datetime(years.astype("Int64").astype(str) + "-12-31", errors="coerce")

    # Pick the columns that carry numbers
    measures = _numeric_candidate_columns(table)
    if not measures:
        return _empty()

    # Period + numeric columns only; rows without a date are dropped
    frame = pd.DataFrame({"period": pd.to_datetime(period, errors="coerce")})
    for name in measures:
        frame[name] = pd.to_numeric(table[name], errors="coerce")
    frame = frame.dropna(subset=["period"])
    if frame.empty:
        return _empty()

    # Wide -> long, tag the origin, tidy measure names, drop missing values
    long = frame.melt(id_vars="period", value_vars=measures, var_name="measure", value_name="value")
    long["source_file"] = path.name
    long["measure"] = long["measure"].str.replace(r"__+", "_", regex=True).str.strip("_")
    long = long.dropna(subset=["value"]).sort_values(["measure", "period"]).reset_index(drop=True)

    return long[out_cols]

# -------------------------------------------------------------------
# 3) Process all raw files -> interim tidy CSVs + audit
# -------------------------------------------------------------------
patterns = ["*.csv", "*.xlsx", "*.xls"]
raw_files = []
for pat in patterns:
    # Office drops "~$<name>.xlsx" lock files beside open workbooks; they match
    # the glob but are not readable spreadsheets, so they are left out.
    raw_files.extend(hit for hit in DATA_RAW.rglob(pat) if not hit.name.startswith("~$"))

if not raw_files:
    print("No files found in /data/raw yet. Place your originals there and re-run.")

# Fixed schema so the audit CSV carries a header even when nothing was processed.
AUDIT_COLUMNS = [
    "source_file", "tidy_rows", "measures", "period_min", "period_max",
    "freq_guess", "missing_values",
]

all_tidy = []
audit_rows = []
used_out_names = set()

for f in sorted(raw_files):
    tidy_df = tidy_any(f)

    # Default name is "<stem>_tidy.csv". When a second input shares that stem
    # (report.csv + report.xlsx, or the same name in two sub-folders), derive
    # the name from the relative path and extension instead of overwriting
    # the first file's interim output.
    out_name = f"{f.stem}_tidy.csv"
    if out_name in used_out_names:
        rel_parts = f.relative_to(DATA_RAW).with_suffix("").parts
        out_name = "_".join(rel_parts) + f"_{f.suffix.lstrip('.').lower()}_tidy.csv"
    used_out_names.add(out_name)

    out_path = DATA_INTERIM / out_name
    tidy_df.to_csv(out_path, index=False)

    # compute audit stats
    if tidy_df.empty:
        audit_rows.append({
            "source_file": f.name,
            "tidy_rows": 0,
            "measures": 0,
            "period_min": pd.NaT,
            "period_max": pd.NaT,
            "freq_guess": "unknown",
            "missing_values": 0
        })
    else:
        freq = _guess_freq(tidy_df["period"])
        audit_rows.append({
            "source_file": f.name,
            "tidy_rows": len(tidy_df),
            "measures": tidy_df["measure"].nunique(),
            "period_min": tidy_df["period"].min(),
            "period_max": tidy_df["period"].max(),
            "freq_guess": freq,
            # tidy_any drops rows with a missing value, so this counts what is
            # left in the tidy frame, not gaps in the raw file.
            "missing_values": tidy_df["value"].isna().sum()
        })
        all_tidy.append(tidy_df)

audit = pd.DataFrame(audit_rows, columns=AUDIT_COLUMNS)
audit_path = DATA_PROCESSED / "audit_summary.csv"
audit.to_csv(audit_path, index=False)
print(f"Audit summary written -> {audit_path}")

# -------------------------------------------------------------------
# 4) Build master panel (concat all tidy files)
# -------------------------------------------------------------------
if all_tidy:
    # Stack every tidy frame, drop exact duplicate rows, and order by series.
    master = pd.concat(all_tidy, ignore_index=True).drop_duplicates()
    master["period"] = pd.to_datetime(master["period"], errors="coerce")
    master = master.dropna(subset=["period"]).sort_values(["measure","period"]).reset_index(drop=True)

    master_path = DATA_PROCESSED / "master_panel.csv"
    master.to_csv(master_path, index=False)
    print(f"Master panel written -> {master_path} ({len(master):,} rows)")
else:
    print("No tidy data produced; master panel not created.")

Project root: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code
Raw: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\raw
Interim: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\interim
Processed: C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\processed
Audit summary written -> C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\processed\audit_summary.csv
Master panel written -> C:\Users\Aniruddha\Desktop\Business Project_BEMM466\Project Code\data\processed\master_panel.csv (1,774 rows)


## Augmented Master Panel (real MoD via YBGB, per‑capita, and QoQ/YoY growth)

In [3]:
# Robust, cross‑platform augmentation script
# — Builds real MoD series from the GDP implied deflator (YBGB)
# — Adds per‑capita versions
# — Adds QoQ and YoY growth rates for GDP (real) and MoD (real total/cap/current)
# Works with a LONG master (period, measure, value) or a WIDE master.
# Never crashes on missing inputs: it will compute what’s feasible and print a summary.

from __future__ import annotations
from pathlib import Path
from glob import glob
import pandas as pd
import numpy as np

# ----------------------------- Paths & I/O ------------------------------

DATA_PROCESSED = Path("data") / "processed"
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Glob patterns under which a master panel may live: each folder is combined
# with the dated and the plain file name. The order is only cosmetic, because
# _latest_match returns the most recently modified hit across all patterns.
# The absolute /mnt/data folder is harmless on Windows (no hits if absent).
_MASTER_DIRS = ("/mnt/data/", "data/processed/", "")
_MASTER_FILES = ("master_panel_*.csv", "master_panel.csv")
CANDIDATE_INPUT_PATTERNS = [
    folder + name for folder in _MASTER_DIRS for name in _MASTER_FILES
]

def _latest_match(patterns: list[str]) -> Path | None:
    """Return the most recently modified existing file among the glob patterns (cross‑platform, supports absolute)."""
    candidates: list[Path] = []
    for pat in patterns:
        for hit in glob(pat):
            p = Path(hit)
            if p.exists():
                candidates.append(p)
    if not candidates:
        return None
    return max(candidates, key=lambda p: p.stat().st_mtime)

# Pick the newest master panel on disk; stop with a clear message if none exists.
inp = _latest_match(CANDIDATE_INPUT_PATTERNS)
if inp is None:
    expected = "\n  ".join(CANDIDATE_INPUT_PATTERNS)
    raise FileNotFoundError(f"No master panel found. Expected one of:\n  {expected}")

print(f"Loaded master: {inp}")
raw = pd.read_csv(inp)

# ----------------------------- Utilities --------------------------------

def _norm(name: str) -> str:
    """Normalize a column/label: lowercase and keep only a-z0-9."""
    return "".join(ch for ch in str(name).lower() if ch.isalnum())

def _build_norm_map(cols) -> dict[str, str]:
    """Build a lookup from normalized name to original column name.

    Names that normalize to an empty string are skipped; when several
    columns normalize to the same key, the first one encountered is kept.
    """
    mapping: dict[str, str] = {}
    for original in cols:
        key = _norm(original)
        if key:
            mapping.setdefault(key, original)
    return mapping

def _find_col(df: pd.DataFrame, candidates: list[str]) -> str | None:
    """Return the column of `df` matching the earliest candidate synonym.

    Both sides are compared after normalization, and only exact normalized
    matches count. Returns None when no candidate matches.
    """
    by_norm = _build_norm_map(df.columns)
    return next(
        (by_norm[key] for key in map(_norm, candidates) if key in by_norm),
        None,
    )

def _ensure_quarter_index(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure index is PeriodIndex(Q) named 'quarter'."""
    if isinstance(df.index, pd.PeriodIndex) and df.index.freqstr and df.index.freqstr.upper().startswith("Q"):
        out = df.sort_index().copy()
        out.index.name = "quarter"
        return out

    # try typical date/period columns
    for pcol in ["quarter","period","date","time","obs_date","year_quarter","yearq","qtr"]:
        if pcol in df.columns:
            series = df[pcol].astype(str)
            # try parse 'YYYYQn' first, else datetime→to_period('Q')
            try:
                idx = pd.PeriodIndex(series, freq="Q")
            except Exception:
                idx = pd.to_datetime(series, errors="coerce").dt.to_period("Q")
            out = df.set_index(idx).sort_index()
            out.index.name = "quarter"
            return out

    raise ValueError("No quarterly period column found (expected one of: quarter/period/date/time/obs_date/year_quarter/yearq/qtr).")

def _pivot_if_long(df: pd.DataFrame) -> pd.DataFrame:
    """Return a WIDE, quarter-indexed frame from a LONG or WIDE input.

    A frame with 'measure' and 'value' columns (any letter case) is treated
    as LONG and pivoted to one column per measure. When several rows share
    a quarter and measure, the first value is kept. Any other frame is only
    given a quarterly index.
    """
    # str() guards against non-string column labels, which have no .lower().
    lower = {str(c).lower() for c in df.columns}
    if not {"measure", "value"}.issubset(lower):
        return _ensure_quarter_index(df)

    # standardize case
    tmp = df.rename(columns={c: str(c).lower() for c in df.columns})
    tmp = _ensure_quarter_index(tmp)

    # Pivot from just the two needed columns. A leftover column that is also
    # called 'quarter' would otherwise clash with the index name and make the
    # pivot key ambiguous.
    long = tmp[["measure", "value"]].copy()
    long.index.name = "quarter"
    wide = long.pivot_table(index="quarter", columns="measure", values="value", aggfunc="first")
    wide.columns.name = None
    return wide

def _coerce_numeric(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s.astype(str).str.replace(",", ""), errors="coerce")

def _deflator_factor(deflator: pd.Series) -> pd.Series:
    """Turn a deflator series into a multiplicative factor.

    A series whose median exceeds 10 is taken to be an index (e.g. 2019=100)
    and divided by 100; otherwise it is returned as parsed.

    Raises:
        ValueError: if the series contains no numeric values.
    """
    values = _coerce_numeric(deflator)
    typical = values.median(skipna=True)
    if pd.isna(typical):
        raise ValueError("Deflator has no numeric values.")
    if typical > 10:
        return values / 100.0
    return values

def _detect_population_scale(pop: pd.Series) -> int:
    """If median < 1e6 assume thousands → ×1000; else persons."""
    med = pop.median(skipna=True)
    if pd.isna(med):
        return 1
    return 1000 if med < 1_000_000 else 1

def _pct_change(s: pd.Series, periods: int) -> pd.Series:
    """Growth of `s` relative to the row `periods` positions earlier.

    The result is a fraction (0.02 means +2%), not scaled by 100. Infinite
    results, e.g. from a zero base value, are replaced by NaN.
    """
    current = _coerce_numeric(s)
    previous = _coerce_numeric(s.shift(periods))
    growth = current.div(previous).sub(1)
    return growth.replace([np.inf, -np.inf], np.nan)

def _safe_add(df: pd.DataFrame, name: str, series: pd.Series) -> None:
    df[name] = series.reindex(df.index)

# ----------------------- Normalize (long→wide) -------------------------

# Bring the master into WIDE form on a quarterly index.
df = _pivot_if_long(raw)

# ------------------- Candidate synonyms for columns --------------------

# Accepted spellings per logical series. Within each list the earlier entry
# wins when several are present in the data.
CANDS = {
    "gdp_real": [
        "gdp_real_abmi", "abmi", "gdp_cvm_abmi", "gdp_cvm", "gdp_chain_volume",
        "gdp_volume", "gdp_real", "real_gdp", "rgdp", "gdpr",
    ],
    "gdp_nominal": [
        "gdp_nominal_ybha", "ybha", "gdp_current_prices", "gdp_nominal",
        "ngdp", "gdp_value", "gdp_cp", "gdp_curr",
    ],
    "deflator": [
        "gdp_deflator_ybgb", "ybgb", "gdp_deflator", "gdp_deflator_index",
        "gdp_implied_deflator", "implied_deflator_ybgb", "ybgb_q",
        "gdp_deflator_index_2019_100",
    ],
    "population": [
        "population_q", "population", "uk_population_q", "pop_q", "pop",
        "uk_pop", "population_thousands", "population_millions",
    ],
    "mod_total_nom": [
        "mod_total_nom", "mod_total_nominal", "mod_total_current_prices",
        "mod_total", "defence_total_nom", "defence_total",
    ],
    "mod_cap_nom": [
        "mod_cap_nom", "mod_capital_nominal", "mod_capital", "mod_capex_nom",
        "defence_capital_nom", "mod_equipment_nom",
    ],
    "mod_cur_nom": [
        "mod_cur_nom", "mod_current_nominal", "mod_current", "mod_recurrent_nom",
        "defence_current_nom", "mod_resource_nom",
    ],
}

# Resolve every logical series to an actual column (None when absent).
(
    gdp_real_col,
    gdp_nom_col,
    defl_col,
    pop_col,
    mod_tot_col,
    mod_cap_col,
    mod_cur_col,
) = (
    _find_col(df, CANDS[key])
    for key in (
        "gdp_real", "gdp_nominal", "deflator", "population",
        "mod_total_nom", "mod_cap_nom", "mod_cur_nom",
    )
)

# Run log, printed once at the end of the script. Labels are padded to a
# common width of 15 characters so the matched names line up.
messages = ["Matched columns (None means not found):"]
messages += [
    f"  • {label:<15}{col}"
    for label, col in (
        ("GDP real:", gdp_real_col),
        ("GDP nominal:", gdp_nom_col),
        ("GDP deflator:", defl_col),
        ("Population:", pop_col),
        ("MoD total CP:", mod_tot_col),
        ("MoD capex CP:", mod_cap_col),
        ("MoD current CP:", mod_cur_col),
    )
]

# -------------------------- Compute series -----------------------------

# Price factor used for deflating; stays None when no deflator column exists.
# Gaps in the deflator are filled forward, then backward, before conversion.
if defl_col is None:
    price_factor = None
    messages.append("  ! Deflator missing → cannot deflate nominal series.")
else:
    price_factor = _deflator_factor(df[defl_col].ffill().bfill())

# Real GDP: use an existing real series if there is one, otherwise derive it
# as nominal / deflator. gdp_real_name records which column holds the result.
gdp_real_name = None
if gdp_real_col is not None:
    _safe_add(df, gdp_real_col, _coerce_numeric(df[gdp_real_col]))
    gdp_real_name = gdp_real_col
elif gdp_nom_col is not None and price_factor is not None:
    _safe_add(df, "gdp_real_abmi", _coerce_numeric(df[gdp_nom_col]) / price_factor)
    gdp_real_name = "gdp_real_abmi"
    messages.append(f"  • Derived real GDP as `{gdp_real_name}` from {gdp_nom_col}/{defl_col}.")
else:
    messages.append("  ! Could not obtain real GDP (need either an existing real series OR nominal+deflator).")
# Real MoD series (deflate with YBGB)
def _deflate_mod(nom_col: str | None, out_name: str):
    """Add `out_name` to the module-level `df` as nominal series / price factor.

    Nothing is added when the nominal column or the price factor is missing;
    either way a line describing the outcome is appended to `messages`.
    """
    problem = None
    if nom_col is None:
        problem = f"  ! Missing nominal series for {out_name} → skipped."
    elif price_factor is None:
        problem = f"  ! Missing deflator → skipped {out_name}."

    if problem is not None:
        messages.append(problem)
        return

    real_values = _coerce_numeric(df[nom_col]) / price_factor
    _safe_add(df, out_name, real_values)
    messages.append(f"  • Created {out_name} from {nom_col}/{defl_col}.")

# Deflate the three nominal MoD series (each call logs its own outcome).
_deflate_mod(mod_tot_col, "mod_total_real")
_deflate_mod(mod_cap_col, "mod_cap_real")
_deflate_mod(mod_cur_col, "mod_cur_real")

# Per-capita versions of real GDP and of whichever real MoD series exist.
if pop_col is None:
    messages.append("  ! Population missing → skipped per‑capita calculations.")
else:
    # Fill gaps in the population, then rescale it to a head count.
    persons = _coerce_numeric(df[pop_col]).ffill().bfill()
    scale = _detect_population_scale(persons)
    persons = persons * scale

    if gdp_real_name is not None:
        _safe_add(df, "gdp_real_pc", _coerce_numeric(df[gdp_real_name]) / persons)
        messages.append("  • Added gdp_real_pc.")

    for base in ["mod_total_real","mod_cap_real","mod_cur_real"]:
        if base not in df.columns:
            continue
        _safe_add(df, f"{base}_pc", _coerce_numeric(df[base]) / persons)
        messages.append(f"  • Added {base}_pc.")
# QoQ & YoY growth for GDP (real) and MoD (real)
def _add_growth(col: str):
    """Add `<col>_qoq_pct` (1-row lag) and `<col>_yoy_pct` (4-row lag) to `df`.

    If `col` is not in `df`, nothing is added. Either way the outcome is
    appended to `messages`.
    """
    if col not in df.columns:
        messages.append(f"  ! Growth skipped (missing series): {col}")
        return

    for tag, lag in (("qoq", 1), ("yoy", 4)):
        df[f"{col}_{tag}_pct"] = _pct_change(df[col], lag)
    messages.append(f"  • Added QoQ/YoY growth for {col}.")

# Growth rates for real GDP (when available) and the three real MoD series.
if gdp_real_name is not None:
    _add_growth(gdp_real_name)
for base in ["mod_total_real","mod_cap_real","mod_cur_real"]:
    _add_growth(base)

# ------------------------------- Save ----------------------------------

# Outputs: a dated CSV, a dated parquet (best effort) and a "latest" CSV.
# NOTE: these names match the master_panel_*.csv input patterns, so a later
# run of this cell may pick up its own output as the newest master.
today = pd.Timestamp.today().date()
out_csv = DATA_PROCESSED / f"master_panel_{today:%Y%m%d}.csv"
out_par = DATA_PROCESSED / f"master_panel_{today:%Y%m%d}.parquet"
latest  = DATA_PROCESSED / "master_panel_latest.csv"

df.to_csv(out_csv, index=True)

# Parquet needs an optional engine (pyarrow/fastparquet). A failure must not
# abort the run, but it is reported instead of being swallowed silently.
parquet_written = False
try:
    df.to_parquet(out_par)
    parquet_written = True
except Exception as exc:
    messages.append(f"  ! Parquet export skipped ({type(exc).__name__}: {exc}).")

df.to_csv(latest, index=True)

print("\n".join(messages))
print("\nSaved:")
print(f"  - {out_csv}")
# List the parquet only when THIS run wrote it; checking for the file on
# disk would also report a stale file left by an earlier run the same day.
if parquet_written:
    print(f"  - {out_par}")
print(f"  - {latest}")

Loaded master: data\processed\master_panel.csv
Matched columns (None means not found):
  • GDP real:      None
  • GDP nominal:   None
  • GDP deflator:  None
  • Population:    None
  • MoD total CP:  None
  • MoD capex CP:  None
  • MoD current CP:None
  ! Deflator missing → cannot deflate nominal series.
  ! Could not obtain real GDP (need either an existing real series OR nominal+deflator).
  ! Missing nominal series for mod_total_real → skipped.
  ! Missing nominal series for mod_cap_real → skipped.
  ! Missing nominal series for mod_cur_real → skipped.
  ! Population missing → skipped per‑capita calculations.
  ! Growth skipped (missing series): mod_total_real
  ! Growth skipped (missing series): mod_cap_real
  ! Growth skipped (missing series): mod_cur_real

Saved:
  - data\processed\master_panel_20250902.csv
  - data\processed\master_panel_20250902.parquet
  - data\processed\master_panel_latest.csv
