# Dynamic Tag Analysis (Qwen codebook labels)

This notebook computes **temporal and channel-level dynamics** over Qwen-assigned, codebook-constrained rhetorical tags:
- `theme_cb`
- `claim_types_cb`
- `ctas_cb`
- `evidence_cb`

**Important scope note:** `combined_proba` is treated as a continuous MBFC-informed credibility-risk proxy used only for **risk-weighted descriptive summaries**. This notebook does **not** implement monitoring/alerting, nor does it interpret the score as a probability of misinformation.


In [None]:
from __future__ import annotations

import ast
import json
import math
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from scipy.spatial.distance import jensenshannon
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize


# Case-insensitive detector for link-like substrings: an http:// or https:// scheme,
# a "www." prefix, or a "t.me/" path.
# The alternation is wrapped in a NON-capturing group: pandas' ``Series.str.contains``
# emits a "pattern ... has match groups" UserWarning for patterns with capture groups,
# and nothing here needs the captured text.
URL_PAT = re.compile(r"(?:http://|https://|www\.|t\.me/)", re.IGNORECASE)


def ensure_dirs(out_dir: Path) -> None:
    """Create the notebook's output sub-folders under *out_dir*.

    Missing parents are created as needed and folders that already exist
    are left untouched.
    """
    for subfolder in ("artifacts", "tables", "figures", "report"):
        (out_dir / subfolder).mkdir(parents=True, exist_ok=True)


def split_multi(s: object) -> list[str]:
    """Split a comma-separated multi-label cell into a list of clean labels.

    ``None``, blank text, and the placeholders ``nan`` / ``NA`` (any case)
    give an empty list. Individual labels are whitespace-stripped, and empty
    pieces or ``NA`` pieces (any case) are dropped.
    """
    if s is None:
        return []

    text = str(s).strip()
    if not text or text.lower() == "nan" or text.upper() == "NA":
        return []

    labels: list[str] = []
    for piece in text.split(","):
        label = piece.strip()
        if label and label.upper() != "NA":
            labels.append(label)
    return labels


def sanitize_token(s: object) -> str:
    """Turn a tag label into a single whitespace-free token.

    Typographic dashes become ASCII ``-``, runs of whitespace become ``_``,
    and any run of characters other than word characters, ``-`` or ``/``
    is collapsed into one ``_``.
    """
    token = str(s)
    for dash in ("‑", "–", "—"):
        token = token.replace(dash, "-")
    token = re.sub(r"\s+", "_", token.strip())
    return re.sub(r"[^\w\-/]+", "_", token)


def build_tag_doc(theme: object, claim: object, cta: object, evid: object) -> str:
    """Build a space-separated pseudo-document of ``FIELD=token`` items.

    The theme always contributes exactly one ``THEME=`` token (it is not
    split). The other three fields are split with ``split_multi`` and each
    resulting label contributes one ``CLAIM=`` / ``CTA=`` / ``EVID=`` token.
    """
    tokens = ["THEME=" + sanitize_token(theme)]
    for prefix, cell in (("CLAIM=", claim), ("CTA=", cta), ("EVID=", evid)):
        tokens.extend(prefix + sanitize_token(label) for label in split_multi(cell))
    return " ".join(tokens)


def shannon_entropy(p: Iterable[float]) -> float:
    """Return the Shannon entropy (natural log) of the weights in *p*.

    Non-positive entries are ignored; the values are used as given (no
    re-normalisation). Returns 0.0 when no positive entry remains.
    """
    weights = np.asarray(list(p), dtype=float)
    positive = weights[weights > 0]
    if positive.size == 0:
        return 0.0
    surprisal = positive * np.log(positive)
    return float(-surprisal.sum())


def coverage_curve(counts: pd.Series, ks: Iterable[int] = (1, 5, 10, 20, 50, 100, 200, 500, 1000)) -> pd.DataFrame:
    """Share of the total count covered by the first *k* entries of *counts*.

    *counts* is read in its existing order via ``head(k)`` (callers pass
    ``value_counts()`` output). Returns one row per *k* with columns ``k``
    and ``coverage``; coverage is 0.0 when the total is not positive.
    """
    grand_total = float(counts.sum())

    def covered(k: int) -> float:
        if grand_total > 0:
            return float(counts.head(k).sum() / grand_total)
        return 0.0

    return pd.DataFrame([{"k": int(k), "coverage": covered(int(k))} for k in ks])


def keep_top_k(df_counts: pd.DataFrame, tag_col: str, k: int) -> pd.DataFrame:
    """Relabel every tag outside the top *k* (by summed ``n``) as ``"Other"``.

    Args:
        df_counts: Long-format counts with a tag column and a numeric ``n`` column.
        tag_col: Name of the tag column to collapse.
        k: Number of highest-total tags to keep under their own label.

    Returns:
        A copy of *df_counts* with *tag_col* rewritten. Rows are NOT
        re-aggregated; callers group again to merge the ``"Other"`` rows.
    """
    out = df_counts.copy()
    top = (
        out.groupby(tag_col, observed=True)["n"]
        .sum()
        .sort_values(ascending=False)
        .head(int(k))
        .index
    )

    col = out[tag_col]
    # A categorical column can only hold declared categories, so "Other" has to be
    # registered before it is assigned. The dtype check uses isinstance because
    # pd.api.types.is_categorical_dtype is deprecated in recent pandas releases.
    if isinstance(col.dtype, pd.CategoricalDtype) and "Other" not in col.cat.categories:
        col = col.cat.add_categories(["Other"])

    out[tag_col] = col.where(col.isin(top), other="Other")
    return out


def js_divergence(p: np.ndarray, q: np.ndarray, eps: float = 1e-12) -> float:
    """Jensen-Shannon divergence between two non-negative weight vectors.

    Each vector is scaled to sum to 1 when its sum is positive, then clipped
    into ``[eps, 1]``. SciPy's ``jensenshannon`` returns the JS *distance*,
    so the result is squared to give the divergence.
    """

    def _prepare(values: np.ndarray) -> np.ndarray:
        arr = np.asarray(values, dtype=float)
        mass = arr.sum()
        if mass > 0:
            arr = arr / mass
        return np.clip(arr, eps, 1)

    distance = jensenshannon(_prepare(p), _prepare(q))
    return float(distance ** 2)  # divergence (not distance)


def herfindahl(p: np.ndarray) -> float:
    """Herfindahl concentration index: the sum of squared shares of *p*.

    The weights are normalised by their sum first; returns 0.0 when that
    sum is zero or negative.
    """
    weights = np.asarray(p, dtype=float)
    total = weights.sum()
    if total <= 0:
        return 0.0
    shares = weights / total
    return float(np.square(shares).sum())


def df_to_markdown(df: pd.DataFrame, floatfmt: str = "{:.4f}") -> str:
    """Render *df* as a pipe-delimited Markdown table (index omitted).

    Args:
        df: Frame to render; column labels are stringified for the header.
        floatfmt: ``str.format`` template applied to float cells.

    Returns:
        The table as one string: header row, ``---`` separator row, then one
        row per record. Missing scalars render as empty cells.
    """

    def render(value: object) -> str:
        # pd.isna covers None, float NaN of any width (np.float32 included),
        # pd.NA and NaT. The is_scalar guard keeps list-like cells away from
        # pd.isna, which would return an array for them.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        if isinstance(value, (float, np.floating)):
            return floatfmt.format(float(value))
        return str(value)

    header = [str(c) for c in df.columns]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend(
        "| " + " | ".join(render(v) for v in row.tolist()) + " |"
        for _, row in df.iterrows()
    )
    return "\n".join(lines)


In [None]:
# --- Configuration ---
# Note: this notebook is designed to work whether you run it from DataAppend/ OR from tag_dynamics/.

# Analysis window applied to `date_final` (UTC) by apply_window():
# the start is inclusive, the end is exclusive, and None leaves the window open-ended.
ANALYSIS_START = "2024-01-01"
ANALYSIS_END: str | None = None

# Messages whose combined_proba is at or above this quantile (within the window) get is_high_tail = 1.
HIGH_TAIL_QUANTILE = 0.95  # descriptive tail definition

# Prototype clustering parameters
# Candidate cluster counts are range(K_MIN, K_MAX + 1, K_STEP); see pick_k().
K_MIN, K_MAX, K_STEP = 10, 40, 5
K_FINAL: int | None = 25  # set None to auto-pick best silhouette
SILHOUETTE_SAMPLE = 5000  # avoid O(n^2) on very large prototype vocabularies

# Channel profiling
# NOTE(review): these three are not referenced by the stages visible in this part of the
# notebook; presumably a later channel-profiling stage consumes them — confirm.
TOPN_CHANNELS = 80
CHANNEL_K_MIN, CHANNEL_K_MAX = 5, 20
CHANNEL_K_FINAL: int | None = 12

# Passed as random_state to KMeans and silhouette_score for reproducible clustering.
RANDOM_SEED = 0

# Recompute knobs
# When False, cached artifacts under OUT_DIR / "artifacts" are reused if they exist.
FORCE_REBUILD_BASE = False
FORCE_RECLUSTER = False


def resolve_paths() -> tuple[Path, Path, Path]:
    """Locate the input CSV and derive the notebook's base/output directories.

    Lookup order:
      1. ``TAGDYN_INPUT_CSV`` environment variable, when it points at an existing path.
      2. ``full_risk_v2_core_final_closed.csv`` in the current working directory.
      3. The same file name in the parent of the current working directory.

    Returns:
        ``(base_dir, input_csv, out_dir)`` where ``out_dir`` is ``base_dir / "outputs"``.

    Raises:
        FileNotFoundError: If none of the three locations yields the CSV.
    """
    import os
    import warnings

    csv_name = "full_risk_v2_core_final_closed.csv"
    cwd = Path.cwd()

    override = os.environ.get("TAGDYN_INPUT_CSV")
    if override:
        override_path = Path(override).expanduser()
        if override_path.exists():
            base_dir = cwd if cwd.name == "tag_dynamics" else cwd / "tag_dynamics"
            return base_dir, override_path, base_dir / "outputs"
        # The fallback lookup is kept, but analysing a different CSV than the one
        # explicitly requested should not happen silently.
        warnings.warn(
            f"TAGDYN_INPUT_CSV={override!r} does not exist; falling back to the default CSV lookup.",
            RuntimeWarning,
            stacklevel=2,
        )

    local_csv = cwd / csv_name
    if local_csv.exists():
        # CSV sits next to us, so tag_dynamics/ is a sub-folder of the cwd.
        base_dir = cwd / "tag_dynamics"
        return base_dir, local_csv, base_dir / "outputs"

    parent_csv = (cwd / f"../{csv_name}").resolve()
    if parent_csv.exists():
        # CSV sits one level up, so the cwd itself is used as the base directory.
        return cwd, parent_csv, cwd / "outputs"

    raise FileNotFoundError(
        "Could not find full_risk_v2_core_final_closed.csv in current directory or parent. "
        "Run from DataAppend/ or tag_dynamics/."
    )


# Resolve the input CSV and output root once; later cells read these module-level names.
BASE_DIR, INPUT_CSV, OUT_DIR = resolve_paths()
# Create the artifacts/tables/figures/report folders under OUT_DIR before anything is written.
ensure_dirs(OUT_DIR)

# Bare expression: echoed as the cell output when run in a notebook.
BASE_DIR, INPUT_CSV, OUT_DIR


In [None]:
# --- Stage 0: Build/load base analysis table ---

# Columns loaded from INPUT_CSV by build_base_table(); all other CSV columns are skipped.
# channel/date come in three variants (filled_*, fuzzy_filled_*, raw) that are coalesced
# in that order of preference; the *_cb columns are the codebook tag fields.
USECOLS = [
    "M#",
    "msg_id",
    "channel",
    "date",
    "message_original",
    "combined_proba",
    "filled_channel",
    "filled_date",
    "fuzzy_filled_channel",
    "fuzzy_filled_date",
    "theme_cb",
    "claim_types_cb",
    "ctas_cb",
    "evidence_cb",
]


def build_base_table(force: bool = False) -> tuple[pd.DataFrame, dict]:
    """Build the per-message analysis table, or load it from the pickle cache.

    When ``base_table.pkl.gz`` exists under ``OUT_DIR / "artifacts"`` and
    *force* is false, the cached frame is returned together with the cached
    metadata (``{}`` if the metadata file is missing). Otherwise the table is
    rebuilt from ``INPUT_CSV``, written to the cache, and returned with
    freshly computed metadata.

    Rows lacking a channel, a parseable date, or a ``combined_proba`` value
    are dropped. The message text is used only to derive ``msg_len`` and
    ``has_url`` and is not carried in the result.
    """
    artifacts_dir = OUT_DIR / "artifacts"
    pickle_file = artifacts_dir / "base_table.pkl.gz"
    meta_file = artifacts_dir / "base_table_meta.json"

    if not force and pickle_file.exists():
        cached = pd.read_pickle(pickle_file)
        cached_meta = {}
        if meta_file.exists():
            cached_meta = json.loads(meta_file.read_text(encoding="utf-8"))
        return cached, cached_meta

    tag_cols = ["theme_cb", "claim_types_cb", "ctas_cb", "evidence_cb"]
    raw = pd.read_csv(INPUT_CSV, usecols=USECOLS, low_memory=False)

    def first_available(*columns: str) -> pd.Series:
        # Left-to-right coalesce: earlier columns win, later ones fill the gaps.
        merged = raw[columns[0]]
        for name in columns[1:]:
            merged = merged.combine_first(raw[name])
        return merged

    raw["channel_final"] = first_available("filled_channel", "fuzzy_filled_channel", "channel")
    raw["date_final_raw"] = first_available("filled_date", "fuzzy_filled_date", "date")
    raw["date_final"] = pd.to_datetime(raw["date_final_raw"], utc=True, errors="coerce", format="mixed")

    usable = raw["channel_final"].notna() & raw["date_final"].notna() & raw["combined_proba"].notna()
    table = raw[usable].copy()
    table["combined_proba"] = table["combined_proba"].astype("float32")

    # Codebook tags (closed vocab): missing cells become the literal "NA"
    for col in tag_cols:
        table[col] = table[col].fillna("NA").astype(str)

    # Time grains
    stamps = table["date_final"]
    table["day"] = stamps.dt.floor("D")
    table["week"] = stamps.dt.to_period("W").astype(str)
    table["month"] = stamps.dt.to_period("M").astype(str)
    table["year"] = stamps.dt.year.astype("int32")

    # Prototype (canonical tag-combo): the four tag strings joined by " || "
    prototype = table[tag_cols[0]]
    for col in tag_cols[1:]:
        prototype = prototype + " || " + table[col]
    table["prototype"] = prototype

    # Structural metadata
    text = table["message_original"].fillna("").astype(str)
    table["msg_len"] = text.str.len().astype("int32")
    table["has_url"] = text.str.contains(URL_PAT).astype("int8")

    # Keep only what downstream stages need (avoid carrying the full text)
    table = table[
        [
            "M#",
            "msg_id",
            "channel_final",
            "date_final",
            "day",
            "week",
            "month",
            "year",
            "combined_proba",
            "theme_cb",
            "claim_types_cb",
            "ctas_cb",
            "evidence_cb",
            "prototype",
            "msg_len",
            "has_url",
        ]
    ].copy()

    # Reduce memory: low-cardinality string columns become categoricals
    for col in ["channel_final", *tag_cols, "prototype", "week", "month"]:
        table[col] = table[col].astype("category")

    table.to_pickle(pickle_file, compression="gzip", protocol=5)

    meta = {
        "input_csv": str(INPUT_CSV),
        "rows": int(len(table)),
        "channels": int(table["channel_final"].nunique()),
        "day_min": str(table["day"].min()),
        "day_max": str(table["day"].max()),
        "cols": list(table.columns),
        "pandas_version": pd.__version__,
    }
    meta_file.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    return table, meta


# Load the cached base table, or rebuild it from the CSV when FORCE_REBUILD_BASE is set / no cache exists.
df_base, base_meta = build_base_table(force=FORCE_REBUILD_BASE)
# Bare expression: echoed as the cell output when run in a notebook.
base_meta


In [None]:
# --- Apply analysis window + define high-tail ---

def apply_window(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* restricted to the configured analysis window.

    Keeps rows with ``date_final >= ANALYSIS_START`` and, when
    ``ANALYSIS_END`` is set, ``date_final < ANALYSIS_END``. Both bounds are
    interpreted as UTC timestamps.
    """
    lower = pd.Timestamp(ANALYSIS_START, tz="UTC")
    windowed = df[df["date_final"] >= lower].copy()
    if ANALYSIS_END is None:
        return windowed

    upper = pd.Timestamp(ANALYSIS_END, tz="UTC")
    return windowed[windowed["date_final"] < upper].copy()


# Working frame for all later stages: the base table restricted to the analysis window.
df = apply_window(df_base)
# High-tail threshold = the HIGH_TAIL_QUANTILE quantile of combined_proba within the window.
thr = float(df["combined_proba"].quantile(HIGH_TAIL_QUANTILE))
# 1 when the score is at or above the threshold (ties included), else 0.
df["is_high_tail"] = (df["combined_proba"] >= thr).astype("int8")

# Bare expression: echoed as the cell output when run in a notebook.
thr, df.shape


In [None]:
# --- Stage 1: Tag audit + high-tail lift (descriptive) ---

def make_tag_table(df: pd.DataFrame, field: str, out_csv: Path, high_tail_col: str = "is_high_tail") -> pd.DataFrame:
    """Summarise score statistics per tag of *field* and write them to *out_csv*.

    ``theme_cb`` is treated as a single-label field and grouped as-is. Any
    other field is split with ``split_multi`` and exploded first, so its
    statistics are over tag instances (a message counts once per tag).

    The result has one row per tag with count, mean/median score, score mass,
    high-tail rate, shares of count and score mass, and ``high_tail_lift``
    (tag high-tail rate divided by the message-level base rate of *df*),
    sorted by lift in descending order.
    """
    if field == "theme_cb":
        rows = df
    else:
        rows = df[[field, "combined_proba", high_tail_col]].copy()
        rows[field] = rows[field].astype(str).apply(split_multi)
        rows = rows.explode(field)
        rows = rows[rows[field].notna() & (rows[field] != "")]
        rows[field] = rows[field].astype(str)

    stats = {
        "n": ("combined_proba", "size"),
        "mean_risk": ("combined_proba", "mean"),
        "median_risk": ("combined_proba", "median"),
        "risk_mass": ("combined_proba", "sum"),
        "high_tail_rate": (high_tail_col, "mean"),
    }
    table = rows.groupby(field, observed=True).agg(**stats).reset_index()
    table = table.rename(columns={field: "tag"})

    table["msg_share"] = table["n"] / table["n"].sum()
    table["risk_mass_share"] = table["risk_mass"] / table["risk_mass"].sum()

    # The lift baseline is always the message-level rate of the un-exploded frame.
    base_tail = float(df[high_tail_col].mean())
    divisor = base_tail if base_tail > 0 else np.nan
    table["high_tail_lift"] = table["high_tail_rate"] / divisor

    table = table.sort_values("high_tail_lift", ascending=False)
    table.to_csv(out_csv, index=False)
    return table


# Run-level audit record: window bounds, row/channel/day counts, score quantiles and the
# high-tail definition actually used. Persisted as JSON next to the other artifacts.
audit = {
    "analysis_start": ANALYSIS_START,
    "analysis_end": ANALYSIS_END,
    "rows": int(len(df)),
    "channels": int(df["channel_final"].nunique()),
    "days": int(df["day"].nunique()),
    "score_quantiles": {
        "q50": float(df["combined_proba"].quantile(0.50)),
        "q90": float(df["combined_proba"].quantile(0.90)),
        "q95": float(df["combined_proba"].quantile(0.95)),
        "q99": float(df["combined_proba"].quantile(0.99)),
        "max": float(df["combined_proba"].max()),
    },
    "high_tail_quantile": float(HIGH_TAIL_QUANTILE),
    "high_tail_threshold": float(thr),
    "high_tail_rate": float(df["is_high_tail"].mean()),
}
(OUT_DIR / "artifacts" / "audit.json").write_text(json.dumps(audit, indent=2), encoding="utf-8")

# One summary table (and CSV) per codebook field.
tags_theme = make_tag_table(df, "theme_cb", OUT_DIR / "tables" / "tags_theme.csv")
tags_claim = make_tag_table(df, "claim_types_cb", OUT_DIR / "tables" / "tags_claim.csv")
tags_cta = make_tag_table(df, "ctas_cb", OUT_DIR / "tables" / "tags_cta.csv")
tags_evid = make_tag_table(df, "evidence_cb", OUT_DIR / "tables" / "tags_evidence.csv")

# Diversity over time: theme entropy per week
# n = messages per (week, theme), N = messages per week, p = within-week theme share.
theme_week = df.groupby(["week", "theme_cb"], observed=True).size().rename("n").reset_index()
total_week = df.groupby("week", observed=True).size().rename("N").reset_index()
theme_week = theme_week.merge(total_week, on="week")
theme_week["p"] = theme_week["n"] / theme_week["N"]
# Shannon entropy (natural log) of the weekly theme shares.
ent = (
    theme_week.groupby("week", observed=True)["p"]
    .apply(lambda s: shannon_entropy(s.values))
    .rename("theme_entropy")
    .reset_index()
)
ent.to_csv(OUT_DIR / "tables" / "theme_entropy_by_week.csv", index=False)

# Bare expression: echoed as the cell output when run in a notebook.
tags_theme.head(10)


In [None]:
# --- Stage 1b: Tag association with high-tail via log-odds z (Dirichlet prior) ---

def log_odds_dirichlet(counts_a: pd.Series, counts_b: pd.Series, alpha: pd.Series) -> pd.DataFrame:
    """
    Monroe et al.-style informative Dirichlet prior log-odds with z-scores.

    counts_a: counts in group A (e.g., high tail)
    counts_b: counts in group B (rest)
    alpha: prior pseudo-count per tag (e.g., global counts)

    Index labels of all three inputs are compared as strings and aligned on
    their union; tags missing from an input count as 0 there. Returns one row
    per tag with ``tag``, ``count_high``, ``count_rest``, ``log_odds`` and
    ``z``, sorted by ``z`` in descending order. The inputs are not modified.
    """

    def _string_indexed(series: pd.Series) -> pd.Series:
        clone = series.copy()
        clone.index = clone.index.map(str)
        return clone

    group_a, group_b, prior = (_string_indexed(s) for s in (counts_a, counts_b, alpha))

    vocab = sorted(set(group_a.index) | set(group_b.index) | set(prior.index))
    group_a = group_a.reindex(vocab, fill_value=0).astype(float)
    group_b = group_b.reindex(vocab, fill_value=0).astype(float)
    prior = prior.reindex(vocab, fill_value=0.0).astype(float)

    total_a = float(group_a.sum())
    total_b = float(group_b.sum())
    total_prior = float(prior.sum())

    # Smoothed count of the tag itself vs. smoothed count of every other tag.
    smoothed_a = group_a + prior
    smoothed_b = group_b + prior
    others_a = (total_a + total_prior) - smoothed_a
    others_b = (total_b + total_prior) - smoothed_b

    # The 1e-12 term keeps the ratio finite when a single tag holds all the mass.
    delta = np.log(smoothed_a / (others_a + 1e-12)) - np.log(smoothed_b / (others_b + 1e-12))
    variance = 1.0 / smoothed_a + 1.0 / smoothed_b
    z_score = delta / np.sqrt(variance)

    table = pd.DataFrame(
        {
            "tag": vocab,
            "count_high": group_a.values.astype(int),
            "count_rest": group_b.values.astype(int),
            "log_odds": delta.values,
            "z": z_score.values,
        }
    )
    return table.sort_values("z", ascending=False).reset_index(drop=True)


def tag_instance_counts(df: pd.DataFrame, field: str, group_col: str = "is_high_tail") -> tuple[pd.Series, pd.Series, pd.Series]:
    """
    Returns (counts_high, counts_rest, prior_alpha) for a tag field.

    ``theme_cb`` is counted per message. Every other field is split with
    ``split_multi`` and exploded, so counts are over tag instances.
    ``prior_alpha`` is the element-wise sum of the two count series as floats
    (a tag absent from one group counts as 0 there).
    """
    if field == "theme_cb":
        instances = df
    else:
        instances = df[[field, group_col]].copy()
        instances[field] = instances[field].astype(str).apply(split_multi)
        instances = instances.explode(field)
        instances = instances[instances[field].notna() & (instances[field] != "")]
        instances[field] = instances[field].astype(str)

    def count_where(flag: int) -> pd.Series:
        selected = instances[instances[group_col] == flag]
        return selected.groupby(field, observed=True).size()

    high = count_where(1)
    rest = count_where(0)
    prior = high.add(rest, fill_value=0).astype(float)
    return high, rest, prior


TAG_FIELDS = ["theme_cb", "claim_types_cb", "ctas_cb", "evidence_cb"]
# Baseline log-odds table per field at the configured high-tail quantile;
# kept in memory so the sensitivity stage can compare other quantiles against it.
logodds_base: dict[str, pd.DataFrame] = {}

for field in TAG_FIELDS:
    c_hi, c_lo, prior = tag_instance_counts(df, field)
    # Prior = pooled (high + rest) counts; the +1 keeps every pseudo-count strictly positive.
    lod = log_odds_dirichlet(c_hi, c_lo, alpha=prior + 1.0)  # +1 smoothing
    lod.to_csv(OUT_DIR / "tables" / f"logodds_{field}.csv", index=False)
    logodds_base[field] = lod

print("Wrote log-odds tables ->", OUT_DIR / "tables")


In [None]:
# --- Stage 1c: Sensitivity to tail definition (quantiles) ---

from scipy.stats import spearmanr

# Tail quantiles to compare against the baseline tables in `logodds_base`.
TAILS = [0.90, 0.95, 0.97, 0.99]
TOP_N = 30  # rows kept per (quantile, field) in the exported top-tag table
TOP_STAB = 10  # size of the stricter top-k set used for the second Jaccard score
TAG_FIELDS = ["theme_cb", "claim_types_cb", "ctas_cb", "evidence_cb"]

if "logodds_base" not in globals():
    # Fallback when the baseline cell was not run in this session: reload its CSVs.
    # Default NA parsing is switched off for these files because "NA" is a legitimate
    # tag value (missing codebook cells are filled with the literal "NA"); read_csv
    # would otherwise turn it into NaN and the tag-level comparisons below would no
    # longer line up with the in-memory tables. Only empty cells are treated as missing.
    logodds_base = {
        f: pd.read_csv(
            OUT_DIR / "tables" / f"logodds_{f}.csv",
            dtype={"tag": str},
            keep_default_na=False,
            na_values=[""],
        )
        for f in TAG_FIELDS
    }


def _jaccard(left: set, right: set) -> float:
    """Jaccard overlap of two sets; NaN when both are empty."""
    union = left | right
    return len(left & right) / len(union) if union else np.nan


results = []
stability = []

for q in TAILS:
    # Re-derive the high-tail flag for this quantile (ties at the threshold count as high).
    thr_q = float(df["combined_proba"].quantile(q))
    is_hi = (df["combined_proba"] >= thr_q).astype("int8")
    n_high = int(is_hi.sum())
    tail_rate = float(is_hi.mean())

    for field in TAG_FIELDS:
        tmp = df[[field]].copy()
        tmp["is_high_tail"] = is_hi.values

        c_hi, c_lo, prior = tag_instance_counts(tmp, field, group_col="is_high_tail")
        lod = log_odds_dirichlet(c_hi, c_lo, alpha=prior + 1.0)
        lod["tail_q"] = float(q)
        lod["thr"] = float(thr_q)

        top = lod.head(TOP_N).copy()
        top["field"] = field
        top["n_high_msgs"] = n_high
        top["tail_rate"] = tail_rate
        results.append(top)

        # Overlap of the highest-z tags with the baseline run, at two cut-offs.
        base = logodds_base[field]
        jaccN = _jaccard(set(base.head(TOP_N)["tag"].astype(str)), set(top["tag"].astype(str)))
        jacc10 = _jaccard(
            set(base.head(TOP_STAB)["tag"].astype(str)),
            set(lod.head(TOP_STAB)["tag"].astype(str)),
        )

        # Rank agreement of z-scores over the tags present in both runs.
        z_base = base.set_index("tag")["z"]
        z_cur = lod.set_index("tag")["z"]
        common = z_base.index.intersection(z_cur.index)
        if len(common) >= 2:
            rho = spearmanr(z_base.loc[common].values, z_cur.loc[common].values).correlation
            rho = float(rho) if rho is not None else np.nan
        else:
            rho = np.nan

        stability.append(
            {
                "field": field,
                "tail_q": float(q),
                "thr": float(thr_q),
                "topN": int(TOP_N),
                "n_high_msgs": n_high,
                "tail_rate": tail_rate,
                "jaccard_topN_vs_base": float(jaccN),
                "jaccard_top10_vs_base": float(jacc10),
                "spearman_z_vs_base": rho,
            }
        )

sens = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
sens.to_csv(OUT_DIR / "tables" / "sensitivity_logodds_top_tags.csv", index=False)

stab = pd.DataFrame(stability)
stab.to_csv(OUT_DIR / "tables" / "sensitivity_logodds_stability.csv", index=False)

print("Wrote sensitivity tables ->", OUT_DIR / "tables")


In [None]:
# --- Stage 2: Temporal dynamics (shares + risk-weighted shares) ---

def dynamics_single(df: pd.DataFrame, time_col: str, tag_col: str, top_k: int, out_prefix: str) -> None:
    """Write per-period share tables for a single-label tag column.

    Two CSVs are written under ``OUT_DIR / "tables"``:
    ``{out_prefix}_{time_col}_share.csv`` (message counts per tag over
    messages per period) and ``{out_prefix}_{time_col}_risk_share.csv``
    (summed ``combined_proba`` per tag over the period's summed score).
    In each table, tags outside that table's own top *top_k* are pooled
    into ``"Other"``.
    """
    keys = [time_col, tag_col]
    tables_dir = OUT_DIR / "tables"

    def collapse(frame: pd.DataFrame, value_col: str) -> pd.DataFrame:
        # keep_top_k ranks on a column literally named "n", so alias the value column for the call.
        ranked = keep_top_k(frame.rename(columns={value_col: "n"}), tag_col, top_k)
        ranked = ranked.rename(columns={"n": value_col})
        return ranked.groupby(keys, observed=True)[value_col].sum().reset_index()

    # Message-count shares
    counts = collapse(df.groupby(keys, observed=True).size().rename("n").reset_index(), "n")
    period_sizes = df.groupby(time_col, observed=True).size().rename("N").reset_index()
    counts = counts.merge(period_sizes, on=time_col)
    counts["share"] = counts["n"] / counts["N"]
    counts.to_csv(tables_dir / f"{out_prefix}_{time_col}_share.csv", index=False)

    # Score-weighted shares
    mass = df.groupby(keys, observed=True)["combined_proba"].sum().rename("risk_mass").reset_index()
    mass = collapse(mass, "risk_mass")
    period_mass = df.groupby(time_col, observed=True)["combined_proba"].sum().rename("R").reset_index()
    mass = mass.merge(period_mass, on=time_col)
    mass["risk_share"] = mass["risk_mass"] / mass["R"]
    mass.to_csv(tables_dir / f"{out_prefix}_{time_col}_risk_share.csv", index=False)


def dynamics_multi(df: pd.DataFrame, time_col: str, field: str, top_k: int, out_prefix: str) -> None:
    """Write per-period share tables for a multi-label tag field.

    The field is split with ``split_multi`` and exploded, so a message
    contributes once per tag it carries. Denominators stay message-level
    (messages per period, and the period's summed ``combined_proba``).
    Two CSVs are written under ``OUT_DIR / "tables"``:
    ``{out_prefix}_{field}_{time_col}_share.csv`` and
    ``{out_prefix}_{field}_{time_col}_risk_share.csv``. In each table, tags
    outside that table's own top *top_k* are pooled into ``"Other"``.
    """
    keys = [time_col, "tag"]
    tables_dir = OUT_DIR / "tables"

    exploded = df[[time_col, field, "combined_proba"]].copy()
    exploded[field] = exploded[field].astype(str).apply(split_multi)
    exploded = exploded.explode(field)
    exploded = exploded[exploded[field].notna() & (exploded[field] != "")]
    exploded[field] = exploded[field].astype(str)
    per_tag = exploded.groupby([time_col, field], observed=True)

    def collapse(frame: pd.DataFrame, value_col: str) -> pd.DataFrame:
        # keep_top_k ranks on a column literally named "n", so alias the value column for the call.
        ranked = keep_top_k(frame.rename(columns={value_col: "n"}), "tag", top_k)
        ranked = ranked.rename(columns={"n": value_col})
        return ranked.groupby(keys, observed=True)[value_col].sum().reset_index()

    # Tag-instance counts relative to messages per period
    counts = per_tag.size().rename("n").reset_index().rename(columns={field: "tag"})
    counts = collapse(counts, "n")
    period_sizes = df.groupby(time_col, observed=True).size().rename("N").reset_index()
    counts = counts.merge(period_sizes, on=time_col)
    counts["share"] = counts["n"] / counts["N"]
    counts.to_csv(tables_dir / f"{out_prefix}_{field}_{time_col}_share.csv", index=False)

    # Score mass carried by each tag relative to the period's total score mass
    mass = per_tag["combined_proba"].sum().rename("risk_mass").reset_index().rename(columns={field: "tag"})
    mass = collapse(mass, "risk_mass")
    period_mass = df.groupby(time_col, observed=True)["combined_proba"].sum().rename("R").reset_index()
    mass = mass.merge(period_mass, on=time_col)
    mass["risk_share"] = mass["risk_mass"] / mass["R"]
    mass.to_csv(tables_dir / f"{out_prefix}_{field}_{time_col}_risk_share.csv", index=False)


# theme dynamics
# Single-label theme field: weekly and monthly tables, keeping the 10 largest themes.
dynamics_single(df, "week", "theme_cb", top_k=10, out_prefix="dyn")
dynamics_single(df, "month", "theme_cb", top_k=10, out_prefix="dyn")

# multi-label fields
# Same two time grains per field, keeping the 12 largest tags of each.
for f in ["claim_types_cb", "ctas_cb", "evidence_cb"]:
    dynamics_multi(df, "week", f, top_k=12, out_prefix="dyn")
    dynamics_multi(df, "month", f, top_k=12, out_prefix="dyn")

print("Wrote dynamics tables ->", OUT_DIR / "tables")


In [None]:
# --- Stage 3: Prototype mining ---

def build_prototypes(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate messages into one row per prototype (distinct tag combination).

    Each row carries the four tag fields, message count, mean/median/summed
    ``combined_proba``, high-tail rate, the prototype's share of the total
    score mass, and a ``tag_doc`` token string built by ``build_tag_doc``.
    Expects *df* to contain an ``is_high_tail`` column.
    """
    tag_cols = ["theme_cb", "claim_types_cb", "ctas_cb", "evidence_cb"]
    stats = {
        "n": ("combined_proba", "size"),
        "mean_risk": ("combined_proba", "mean"),
        "median_risk": ("combined_proba", "median"),
        "risk_mass": ("combined_proba", "sum"),
        "high_tail_rate": ("is_high_tail", "mean"),
    }
    summary = df.groupby(["prototype", *tag_cols], observed=True).agg(**stats).reset_index()

    summary["risk_mass_share"] = summary["risk_mass"] / summary["risk_mass"].sum()
    summary["tag_doc"] = [
        build_tag_doc(theme, claim, cta, evid)
        for theme, claim, cta, evid in zip(*(summary[col] for col in tag_cols))
    ]
    return summary


# Prototype summaries and coverage curves for all messages in the window.
proto_all = build_prototypes(df).sort_values("n", ascending=False)
proto_all.to_csv(OUT_DIR / "tables" / "prototypes_overall.csv", index=False)

cov_all = coverage_curve(df["prototype"].value_counts())
cov_all.to_csv(OUT_DIR / "tables" / "prototype_coverage_overall.csv", index=False)

# Same two tables restricted to the high-tail messages.
hi = df[df["is_high_tail"] == 1].copy()
proto_hi = build_prototypes(hi).sort_values("n", ascending=False)
proto_hi.to_csv(OUT_DIR / "tables" / "prototypes_high_tail.csv", index=False)

cov_hi = coverage_curve(hi["prototype"].value_counts())
cov_hi.to_csv(OUT_DIR / "tables" / "prototype_coverage_high_tail.csv", index=False)

# per-theme top prototypes
top_by_theme = []
for theme, g in df.groupby("theme_cb", observed=True):
    # `prototype` is a categorical column, and value_counts() on a categorical also
    # lists categories that never occur in this theme (with count 0). Drop those
    # before taking the top 30, otherwise themes with fewer than 30 prototypes get
    # padded with n=0 rows that belong to other themes.
    counts = g["prototype"].value_counts()
    vc = counts[counts > 0].head(30)
    for p, n in vc.items():
        top_by_theme.append(
            {
                "theme_cb": str(theme),
                "prototype": str(p),
                "n": int(n),
                "share_within_theme": float(n / len(g)) if len(g) else 0.0,
            }
        )
pd.DataFrame(top_by_theme).to_csv(OUT_DIR / "tables" / "prototypes_top_by_theme.csv", index=False)

print("Wrote prototype tables ->", OUT_DIR / "tables")
# Bare expression: echoed as the cell output when run in a notebook.
proto_all.head(10)


In [None]:
# --- Stage 4: Cluster prototypes into strategy clusters ---

def pick_k(proto_count: int) -> list[int]:
    """List the candidate cluster counts usable with *proto_count* prototypes.

    Candidates come from ``range(K_MIN, K_MAX + 1, K_STEP)``; only values of
    at least 2 and strictly below *proto_count* are kept.
    """
    candidates = range(int(K_MIN), int(K_MAX) + 1, int(K_STEP))
    return [k for k in candidates if k >= 2 and k < proto_count]


def cluster_prototypes(df: pd.DataFrame, force: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Cluster prototypes (distinct tag combinations) into "strategy clusters".

    Each prototype is represented by the TF-IDF vector of its ``tag_doc``
    tokens and clustered with KMeans. The per-prototype cluster label is then
    merged back onto the message-level frame.

    Args:
        df: Message-level frame containing ``prototype``, the four tag
            columns, ``combined_proba`` and ``is_high_tail``.
        force: When False and both cache files exist, they are loaded and
            returned without re-clustering (the cache is not checked against
            *df*). When True, everything is recomputed.

    Returns:
        ``(dfc, mapping)``: *df* with an added ``cluster`` column, and the
        ``prototype`` -> ``cluster`` mapping table.

    Raises:
        RuntimeError: If no candidate k from ``pick_k`` fits the number of
            unique prototypes.

    Side effects (recompute path only): writes the prototype universe,
    k-selection scores, cluster signatures and cluster summary CSVs under
    ``OUT_DIR / "tables"``, and the mapping CSV, the pickled ``dfc`` and a
    small metadata JSON under ``OUT_DIR / "artifacts"``. The metadata records
    the module-level ``thr`` as the high-tail threshold.
    """
    mapping_path = OUT_DIR / "artifacts" / "prototype_to_cluster.csv"
    base_with_clusters_path = OUT_DIR / "artifacts" / "base_with_clusters.pkl.gz"

    # Cache hit: reuse the previous clustering as-is.
    if mapping_path.exists() and base_with_clusters_path.exists() and not force:
        mapping = pd.read_csv(mapping_path)
        dfc = pd.read_pickle(base_with_clusters_path)
        return dfc, mapping

    # One row per unique prototype, most frequent first.
    proto = (
        df.groupby(["prototype", "theme_cb", "claim_types_cb", "ctas_cb", "evidence_cb"], observed=True)
        .agg(n=("combined_proba", "size"), mean_risk=("combined_proba", "mean"), risk_mass=("combined_proba", "sum"))
        .reset_index()
        .sort_values("n", ascending=False)
    )

    proto["tag_doc"] = [
        build_tag_doc(t, c, a, e)
        for t, c, a, e in zip(proto["theme_cb"], proto["claim_types_cb"], proto["ctas_cb"], proto["evidence_cb"])
    ]
    proto.to_csv(OUT_DIR / "tables" / "prototype_universe.csv", index=False)

    # The custom token pattern keeps "FIELD=value" items (including "-", "/" and "=") as single tokens.
    vec = TfidfVectorizer(token_pattern=r"(?u)\b[\w\-/=]+\b", min_df=1)
    X = vec.fit_transform(proto["tag_doc"].values)

    # Score every candidate k: cosine silhouette (on a capped sample) and KMeans inertia.
    ks = pick_k(proto_count=X.shape[0])
    rows = []
    for k in ks:
        km = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=20)
        lab = km.fit_predict(X)
        sil = float(
            silhouette_score(
                X,
                lab,
                metric="cosine",
                sample_size=min(int(SILHOUETTE_SAMPLE), int(X.shape[0] - 1)) if X.shape[0] > 2 else None,
                random_state=RANDOM_SEED,
            )
        )
        rows.append({"k": int(k), "silhouette_cosine": sil, "inertia": float(km.inertia_)})
    ksel = pd.DataFrame(rows)
    ksel.to_csv(OUT_DIR / "tables" / "k_selection_strategy_clusters.csv", index=False)

    if ksel.empty:
        raise RuntimeError(f"Not enough unique prototypes ({X.shape[0]}) to cluster.")

    # Final k: best silhouette when K_FINAL is None, otherwise K_FINAL clamped to a feasible value.
    if K_FINAL is None:
        k_final = int(ksel.sort_values("silhouette_cosine", ascending=False).iloc[0]["k"])
    else:
        k_final = int(K_FINAL)
        if k_final >= X.shape[0]:
            k_final = int(min(ksel["k"].max(), X.shape[0] - 1))
        if k_final < 2:
            k_final = int(ksel["k"].min())

    # Final fit uses more restarts (n_init=50) than the k-selection sweep (n_init=20).
    km = KMeans(n_clusters=k_final, random_state=RANDOM_SEED, n_init=50)
    proto["cluster"] = km.fit_predict(X)

    # Cluster signatures (top centroid tokens)
    feats = np.array(vec.get_feature_names_out())
    centroids = km.cluster_centers_
    sig = []
    for c in range(k_final):
        # 15 tokens with the largest centroid weight, in descending order.
        top_idx = np.argsort(centroids[c])[::-1][:15]
        sig.append({"cluster": int(c), "top_tokens": "; ".join(feats[top_idx].tolist())})
    pd.DataFrame(sig).to_csv(OUT_DIR / "tables" / "strategy_cluster_signatures.csv", index=False)

    mapping = proto[["prototype", "cluster"]].copy()
    mapping.to_csv(mapping_path, index=False)

    # Attach the cluster label to every message via its prototype, then cache the result.
    dfc = df.merge(mapping, on="prototype", how="left")
    dfc.to_pickle(base_with_clusters_path, compression="gzip", protocol=5)

    # Cluster summary (descriptive)
    base_tail = float(dfc["is_high_tail"].mean())
    cl = (
        dfc.groupby("cluster", observed=True)
        .agg(
            n_msgs=("combined_proba", "size"),
            mean_risk=("combined_proba", "mean"),
            risk_mass=("combined_proba", "sum"),
            high_tail_rate=("is_high_tail", "mean"),
        )
        .reset_index()
    )
    cl["risk_mass_share"] = cl["risk_mass"] / cl["risk_mass"].sum()
    # Lift = cluster high-tail rate relative to the overall rate (NaN when the overall rate is 0).
    cl["high_tail_lift"] = cl["high_tail_rate"] / (base_tail if base_tail > 0 else np.nan)
    cl = cl.sort_values("high_tail_lift", ascending=False)
    cl.to_csv(OUT_DIR / "tables" / "strategy_cluster_summary.csv", index=False)

    meta = {
        "k_final": int(k_final),
        "high_tail_quantile": float(HIGH_TAIL_QUANTILE),
        "high_tail_threshold": float(thr),
        "base_high_tail_rate": float(base_tail),
        "silhouette_sample": int(SILHOUETTE_SAMPLE),
    }
    (OUT_DIR / "artifacts" / "strategy_cluster_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

    return dfc, mapping


# dfc = windowed messages with a `cluster` column; later stages read it as a module-level name.
# With FORCE_RECLUSTER False, an existing cached clustering is loaded instead of being recomputed.
dfc, proto_cluster_map = cluster_prototypes(df, force=FORCE_RECLUSTER)
# Bare expression: echoed as the cell output when run in a notebook.
dfc[["prototype", "cluster"]].head()


In [None]:
# --- Stage 5: Cluster dynamics + drift ---

# Weekly message share per cluster: n = messages in (week, cluster), N = messages in the week.
wk = dfc.groupby(["week", "cluster"], observed=True).size().rename("n").reset_index()
wN = dfc.groupby("week", observed=True).size().rename("N").reset_index()
wk = wk.merge(wN, on="week")
wk["share"] = wk["n"] / wk["N"]
wk.to_csv(OUT_DIR / "tables" / "cluster_share_by_week.csv", index=False)

# Weekly score-mass share per cluster: summed combined_proba over the week's total.
wr = (
    dfc.groupby(["week", "cluster"], observed=True)["combined_proba"]
    .sum()
    .rename("risk_mass")
    .reset_index()
)
wR = dfc.groupby("week", observed=True)["combined_proba"].sum().rename("R").reset_index()
wr = wr.merge(wR, on="week")
wr["risk_share"] = wr["risk_mass"] / wr["R"]
wr.to_csv(OUT_DIR / "tables" / "cluster_risk_share_by_week.csv", index=False)

# Burstiness: peak weekly share relative to the median weekly share, computed over the
# weeks in which the cluster appears at all (wk has no rows for absent cluster-weeks).
burst = wk.groupby("cluster", observed=True)["share"].agg(["median", "max", "mean", "std"]).reset_index()
# A zero median is mapped to NaN so the ratio is NaN rather than infinite.
burst["max_over_median"] = burst["max"] / burst["median"].replace(0, np.nan)
burst = burst.sort_values("max_over_median", ascending=False)
burst.to_csv(OUT_DIR / "tables" / "cluster_burstiness.csv", index=False)

# JS drift between consecutive weeks
# Fixed cluster ordering so every weekly count vector shares the same layout.
clusters = sorted([int(c) for c in dfc["cluster"].dropna().unique()])
idx = {c: i for i, c in enumerate(clusters)}

# Week labels are compared as strings; sorted() orders them lexicographically.
week_list = sorted([str(w) for w in dfc["week"].dropna().unique()])
week_vec: dict[str, np.ndarray] = {}
for w, g in wk.groupby("week", observed=True):
    v = np.zeros(len(clusters))
    for c, n in zip(g["cluster"].values, g["n"].values):
        v[idx[int(c)]] = n
    week_vec[str(w)] = v

# One row per adjacent pair in week_list; a week without a vector falls back to all zeros.
drift_rows = []
for i in range(1, len(week_list)):
    w0, w1 = week_list[i - 1], week_list[i]
    js = js_divergence(week_vec.get(w0, np.zeros(len(clusters))), week_vec.get(w1, np.zeros(len(clusters))))
    drift_rows.append({"week_prev": w0, "week": w1, "js_cluster": js})
pd.DataFrame(drift_rows).to_csv(OUT_DIR / "tables" / "cluster_js_drift_by_week.csv", index=False)

print("Wrote cluster dynamics tables ->", OUT_DIR / "tables")


In [None]:
# --- Stage 5b: JS drift for tag distributions (weekly) ---

def weekly_dist_single(df: pd.DataFrame, time_col: str, field: str) -> dict[str, pd.Series]:
    """Count the values of a single-label *field* within each period of *time_col*.

    Returns a dict keyed by the stringified period label; each value is the
    ``value_counts()`` of the stringified field for that period.
    """
    return {
        str(period): rows[field].astype(str).value_counts()
        for period, rows in df.groupby(time_col, observed=True)
    }


def weekly_dist_multi(df: pd.DataFrame, time_col: str, field: str) -> dict[str, pd.Series]:
    """Count tag instances of a multi-label *field* within each period of *time_col*.

    Cells are split with ``split_multi`` and exploded, so a message counts
    once per tag it carries. Returns a dict keyed by the stringified period
    label whose values are the per-tag counts for that period.
    """

    def label_counts(rows: pd.DataFrame) -> pd.Series:
        labels = rows[field].astype(str).apply(split_multi).explode()
        labels = labels[labels.notna() & (labels != "")]
        return labels.astype(str).value_counts()

    return {
        str(period): label_counts(rows)
        for period, rows in df.groupby(time_col, observed=True)
    }


def js_drift_from_countdict(d: dict[str, pd.Series]) -> pd.DataFrame:
    """Compute JS divergence between the count distributions of adjacent periods.

    Args:
        d: Mapping from period label to a Series of counts indexed by tag.

    Returns:
        DataFrame with columns ``week_prev``, ``week`` and ``js``, one row per
        pair of adjacent labels (labels are ordered with ``sorted``, i.e.
        lexicographically). The three columns are always present, including
        when there is no vocabulary or fewer than two periods, so the frame
        round-trips through CSV with a header.
    """
    columns = ["week_prev", "week", "js"]
    weeks = sorted(d)
    vocab = sorted(set().union(*(s.index for s in d.values() if len(s) > 0)))
    if not vocab:
        return pd.DataFrame(columns=columns)

    idx = {str(k): i for i, k in enumerate(vocab)}

    def vec(s: pd.Series) -> np.ndarray:
        # Dense count vector over the shared vocabulary; absent tags stay 0.
        v = np.zeros(len(vocab))
        for k, n in s.items():
            v[idx[str(k)]] = float(n)
        return v

    # Build each period's vector once; every interior period appears in two pairs.
    vectors = {w: vec(d[w]) for w in weeks}
    rows = [
        {"week_prev": w0, "week": w1, "js": js_divergence(vectors[w0], vectors[w1])}
        for w0, w1 in zip(weeks, weeks[1:])
    ]
    return pd.DataFrame(rows, columns=columns)


TAG_FIELDS = ["theme_cb", "claim_types_cb", "ctas_cb", "evidence_cb"]
for field in TAG_FIELDS:
    # theme_cb is counted as one whole value per message; the other fields are split into tags.
    count_weekly = weekly_dist_single if field == "theme_cb" else weekly_dist_multi
    d = count_weekly(dfc, "week", field)
    drift = js_drift_from_countdict(d)
    drift.to_csv(OUT_DIR / "tables" / f"js_drift_{field}_by_week.csv", index=False)

print("Wrote tag JS drift tables ->", OUT_DIR / "tables")


In [None]:
# --- Stage 5c: Burst case study extraction (cluster-week exemplars) ---

wk_tbl = pd.read_csv(OUT_DIR / "tables" / "cluster_share_by_week.csv")
burst_tbl = pd.read_csv(OUT_DIR / "tables" / "cluster_burstiness.csv")

# First five clusters in the burstiness table's stored row order.
topC = burst_tbl["cluster"].head(5).tolist()

# For each selected cluster keep the single week with the largest share.
in_top = wk_tbl["cluster"].isin(topC)
ranked = wk_tbl[in_top].sort_values(["cluster", "share"], ascending=[True, False])
peaks = ranked.groupby("cluster", as_index=False).head(1)

peaks.to_csv(OUT_DIR / "tables" / "burst_case_study_peaks.csv", index=False)

# For every peak (cluster, week) cell list its ten most frequent prototypes.
rows = []
for _, r in peaks.iterrows():
    c = int(r["cluster"])
    w = str(r["week"])
    in_cell = (dfc["cluster"] == c) & (dfc["week"].astype(str) == w)
    sub = dfc[in_cell]
    top_proto = sub["prototype"].value_counts().head(10)
    rows.extend(
        {"cluster": c, "week": w, "prototype": str(proto), "n": int(count)}
        for proto, count in top_proto.items()
    )

pd.DataFrame(rows).to_csv(OUT_DIR / "tables" / "burst_case_study_top_prototypes.csv", index=False)
print("Wrote burst case study tables ->", OUT_DIR / "tables")


In [None]:
# --- Stage 6: Channel profiles (strategy ecosystems) ---

# Focus on top-N channels by volume
msgs_per_channel = dfc.groupby("channel_final", observed=True).size()
top_channels = msgs_per_channel.sort_values(ascending=False).head(int(TOPN_CHANNELS)).index.tolist()
is_top_channel = dfc["channel_final"].isin(top_channels)
df_top = dfc[is_top_channel].copy()

# Channel x cluster counts, then L1-normalised so each channel row sums to 1 (a strategy mix).
mat = pd.crosstab(df_top["channel_final"], df_top["cluster"]).astype(float)
X = normalize(mat.values, norm="l1", axis=1)

# Choose channel community K
# Candidate K values are restricted to 2 <= k < number of channels.
n_channels = X.shape[0]
ks = [k for k in range(int(CHANNEL_K_MIN), int(CHANNEL_K_MAX) + 1) if k >= 2 and k < n_channels]
eval_rows = []
for k in ks:
    km = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=20)
    lab = km.fit_predict(X)
    sil = float(silhouette_score(X, lab, metric="cosine"))
    eval_rows.append({"k": int(k), "silhouette_cosine": sil, "inertia": float(km.inertia_)})
eval_df = pd.DataFrame(eval_rows)
eval_df.to_csv(OUT_DIR / "tables" / "channel_k_selection.csv", index=False)

if eval_df.empty:
    raise RuntimeError(f"Not enough channels ({X.shape[0]}) to cluster into communities.")

if CHANNEL_K_FINAL is not None:
    # Honour the configured K, but pull it back into the feasible range when needed.
    k_final = int(CHANNEL_K_FINAL)
    if k_final >= n_channels:
        k_final = int(min(eval_df["k"].max(), n_channels - 1))
    if k_final < 2:
        k_final = int(eval_df["k"].min())
else:
    # No override: take the candidate with the highest cosine silhouette.
    best_row = eval_df.sort_values("silhouette_cosine", ascending=False).iloc[0]
    k_final = int(best_row["k"])

km = KMeans(n_clusters=k_final, random_state=RANDOM_SEED, n_init=50)
chan_cluster = km.fit_predict(X)

# PCA for plotting
pca = PCA(n_components=2, random_state=RANDOM_SEED)
X2 = pca.fit_transform(X)

# Persist PCA explained variance for figure labels
explained = pca.explained_variance_ratio_
pca_meta = {"pc1_var": float(explained[0]), "pc2_var": float(explained[1])}
pca_meta_path = OUT_DIR / "artifacts" / "channel_pca_meta.json"
pca_meta_path.write_text(json.dumps(pca_meta, indent=2), encoding="utf-8")

# One row per channel, in the crosstab's row order: community id and 2-D PCA coordinates.
channel_profiles = pd.DataFrame(
    {
        "channel_final": mat.index.astype(str),
        "community": chan_cluster.astype(int),
        "pca1": X2[:, 0],
        "pca2": X2[:, 1],
    }
)

# Add specialization metrics + volume + mean risk
msgs_by_channel = df_top.groupby("channel_final", observed=True)
vol = msgs_by_channel.size().rename("n_msgs")
mean_r = msgs_by_channel["combined_proba"].mean().rename("mean_risk")
for per_channel_stat in (vol, mean_r):
    channel_profiles = channel_profiles.merge(per_channel_stat.reset_index(), on="channel_final")

# Assigned by position: relies on the merges above keeping every channel in crosstab row order.
channel_profiles["hhi_specialization"] = [herfindahl(shares) for shares in X]
channel_profiles.to_csv(OUT_DIR / "tables" / "channel_profiles.csv", index=False)

# Community mean cluster shares
share_columns = [str(c) for c in mat.columns]
mat_share = pd.DataFrame(X, index=mat.index.astype(str), columns=share_columns)
mat_share["community"] = chan_cluster.astype(int)
comm = mat_share.groupby("community", observed=True).mean(numeric_only=True)
comm.to_csv(OUT_DIR / "tables" / "community_mean_cluster_share.csv")

# Community x theme counts
channel_to_community = channel_profiles[["channel_final", "community"]]
df_top2 = df_top.merge(channel_to_community, on="channel_final", how="left")
theme_comm = df_top2.groupby(["community", "theme_cb"], observed=True).size().reset_index(name="n")
theme_comm.to_csv(OUT_DIR / "tables" / "community_theme_counts.csv", index=False)

print("Wrote channel profile tables ->", OUT_DIR / "tables")
# Trailing bare expression: shows a preview when run as a notebook cell.
channel_profiles.head()


In [None]:
# --- Stage 7: ICWSM-style plots (PDF+PNG) ---

def savefig(path_base: Path) -> None:
    """Save the current pyplot figure as ``<path_base>.pdf`` and ``.png``, then close it.

    Args:
        path_base: Output path without extension.
    """
    stem = str(path_base)
    shared = {"bbox_inches": "tight", "pad_inches": 0.05}
    plt.tight_layout()
    plt.savefig(stem + ".pdf", **shared)
    plt.savefig(stem + ".png", dpi=300, **shared)
    plt.close()


def week_label_to_ts(w: object) -> pd.Timestamp:
    """Convert a week label into the timestamp of its first date.

    Args:
        w: Week label, expected form ``YYYY-MM-DD/YYYY-MM-DD``; only the text
            before the first ``/`` is parsed.

    Returns:
        The parsed timestamp, or ``pd.NaT`` when the text cannot be parsed.
    """
    start_text = str(w).partition("/")[0]
    try:
        return pd.to_datetime(start_text, errors="coerce")
    except Exception:
        return pd.NaT


def plot_top_series(
    df_in: pd.DataFrame,
    time_col: str,
    tag_col: str,
    value_col: str,
    top_k: int,
    title: str,
    xlabel: str,
    ylabel: str,
    outpath: Path,
) -> None:
    """Plot one line per tag for the ``top_k`` tags with the highest mean value.

    Args:
        df_in: Long-format table with ``time_col``, ``tag_col`` and ``value_col``.
        time_col: Column used for the x-axis; when it equals ``"week"`` the
            labels are converted to timestamps with ``week_label_to_ts``.
        tag_col: Column naming the series.
        value_col: Column plotted on the y-axis.
        top_k: Number of tags to keep, ranked by mean of ``value_col``.
        title: Figure title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        outpath: Output path without extension, passed to ``savefig``.
    """
    mean_by_tag = df_in.groupby(tag_col, observed=True)[value_col].mean()
    order = mean_by_tag.sort_values(ascending=False).head(int(top_k)).index.tolist()

    kept = df_in[df_in[tag_col].isin(order)]
    # Missing (time, tag) cells are plotted as 0.0.
    pivot = kept.pivot_table(index=time_col, columns=tag_col, values=value_col, fill_value=0.0)
    pivot = pivot.sort_index()

    # nicer x-axis for weekly strings
    if time_col == "week":
        pivot.index = [week_label_to_ts(label) for label in pivot.index]
        pivot = pivot.sort_index()

    plt.figure(figsize=(9, 4))
    for tag in pivot.columns:
        plt.plot(pivot.index, pivot[tag], label=str(tag))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(ncol=2, fontsize=8)
    savefig(outpath)


figdir = OUT_DIR / "figures"

# Theme share + risk-weighted share
t1 = pd.read_csv(OUT_DIR / "tables" / "dyn_week_share.csv")
t2 = pd.read_csv(OUT_DIR / "tables" / "dyn_week_risk_share.csv")

# (table, value column, title, y-axis label, output stem) for the two theme figures
theme_figures = [
    (t1, "share", "Theme share over time (weekly)", "Share of messages", "fig1_theme_share_week"),
    (
        t2,
        "risk_share",
        "Theme risk-weighted share over time (weekly)",
        "Share of risk mass (Σ score)",
        "fig2_theme_risk_share_week",
    ),
]
for theme_table, theme_value_col, theme_title, theme_ylabel, theme_stem in theme_figures:
    plot_top_series(
        theme_table,
        time_col="week",
        tag_col="theme_cb",
        value_col=theme_value_col,
        top_k=8,
        title=theme_title,
        xlabel="Week",
        ylabel=theme_ylabel,
        outpath=figdir / theme_stem,
    )

# Claim/CTA/evidence dynamics (share)
for field in ["claim_types_cb", "ctas_cb", "evidence_cb"]:
    df_share = pd.read_csv(OUT_DIR / "tables" / f"dyn_{field}_week_share.csv")
    series_title = f"{field} share over time (weekly)"
    series_out = figdir / f"fig_{field}_share_week"
    plot_top_series(
        df_share,
        time_col="week",
        tag_col="tag",
        value_col="share",
        top_k=10,
        title=series_title,
        xlabel="Week",
        ylabel="Share of messages",
        outpath=series_out,
    )

# Prototype coverage
# Fig 3: cumulative coverage vs. top-k prototypes, one curve for all messages and one for
# the high-tail subset, both read from previously written tables (columns `k`, `coverage`).
cov_all = pd.read_csv(OUT_DIR / "tables" / "prototype_coverage_overall.csv")
cov_hi = pd.read_csv(OUT_DIR / "tables" / "prototype_coverage_high_tail.csv")

plt.figure(figsize=(6, 4))
plt.plot(cov_all["k"], cov_all["coverage"], marker="o", label="All messages")
plt.plot(cov_hi["k"], cov_hi["coverage"], marker="o", label=f"High-tail (q≥{HIGH_TAIL_QUANTILE})")
# k is shown on a log axis (see the x-label).
plt.xscale("log")
plt.xlabel("Top-k prototypes (log scale)")
plt.ylabel("Cumulative coverage")
plt.legend()
savefig(figdir / "fig3_prototype_coverage")

# Cluster enrichment + burstiness
# Fig 4: horizontal bars of high-tail lift for the first 12 rows of the cluster summary
# table (row order is whatever the file was written with — presumably ranked; verify upstream).
cl = pd.read_csv(OUT_DIR / "tables" / "strategy_cluster_summary.csv").head(12)
plt.figure(figsize=(7, 4))
plt.barh(cl["cluster"].astype(str), cl["high_tail_lift"].astype(float))
# barh draws the first row at the bottom; invert so the first table row is on top.
plt.gca().invert_yaxis()
plt.xlabel("High-tail lift (P(tail|cluster)/P(tail))")
plt.ylabel("Strategy cluster")
savefig(figdir / "fig4_cluster_high_tail_lift")

# Fig 5: same layout for burstiness, using the first 12 rows of the burstiness table.
# NOTE: this rebinds the module-level name `burst` to the truncated table read from disk.
burst = pd.read_csv(OUT_DIR / "tables" / "cluster_burstiness.csv").head(12)
plt.figure(figsize=(7, 4))
plt.barh(burst["cluster"].astype(str), burst["max_over_median"].astype(float))
plt.gca().invert_yaxis()
plt.xlabel("Burstiness (max weekly share / median weekly share)")
plt.ylabel("Strategy cluster")
savefig(figdir / "fig5_cluster_burstiness")

# Drift time series
# Fig 6: week-over-week JS divergence of the cluster distribution.
# Rows are ordered by the `week` label (string sort), then each label is mapped to the
# timestamp of its first date for the x-axis.
drift = pd.read_csv(OUT_DIR / "tables" / "cluster_js_drift_by_week.csv").sort_values("week")
x = [week_label_to_ts(w) for w in drift["week"].tolist()]
plt.figure(figsize=(9, 4))
plt.plot(x, drift["js_cluster"].astype(float))
plt.xlabel("Week")
plt.ylabel("JS divergence (cluster distribution)")
savefig(figdir / "fig6_cluster_js_drift")

# Channel communities in PCA space (self-explanatory legend)
# Inputs for Fig 7, re-read from disk: per-channel profiles (community + PCA coordinates),
# community mean cluster shares, and per-cluster token signatures.
ch = pd.read_csv(OUT_DIR / "tables" / "channel_profiles.csv")
comm_share = pd.read_csv(OUT_DIR / "tables" / "community_mean_cluster_share.csv")
sig = pd.read_csv(OUT_DIR / "tables" / "strategy_cluster_signatures.csv")

import json

def _pretty_token(s: object) -> str:
    t = str(s)
    t = t.replace("___", " & ").replace("_/_", " / ").replace("__", ", ")
    t = t.replace("_", " ")
    t = " ".join(t.split())
    return (t[:1].upper() + t[1:]) if t else t


def _tok(label: object) -> str:
    """Return the lower-cased ``sanitize_token`` form of ``label`` (used as a lookup key)."""
    sanitized = sanitize_token(label)
    return sanitized.lower()


def _extract_codebook_lists(py_path: Path):
    try:
        tree = ast.parse(py_path.read_text(encoding="utf-8"))
    except Exception:
        return {}

    out = {}
    for node in tree.body:
        if not isinstance(node, ast.Assign) or len(node.targets) != 1 or not isinstance(node.targets[0], ast.Name):
            continue
        name = node.targets[0].id
        if name not in {"THEMES", "CLAIMS", "CTAS", "EVID"}:
            continue
        try:
            out[name] = ast.literal_eval(node.value)
        except Exception:
            continue

    return out


def _mk_map_from_labels(labels):
    """Map each label's ``_tok`` key to its original text (later duplicates win)."""
    display_by_key = {}
    for label in labels:
        display_by_key[_tok(label)] = str(label)
    return display_by_key


# Canonical display names so plot labels match the codebook
codebook_py = BASE_DIR.parent / "codebook_normalize_and_fill.py"
cb_lists = {}
if codebook_py.exists():
    cb_lists = _extract_codebook_lists(codebook_py)

THEME_MAP = _mk_map_from_labels(cb_lists.get("THEMES", []))
CLAIM_MAP = _mk_map_from_labels(cb_lists.get("CLAIMS", []))
CTA_MAP = _mk_map_from_labels(cb_lists.get("CTAS", []))
EVID_MAP = _mk_map_from_labels(cb_lists.get("EVID", []))

# Add observed multi-theme combos / missing tags when available
summary_path = BASE_DIR.parent / "full_risk_v2_core_codebook_summary.json"
if summary_path.exists():
    cb = json.loads(summary_path.read_text(encoding="utf-8"))
    for row in cb.get("theme_cb", []):
        # Each entry's first element is used as the theme label; empty entries are skipped.
        if not row or not row[0]:
            continue
        THEME_MAP[_tok(row[0])] = str(row[0])

# Every map can display the MISSING placeholder unless it already defines it.
missing_key = _tok("MISSING")
for m in (THEME_MAP, CLAIM_MAP, CTA_MAP, EVID_MAP):
    if missing_key not in m:
        m[missing_key] = "MISSING"


def _display_from_map(map_, token: object) -> str:
    """Return the display name for ``token`` from ``map_``, else a prettified token.

    The lookup key is ``str(token).lower()``.
    """
    key = str(token).lower()
    if key in map_:
        return map_[key]
    return _pretty_token(token)


def _display_theme(token: object) -> str:
    """Return the display name of a theme token, handling multi-theme combinations.

    A token found in ``THEME_MAP`` (by lower-cased text) is returned as mapped.
    Otherwise the token is split on an exact double underscore (not part of a
    longer underscore run); if that yields several parts, each part is mapped
    or prettified and the results are joined with ``", "``.
    """
    key = str(token).lower()
    if key in THEME_MAP:
        return THEME_MAP[key]

    pieces = re.split(r"(?<!_)__(?!_)", key)
    if len(pieces) <= 1:
        return _pretty_token(token)

    shown = []
    for piece in pieces:
        shown.append(THEME_MAP.get(piece, _pretty_token(piece)))
    return ", ".join(shown)


def _pick_first(tokens, prefix, avoid=None):
    vals = [t[len(prefix) :] for t in tokens if t.startswith(prefix)]
    if not vals:
        return None
    if avoid:
        for v in vals:
            if v not in avoid:
                return v
    return vals[0]


def strategy_cluster_label(cluster_id: int) -> str:
    """Build a readable ``"Theme: Claim (extras)"`` label for a strategy cluster.

    The label is derived from the cluster's ``top_tokens`` entry in ``sig``
    (``;``-separated ``theme=``/``claim=``/``cta=``/``evid=`` tokens). For claim,
    CTA and evidence the first non-neutral value is preferred over the neutral
    one. The CTA is appended only for the buy/invest/donate and no-CTA values,
    the evidence only for statistics, quotes/testimony and chart evidence.

    Args:
        cluster_id: Cluster id to look up in ``sig["cluster"]``.

    Returns:
        The label, or ``"Strategy <id>"`` when the cluster has no signature row.
    """
    matching = sig[sig["cluster"] == cluster_id]
    if matching.empty:
        return f"Strategy {cluster_id}"
    raw_tokens = str(matching.iloc[0]["top_tokens"]).split(";")
    tokens = [tok.strip() for tok in raw_tokens if tok.strip()]

    def prefer_informative(prefix, neutral):
        # First non-neutral value, then any value, then the neutral default.
        return _pick_first(tokens, prefix, avoid={neutral}) or _pick_first(tokens, prefix) or neutral

    theme = _pick_first(tokens, "theme=") or "other__theme_"
    claim = prefer_informative("claim=", "no_substantive_claim")
    cta = prefer_informative("cta=", "no_cta")
    evid = prefer_informative("evid=", "none_/_assertion_only")

    theme_s = _display_theme(theme)
    claim_s = _display_from_map(CLAIM_MAP, claim)
    cta_s = _display_from_map(CTA_MAP, cta)
    evid_s = _display_from_map(EVID_MAP, evid)

    extras = []
    if cta in {"buy_/_invest_/_donate", "no_cta"}:
        extras.append(cta_s)
    if evid in {"statistics", "quotes/testimony", "chart_/_price_graph_/_ta_diagram"}:
        extras.append(evid_s)

    if not extras:
        return f"{theme_s}: {claim_s}"
    return f"{theme_s}: {claim_s} ({', '.join(extras)})"


# Readable label for every cluster that has a signature row.
cluster_labels = {}
for raw_cluster in sig["cluster"].unique():
    cluster_labels[int(raw_cluster)] = strategy_cluster_label(int(raw_cluster))

# Index the mean-share table by community and use integer cluster ids as column labels.
comm_share = comm_share.set_index("community")
comm_share.columns = list(map(int, comm_share.columns))

# For each community, the two clusters with the largest mean share (largest first).
top2 = {
    int(cid): row.sort_values(ascending=False).head(2).index.tolist()
    for cid, row in comm_share.iterrows()
}

# Plot IDs (C1..Ck) are deterministic: largest communities first
# A stable sort keeps equally sized communities in groupby order (ascending raw id), so
# the C-numbering does not depend on how the sorting algorithm happens to break ties.
comm_sizes = ch.groupby("community", observed=True).size().sort_values(ascending=False, kind="mergesort")
comm_order = comm_sizes.index.astype(int).tolist()
plot_id = {cid: i + 1 for i, cid in enumerate(comm_order)}

# Long label per community: its top strategies, each followed by its mean share in brackets.
community_labels = {}
for cid in comm_order:
    cs = top2.get(cid, [])
    if not cs:
        community_labels[cid] = "Mixed strategies"
        continue
    pieces = []
    for strategy in cs[:2]:
        share = float(comm_share.loc[cid, strategy])
        strategy_name = cluster_labels.get(strategy, f"Strategy {strategy}")
        pieces.append(f"{strategy_name} [{share:.0%}]")
    community_labels[cid] = " + ".join(pieces)

# Save mapping for reuse
mapping_rows = []
for cid in comm_order:
    strategies = top2.get(cid, [])
    mapping_rows.append(
        {
            "community_raw": int(cid),
            "community_plot": f"C{plot_id[cid]}",
            "n_channels": int(comm_sizes.loc[cid]),
            "top_strategy_1": int(strategies[0]) if strategies else None,
            "top_strategy_2": int(strategies[1]) if len(strategies) > 1 else None,
            "label": community_labels[cid],
        }
    )
pd.DataFrame(mapping_rows).to_csv(OUT_DIR / "tables" / "community_labels.csv", index=False)

# --- Fig 7: Channel communities by strategy mix (paper-ready) ---
from itertools import combinations

# Paper-safe rendering defaults: font type 42 makes the PDF/PS backends embed TrueType
# fonts. This updates the global rcParams, so it applies to every figure saved afterwards.
plt.rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})


def _abbr_theme(theme: str) -> str:
    t = theme.strip()
    t = t.replace("\u2011", "-")  # non-breaking hyphen
    low = t.lower()
    if low.startswith("conversation") or low.startswith("chat"):
        return "Chat"
    if low.startswith("public health") or low.startswith("health"):
        return "Health"
    if low.startswith("finance") or "crypto" in low:
        return "Crypto"
    if low.startswith("news"):
        return "News"
    if low.startswith("politics"):
        return "Pol"
    if low.startswith("lifestyle"):
        return "Life"
    if low.startswith("technology"):
        return "Tech"
    if low.startswith("gaming"):
        return "Gaming"
    if low.startswith("sports"):
        return "Sports"
    if low.startswith("other"):
        return "Other"
    return t.split("/")[0].split(",")[0].split("(")[0].strip()[:12] or "Other"


def _abbr_claim(claim: str) -> str:
    c = claim.strip()
    low = c.lower()
    if low.startswith("announcement"):
        return "Ann"
    if low.startswith("verifiable factual"):
        return "Fact"
    if low.startswith("speculative forecast") or low.startswith("speculative"):
        return "Forecast"
    if low.startswith("promotional hype"):
        return "Hype"
    if low.startswith("opinion"):
        return "Opinion"
    if low.startswith("no substantive"):
        return "No-claim"
    if low.startswith("rumour"):
        return "Rumour"
    if low.startswith("misleading"):
        return "Mislead"
    if low.startswith("emotional"):
        return "Fear"
    if low.startswith("scarcity"):
        return "FOMO"
    if low.startswith("other"):
        return "Other"
    return c.split("/")[0].strip()[:12] or "Other"


def _abbr_extras(extras: list[str]) -> list[str]:
    out: list[str] = []
    for e in extras:
        low = e.lower().strip()
        if "no cta" in low and "NoCTA" not in out:
            out.append("NoCTA")
        elif "buy" in low and "Buy" not in out:
            out.append("Buy")
        elif "statistic" in low and "Stat" not in out:
            out.append("Stat")
        elif ("quote" in low or "testimony" in low) and "Quote" not in out:
            out.append("Quote")
        elif ("chart" in low or "price graph" in low or "ta" in low) and "Chart" not in out:
            out.append("Chart")
    return out


def strategy_label_parts(full_label: str) -> tuple[str, str, list[str]]:
    """Split a ``"Theme: Claim (extra, extra)"`` label into its components.

    Args:
        full_label: Label text.

    Returns:
        ``(theme, claim, extras)``. A label without ``:`` is treated as a bare
        claim with theme ``"Other"`` and no extras. ``extras`` are the
        comma-separated items of the first parenthesised group after the colon.
    """
    theme, colon, remainder = full_label.partition(":")
    if not colon:
        return "Other", full_label.strip(), []

    remainder = remainder.strip()
    claim = remainder.partition("(")[0].strip()
    paren = re.search(r"\((.*?)\)", remainder)
    extras = []
    if paren:
        extras = [item.strip() for item in paren.group(1).split(",")]
    return theme.strip(), claim, extras


def strategy_label_short(full_label: str, include_extras: bool = True) -> str:
    """Return the hyphenated short form ``Theme-Claim[Extra,Extra]`` of a strategy label.

    Args:
        full_label: Full strategy label, parsed with ``strategy_label_parts``.
        include_extras: Append the bracketed extras when any are recognised.
    """
    theme, claim, extras = strategy_label_parts(full_label)
    short = f"{_abbr_theme(theme)}-{_abbr_claim(claim)}"
    if not include_extras:
        return short
    abbreviated = _abbr_extras(extras)
    if abbreviated:
        short += "[" + ",".join(abbreviated) + "]"
    return short


def strategy_label_phrase(full_label: str, include_extras: bool = False) -> str:
    """Return the spaced short form ``Theme Claim (Extra, Extra)`` of a strategy label.

    Args:
        full_label: Full strategy label, parsed with ``strategy_label_parts``.
        include_extras: Append the parenthesised extras when any are recognised.
    """
    theme, claim, extras = strategy_label_parts(full_label)
    phrase = f"{_abbr_theme(theme)} {_abbr_claim(claim)}"
    if not include_extras:
        return phrase
    abbreviated = _abbr_extras(extras)
    if abbreviated:
        phrase += " (" + ", ".join(abbreviated) + ")"
    return phrase


def community_short_label(cid: int) -> str:
    """Return a short label naming a community's top one or two strategies.

    Args:
        cid: Raw community id (key of ``top2``, row label of ``comm_share``).

    Returns:
        ``"Mixed"`` when the community has no top strategies, the first
        strategy's phrase when it has one, otherwise ``"<first> + <second>"``.
        With two strategies, the second phrase gains its extras if it would
        otherwise equal the first, and the first is marked ``(dom.)`` when its
        mean share is at least 0.65.
    """
    community = int(cid)
    leaders = top2.get(community, [])
    if not leaders:
        return "Mixed"

    first = int(leaders[0])
    first_full = cluster_labels.get(first, f"Strategy {first}")
    first_share = float(comm_share.loc[community, first])
    first_phrase = strategy_label_phrase(first_full, include_extras=False)
    if len(leaders) == 1:
        return first_phrase

    second = int(leaders[1])
    second_full = cluster_labels.get(second, f"Strategy {second}")
    second_phrase = strategy_label_phrase(second_full, include_extras=False)
    if second_phrase == first_phrase:
        # Disambiguate identical phrases by showing the second strategy's extras.
        second_phrase = strategy_label_phrase(second_full, include_extras=True)

    # add a hint when one strategy dominates
    if first_share >= 0.65:
        first_phrase = first_phrase + " (dom.)"
    return f"{first_phrase} + {second_phrase}"


def _resolve_label_overlaps(fig, ax, annotations, max_iter: int = 250) -> bool:
    """Iteratively nudge annotation labels apart until no two bounding boxes overlap.

    Each iteration redraws the figure, measures every annotation's rendered
    bounding box (slightly enlarged), pushes each overlapping pair apart along
    the line between their centres, adds a correction for boxes that stick out
    of the axes, and applies a damped fraction of the resulting shift to each
    annotation's text position.

    Args:
        fig: Figure owning ``ax``; redrawn on every iteration.
        ax: Axes whose window extent bounds the labels.
        annotations: Annotation objects to move; their positions are modified
            in place via ``set_position``.
        max_iter: Maximum number of iterations.

    Returns:
        ``True`` as soon as an iteration finds no overlapping pair, ``False``
        if ``max_iter`` iterations run without reaching that state.
    """
    fig.canvas.draw()
    renderer = fig.canvas.get_renderer()
    ax_bb = ax.get_window_extent(renderer)
    # Bounding boxes are in display pixels; annotation offsets are in points.
    px_per_pt = fig.dpi / 72.0
    pad_px = 2.0
    # Damping factor: only this fraction of the computed shift is applied per iteration.
    step = 0.35

    for _ in range(int(max_iter)):
        # Redraw so the extents reflect the positions set in the previous iteration.
        fig.canvas.draw()
        renderer = fig.canvas.get_renderer()
        ax_bb = ax.get_window_extent(renderer)
        # Enlarge each box (5% wider, 15% taller) so labels keep a small gap.
        bbs = [ann.get_window_extent(renderer).expanded(1.05, 1.15) for ann in annotations]

        shifts = np.zeros((len(annotations), 2), dtype=float)
        n_ov = 0
        for i, j in combinations(range(len(annotations)), 2):
            if not bbs[i].overlaps(bbs[j]):
                continue
            n_ov += 1
            ci = np.array([(bbs[i].x0 + bbs[i].x1) / 2.0, (bbs[i].y0 + bbs[i].y1) / 2.0])
            cj = np.array([(bbs[j].x0 + bbs[j].x1) / 2.0, (bbs[j].y0 + bbs[j].y1) / 2.0])
            v = ci - cj
            # Coincident centres have no direction; separate them horizontally.
            # (combinations() always yields i < j, so the first branch is the one taken.)
            if float(np.hypot(v[0], v[1])) < 1e-6:
                v = np.array([1.0, 0.0]) if i < j else np.array([-1.0, 0.0])
            v = v / float(np.hypot(v[0], v[1]))
            # Overlap extent along each axis; the larger one (at least 1 px) sets the push size.
            ox = min(bbs[i].x1, bbs[j].x1) - max(bbs[i].x0, bbs[j].x0)
            oy = min(bbs[i].y1, bbs[j].y1) - max(bbs[i].y0, bbs[j].y0)
            mag = max(float(ox), float(oy), 1.0) + pad_px
            # Split the push evenly between the two labels, in opposite directions.
            shifts[i] += v * (mag / 2.0)
            shifts[j] -= v * (mag / 2.0)

        # Keep labels inside axes
        for idx, bb in enumerate(bbs):
            dx = 0.0
            dy = 0.0
            if bb.x0 < ax_bb.x0 + pad_px:
                dx += (ax_bb.x0 + pad_px) - bb.x0
            if bb.x1 > ax_bb.x1 - pad_px:
                dx -= bb.x1 - (ax_bb.x1 - pad_px)
            if bb.y0 < ax_bb.y0 + pad_px:
                dy += (ax_bb.y0 + pad_px) - bb.y0
            if bb.y1 > ax_bb.y1 - pad_px:
                dy -= bb.y1 - (ax_bb.y1 - pad_px)
            shifts[idx] += np.array([dx, dy])

        # NOTE(review): this returns before the shifts are applied, so in an iteration with
        # no overlaps the keep-inside-axes corrections computed above are discarded.
        if n_ov == 0:
            return True

        # Apply shifts in offset-points coordinates
        for ann, (sx, sy) in zip(annotations, shifts):
            if sx == 0 and sy == 0:
                continue
            dx_pt, dy_pt = ann.get_position()
            # Convert the pixel shift to points and damp it by `step`.
            ndx = float(dx_pt) + (float(sx) * step) / px_per_pt
            ndy = float(dy_pt) + (float(sy) * step) / px_per_pt
            # clamp so labels can't drift off
            ndx = float(np.clip(ndx, -90.0, 90.0))
            ndy = float(np.clip(ndy, -70.0, 70.0))
            ann.set_position((ndx, ndy))

    return False


# Signature table for paper (separate from the plot)
rows = []
for cid in comm_order:
    cs = top2.get(cid, [])
    if not cs:
        continue
    has_second = len(cs) > 1

    t1 = int(cs[0])
    p1 = float(comm_share.loc[cid, t1])
    l1_full = cluster_labels.get(t1, f"Strategy {t1}")
    short1 = strategy_label_short(l1_full, include_extras=True)

    # Second strategy fields stay None (label "") when the community has only one.
    t2 = int(cs[1]) if has_second else None
    p2 = float(comm_share.loc[cid, t2]) if has_second else None
    l2_full = cluster_labels.get(t2, f"Strategy {t2}") if has_second else ""
    short2 = strategy_label_short(l2_full, include_extras=True) if has_second else None

    top_strats = f"{short1} {p1:.0%}"
    if has_second:
        top_strats += f"; {short2} {p2:.0%}"

    rows.append(
        {
            "community_id": f"C{plot_id[cid]}",
            "community_raw": int(cid),
            "n_channels": int(comm_sizes.loc[cid]),
            "short_label": community_short_label(int(cid)),
            "top1_cluster": t1,
            "top1_label_full": l1_full,
            "top1_label_short": short1,
            "top1_share": p1,
            "top2_cluster": t2,
            "top2_label_full": l2_full if has_second else None,
            "top2_label_short": short2,
            "top2_share": p2,
            "top_strats": top_strats,
        }
    )
sig_table = pd.DataFrame(rows)
# Order rows by the numeric part of the plot id (C1, C2, ..., C10) rather than as text.
comm_num = sig_table["community_id"].astype(str).str.replace("C", "", regex=False).astype(int)
sig_table = sig_table.assign(_comm_num=comm_num).sort_values("_comm_num").drop(columns=["_comm_num"])
sig_table.to_csv(OUT_DIR / "tables" / "community_signature_table.csv", index=False)

# Compact signature string for figure/table panels
compact_by_community = {}
for r in sig_table.itertuples(index=False):
    top1_full = "Other: Other" if pd.isna(r.top1_label_full) else r.top1_label_full
    s1 = strategy_label_phrase(str(top1_full), include_extras=False)

    missing_top2 = pd.isna(r.top2_label_full) or pd.isna(r.top2_share)
    if missing_top2:
        compact_by_community[r.community_id] = f"{s1} {r.top1_share:.0%}"
        continue

    top2_full = str(r.top2_label_full)
    s2 = strategy_label_phrase(top2_full, include_extras=False)
    if s1 == s2:
        # Identical phrases: show the extras on both so the two strategies can be told apart.
        s1 = strategy_label_phrase(str(top1_full), include_extras=True)
        s2 = strategy_label_phrase(top2_full, include_extras=True)

    compact_by_community[r.community_id] = f"{s1} {r.top1_share:.0%} + {s2} {r.top2_share:.0%}"

sig_table["sig_compact"] = sig_table["community_id"].map(compact_by_community)

# Read explained variance (or fall back to current PCA object if present)
pc1_var = pc2_var = None
meta_path = OUT_DIR / "artifacts" / "channel_pca_meta.json"
if meta_path.exists():
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    # A key missing from the file is read as 0.0.
    pc1_var = float(meta.get("pc1_var", 0.0))
    pc2_var = float(meta.get("pc2_var", 0.0))
elif "pca" in globals():
    variance_ratio = pca.explained_variance_ratio_
    pc1_var = float(variance_ratio[0])
    pc2_var = float(variance_ratio[1])

# Grayscale-safe: marker shapes + black edges (color helps but isn't required)
# 12 marker shapes; the plotting code below indexes them modulo len(markers), so shapes
# repeat when there are more than 12 communities.
markers = ["o", "s", "^", "v", "D", "P", "X", "<", ">", "h", "p", "*"]
cmap = plt.get_cmap("tab20")
# One colour per community, taken by integer position from the colormap.
colors = [cmap(i) for i in range(len(comm_order))]

# Layout switch: use "double" for a scatter + compact table panel (paper-ready).
# Use "single" for a single-column variant (legend moved to external table in the paper).
FIG7_MODE = "double"  # {"single", "double"}

# Fig 7: scatter of channels in PCA space (one marker/colour per community, labelled
# C1..Ck at the community centroid) next to a compact signature table panel.
# The font settings below apply only inside this rc_context.
with plt.rc_context(
    {
        "font.size": 9,
        "axes.labelsize": 9,
        "axes.titlesize": 10,
        "xtick.labelsize": 9,
        "ytick.labelsize": 9,
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
    }
):
    # "single": scatter stacked above the table; otherwise scatter left, table right.
    if FIG7_MODE == "single":
        fig = plt.figure(figsize=(3.35, 4.9))
        gs = fig.add_gridspec(2, 1, height_ratios=[3.0, 2.0], hspace=0.15)
        ax = fig.add_subplot(gs[0, 0])
        ax_tbl = fig.add_subplot(gs[1, 0])
    else:
        fig = plt.figure(figsize=(7.0, 3.6))
        gs = fig.add_gridspec(1, 2, width_ratios=[1.55, 1.0], wspace=0.06)
        ax = fig.add_subplot(gs[0, 0])
        ax_tbl = fig.add_subplot(gs[0, 1])

    ax.grid(True, alpha=0.25, linewidth=0.6)

    annotations = []
    for i, cid in enumerate(comm_order):
        sub = ch[ch["community"] == cid]
        if sub.empty:
            continue
        pid = plot_id[cid]
        marker = markers[i % len(markers)]
        color = colors[i]

        ax.scatter(
            sub["pca1"],
            sub["pca2"],
            s=44,
            marker=marker,
            alpha=0.88,
            facecolor=color,
            edgecolor="black",
            linewidth=0.6,
            zorder=2,
        )

        # Label anchored at the community centroid with a zero offset; the overlap solver
        # below moves the offset, and the arrow then links the label back to the centroid.
        cx, cy = float(sub["pca1"].mean()), float(sub["pca2"].mean())
        ann = ax.annotate(
            f"C{pid}",
            xy=(cx, cy),
            xycoords="data",
            xytext=(0, 0),
            textcoords="offset points",
            ha="center",
            va="center",
            fontsize=10,
            weight="bold",
            bbox=dict(boxstyle="round,pad=0.25", fc="white", ec="black", lw=0.9, alpha=0.98),
            arrowprops=dict(arrowstyle="-", color="0.35", lw=0.8, shrinkA=4, shrinkB=4),
            zorder=5,
        )
        annotations.append(ann)

    # Axis labels include the explained variance when it is available.
    ax.set_xlabel(f"PC1 ({pc1_var*100:.1f}% var.)" if pc1_var is not None else "PC1")
    ax.set_ylabel(f"PC2 ({pc2_var*100:.1f}% var.)" if pc2_var is not None else "PC2")
    ax.set_title("Channel communities by strategy mix", pad=3)

    # Run overlap resolution after final layout so the bbox coordinates match export
    fig.tight_layout()
    ok = _resolve_label_overlaps(fig, ax, annotations, max_iter=500)
    if not ok:
        print("[fig7] Warning: label overlap solver reached max_iter; consider FIG7_MODE='double' or increasing figsize.")

    # Compact signature panel: marker + C# (n) + top-2 strategy mix
    ax_tbl.axis("off")
    ax_tbl.set_xlim(0, 1)
    ax_tbl.set_ylim(0, 1)
    # Column anchors (axes coords)
    X_MARK = 0.04
    X_COMM = 0.20
    X_N = 0.32
    X_SIG = 0.38
    # Header row, followed by a thin rule underneath it.
    ax_tbl.text(X_COMM, 0.98, "Comm", ha="right", va="top", fontsize=9, weight="bold", transform=ax_tbl.transAxes)
    ax_tbl.text(X_N, 0.98, "n", ha="right", va="top", fontsize=9, weight="bold", transform=ax_tbl.transAxes)
    ax_tbl.text(
        X_SIG,
        0.98,
        "Signature (top 2 strategies; mean share)",
        ha="left",
        va="top",
        fontsize=9,
        weight="bold",
        transform=ax_tbl.transAxes,
    )
    ax_tbl.plot([0.02, 0.98], [0.94, 0.94], color="0.8", lw=0.8, transform=ax_tbl.transAxes, clip_on=False)

    # Ensure table rows are ordered C1..Ck
    sig_tbl = sig_table.copy()
    sig_tbl["_n"] = sig_tbl["community_id"].astype(str).str.replace("C", "", regex=False).astype(int)
    sig_tbl = sig_tbl.sort_values("_n").drop(columns=["_n"])

    # Rows are laid out top-down from y=0.90 with a fixed step; there is no check that
    # the last rows stay above y=0 when there are many communities.
    y = 0.90
    dy = 0.072 if FIG7_MODE == "double" else 0.078
    for r in sig_tbl.itertuples(index=False):
        # Look up the community's position in comm_order so the table marker/colour
        # matches the one used in the scatter.
        raw = int(r.community_raw)
        idx = comm_order.index(raw)
        marker = markers[idx % len(markers)]
        color = colors[idx]

        ax_tbl.scatter(
            [X_MARK],
            [y],
            s=48,
            marker=marker,
            facecolor=color,
            edgecolor="black",
            linewidth=0.6,
            transform=ax_tbl.transAxes,
            clip_on=False,
            zorder=3,
        )
        ax_tbl.text(
            X_COMM,
            y,
            f"{r.community_id}",
            ha="right",
            va="center",
            fontsize=9,
            weight="bold",
            transform=ax_tbl.transAxes,
        )
        ax_tbl.text(
            X_N,
            y,
            f"{int(r.n_channels)}",
            ha="right",
            va="center",
            fontsize=9,
            transform=ax_tbl.transAxes,
        )
        ax_tbl.text(
            X_SIG,
            y,
            str(r.sig_compact),
            ha="left",
            va="center",
            fontsize=9,
            transform=ax_tbl.transAxes,
        )
        y -= dy

    # Saved directly through the figure (not the savefig helper), so no second
    # tight_layout call happens after the labels were positioned.
    out_base = figdir / "fig7_channel_communities_pca"
    fig.savefig(str(out_base) + ".pdf", bbox_inches="tight", pad_inches=0.05)
    fig.savefig(str(out_base) + ".png", dpi=300, bbox_inches="tight", pad_inches=0.05)
    plt.close(fig)

# Community x cluster heatmap
# Fig 8: rows are channel communities, columns are the 15 clusters with the highest
# mean share across communities (largest first).
# NOTE: rebinds the module-level name `comm` to the table re-read from disk.
comm = pd.read_csv(OUT_DIR / "tables" / "community_mean_cluster_share.csv", index_col=0)
top_cols = comm.mean(axis=0).sort_values(ascending=False).head(15).index
comm = comm[top_cols].copy()

plt.figure(figsize=(10, 5))
plt.imshow(comm.values, aspect="auto")
# Tick labels are the raw community ids (rows) and cluster ids (columns).
plt.yticks(range(len(comm.index)), [str(i) for i in comm.index])
plt.xticks(range(len(comm.columns)), [str(c) for c in comm.columns], rotation=90)
plt.colorbar(label="Mean share")
plt.xlabel("Cluster")
plt.ylabel("Channel community")
savefig(figdir / "fig8_community_cluster_heatmap")

print("Wrote figures ->", figdir)


In [None]:
# --- Stage 8: Report scaffold (Markdown, no tabulate dependency) ---

# Required JSON artifacts (a missing file raises).
audit, base_meta, clmeta = (
    json.loads((OUT_DIR / "artifacts" / artifact_name).read_text(encoding="utf-8"))
    for artifact_name in ("audit.json", "base_table_meta.json", "strategy_cluster_meta.json")
)

lod_theme_path = OUT_DIR / "tables" / "logodds_theme_cb.csv"
lod_claim_path = OUT_DIR / "tables" / "logodds_claim_types_cb.csv"
lod_cta_path = OUT_DIR / "tables" / "logodds_ctas_cb.csv"
lod_evid_path = OUT_DIR / "tables" / "logodds_evidence_cb.csv"
stab_path = OUT_DIR / "tables" / "sensitivity_logodds_stability.csv"


def _top_rows_or_empty(csv_path, n_rows=12):
    """Return the first ``n_rows`` rows of an optional CSV, or an empty frame if it is absent."""
    if not csv_path.exists():
        return pd.DataFrame()
    return pd.read_csv(csv_path).head(n_rows)


# Optional tables: absent files become empty DataFrames.
lod_theme = _top_rows_or_empty(lod_theme_path)
lod_claim = _top_rows_or_empty(lod_claim_path)
lod_cta = _top_rows_or_empty(lod_cta_path)
lod_evid = _top_rows_or_empty(lod_evid_path)
stab = pd.read_csv(stab_path) if stab_path.exists() else pd.DataFrame()

# Required tables, truncated to their first rows.
# NOTE: rebinds the module-level name `sig` to a 20-row excerpt.
clsum = pd.read_csv(OUT_DIR / "tables" / "strategy_cluster_summary.csv").head(12)
sig = pd.read_csv(OUT_DIR / "tables" / "strategy_cluster_signatures.csv").head(20)

md: list[str] = []
md.append("# Dynamic Tag Analysis (Qwen codebook labels)\n")
md.append("## Scope and disclaimers\n")
md.append(
    "This section analyzes the *temporal and channel-level dynamics* of Qwen-assigned, codebook-constrained rhetorical tags "
    "(theme, claim/framing, CTA, evidence). The continuous score `combined_proba` is treated as an MBFC-informed credibility-risk proxy "
    "used only for *risk-weighted descriptive summaries*. We do **not** interpret the score as a probability of misinformation or factual falsity.\n"
)

md.append("## Data window\n")
md.append(f"- Analysis window start: **{audit['analysis_start']}**\n")
md.append(f"- Messages in window: **{audit['rows']}**\n")
md.append(f"- Channels in window: **{audit['channels']}**\n")
md.append(f"- Days in window: **{audit['days']}**\n")
md.append(
    "- Score quantiles: "
    f"q50={audit['score_quantiles']['q50']:.4f}, "
    f"q95={audit['score_quantiles']['q95']:.4f}, "
    f"q99={audit['score_quantiles']['q99']:.4f}\n"
)
md.append(
    f"- High-tail definition (descriptive): top **{audit['high_tail_quantile']:.2f}**, "
    f"threshold={audit['high_tail_threshold']:.6f}\n"
)

md.append("\n## High-tail association (descriptive)\n")
md.append(
    "We quantify tag association with the high-score tail using Monroe et al.-style log-odds with an informative Dirichlet prior "
    "(reported as log-odds and z-scores). This is descriptive (not causal).\n"
)

md.append("\nTop themes by log-odds z (high tail vs rest):\n")
if not lod_theme.empty:
    md.append(df_to_markdown(lod_theme[["tag", "count_high", "count_rest", "log_odds", "z"]]))
else:
    md.append("(Missing: outputs/tables/logodds_theme_cb.csv)\n")

md.append("\n\nTop claim/framing tags by log-odds z:\n")
if not lod_claim.empty:
    md.append(df_to_markdown(lod_claim[["tag", "count_high", "count_rest", "log_odds", "z"]]))
else:
    md.append("(Missing: outputs/tables/logodds_claim_types_cb.csv)\n")

md.append("\n\nTop CTA tags by log-odds z:\n")
if not lod_cta.empty:
    md.append(df_to_markdown(lod_cta[["tag", "count_high", "count_rest", "log_odds", "z"]]))
else:
    md.append("(Missing: outputs/tables/logodds_ctas_cb.csv)\n")

md.append("\n\nTop evidence tags by log-odds z:\n")
if not lod_evid.empty:
    md.append(df_to_markdown(lod_evid[["tag", "count_high", "count_rest", "log_odds", "z"]]))
else:
    md.append("(Missing: outputs/tables/logodds_evidence_cb.csv)\n")

md.append("\n## Sensitivity to tail definition\n")
md.append(
    "We recompute log-odds at multiple tail quantiles (q ∈ {0.90, 0.95, 0.97, 0.99}). "
    "The table reports stability vs the baseline tail (the notebook's `HIGH_TAIL_QUANTILE`) using Jaccard overlap of top-N tags and "
    "Spearman correlation of z-scores.\n"
)
if not stab.empty:
    md.append(
        df_to_markdown(
            stab[[
                "field",
                "tail_q",
                "thr",
                "n_high_msgs",
                "tail_rate",
                "topN",
                "jaccard_top10_vs_base",
                "jaccard_topN_vs_base",
                "spearman_z_vs_base",
            ]]
        )
    )
else:
    md.append("(Missing: outputs/tables/sensitivity_logodds_stability.csv)\n")

md.append("\n## Drift and burst case studies\n")
md.append("- Cluster JS drift: outputs/tables/cluster_js_drift_by_week.csv\n")
md.append("- Tag JS drift (weekly): outputs/tables/js_drift_{theme_cb,claim_types_cb,ctas_cb,evidence_cb}_by_week.csv\n")
md.append("- Burst peaks: outputs/tables/burst_case_study_peaks.csv\n")
md.append("- Burst prototypes: outputs/tables/burst_case_study_top_prototypes.csv\n")

md.append("\n## Strategy prototypes and clustering\n")
md.append(f"- Strategy clusters (k-means over prototype tag tokens): **k={clmeta['k_final']}**\n")
md.append("Top clusters by high-tail lift:\n")
md.append(df_to_markdown(clsum[["cluster", "n_msgs", "mean_risk", "high_tail_lift", "risk_mass_share"]]))

md.append("\n\nCluster signatures (top tokens per cluster):\n")
md.append(df_to_markdown(sig[["cluster", "top_tokens"]], floatfmt="{:.4f}"))

md.append("\n## Figures produced\n")
md.append("- outputs/figures/fig1_theme_share_week.{pdf,png}\n")
md.append("- outputs/figures/fig2_theme_risk_share_week.{pdf,png}\n")
md.append("- outputs/figures/fig3_prototype_coverage.{pdf,png}\n")
md.append("- outputs/figures/fig4_cluster_high_tail_lift.{pdf,png}\n")
md.append("- outputs/figures/fig5_cluster_burstiness.{pdf,png}\n")
md.append("- outputs/figures/fig6_cluster_js_drift.{pdf,png}\n")
md.append("- outputs/figures/fig7_channel_communities_pca.{pdf,png}\n")
md.append("- outputs/figures/fig8_community_cluster_heatmap.{pdf,png}\n")

out_md = OUT_DIR / "report" / "dynamic_tag_analysis_section.md"
out_md.write_text("\n".join(md), encoding="utf-8")

out_md
