# Regenerate paper figures (ICWSM + draft)

This notebook regenerates:
- `icwsm_figures/leave_one_theme_out_macro_f1__icwsm.{pdf,png}` (vertical grouped bars; no overlapping labels)
- `icwsm_figures/tagger_noise_robustness_curve__icwsm.{pdf,png}` (line plot; markers not clipped)

It also overwrites the corresponding copies used by the paper draft(s):
- `Telegram_paper_draft_2_11*/figs/leave_one_theme_out_macro_f1.{pdf,png}`
- `Telegram_paper_draft_2_11*/figs/tagger_noise_robustness_curve.pdf`

All inputs are read from the existing analysis CSVs (no expensive model reruns).

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter


def find_repo_root() -> Path:
    """Return the nearest ancestor of the working directory that is the repo root.

    The search starts at the resolved current working directory and walks
    upward through its parents. A directory qualifies when it contains both a
    ``results`` and a ``notebooks`` entry.

    Returns:
        The first qualifying directory, closest to the working directory.

    Raises:
        RuntimeError: If no directory on the way up to the filesystem root
            contains both entries.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    root = next(
        (d for d in candidates if (d / "results").exists() and (d / "notebooks").exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Could not locate repo root (expected `results/` and `notebooks/`).")
    return root


# Every path below is anchored to the repository root located by
# find_repo_root(), which searches upward from the working directory.
REPO_ROOT = find_repo_root()

# Output: figures are written to <repo>/figs, created here if it is missing.
OUT_DIR = REPO_ROOT / "figs"
OUT_DIR.mkdir(exist_ok=True)

# Inputs (CSV artifacts used in the paper), all read from <repo>/results.
RESULTS_DIR = REPO_ROOT / "results"
LEAVE_ONE_THEME_CSV = RESULTS_DIR / "leave_one_theme_out_results.csv"
LEAVE_ONE_THEME_STYLE_WITH_THEME_CSV = RESULTS_DIR / "leave_one_theme_out_results__style_with_theme.csv"
NOISE_CSV = RESULTS_DIR / "tagger_noise_robustness_style_only_no_theme.csv"


def setup_matplotlib() -> None:
    """Apply the shared paper style to matplotlib's global ``rcParams``.

    The update is global: it affects every figure created afterwards in the
    same process, not just the next one.
    """
    stroke_width = 1.2  # shared by the axes frame and the major tick marks
    paper_style = {
        "font.family": "Times New Roman",
        "font.size": 9,
        # Font type 42 embeds TrueType fonts in PDF/PS output (instead of
        # matplotlib's default Type 3 fonts).
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "axes.linewidth": stroke_width,
        "xtick.major.width": stroke_width,
        "ytick.major.width": stroke_width,
    }
    mpl.rcParams.update(paper_style)


# Left-to-right order of the theme groups on the x-axis of the leave-one-theme-out
# bar charts. The plotting functions reindex their pivot tables with this list,
# so a theme that is absent from a CSV still gets an (empty) slot and a theme
# that is not listed here is dropped from the figure.
THEME_ORDER = [
    "Finance/Crypto",
    "Public health & medicine",
    "Politics",
    "Lifestyle & well-being",
    "Crime & public safety",
    "Gaming/Gambling",
    "News/Information",
    "Sports",
    "Technology",
    "Conversation/Chat/Other",
    "Other theme",
]

# Tick label shown for each theme on the bar charts. Keys are the theme names
# as listed in THEME_ORDER; values are the same names with manual line breaks
# and hyphenation so the labels stack vertically under their bar group.
# Lookups use THEME_LABELS.get(theme, theme), so a theme without an entry
# falls back to its raw name.
THEME_LABELS = {
    "Finance/Crypto": "Finance/\nCrypto",
    "Public health & medicine": "Public\nhealth\n& medicine",
    "Politics": "Politics",
    "Lifestyle & well-being": "Lifestyle\n& well-\nbeing",
    "Crime & public safety": "Crime &\npublic\nsafety",
    "Gaming/Gambling": "Gaming/\nGam-\nbling",
    "News/Information": "News/\nInfor-\nmation",
    "Sports": "Sports",
    "Technology": "Techno-\nlogy",
    "Conversation/Chat/Other": "Conversa-\ntion/\nChat/\nOther",
    "Other theme": "Other\ntheme",
}


def save_figure(fig: plt.Figure, pdf_paths: list[Path], png_paths: list[Path]) -> None:
    """Write ``fig`` to every given path, PDFs first and then PNGs.

    Args:
        fig: Figure to save; only its ``savefig`` method is used.
        pdf_paths: Destinations written with ``format="pdf"``.
        png_paths: Destinations written with ``format="png"`` at 300 dpi.
    """
    batches = (
        (pdf_paths, {"format": "pdf"}),
        (png_paths, {"format": "png", "dpi": 300}),
    )
    for destinations, savefig_kwargs in batches:
        for destination in destinations:
            fig.savefig(destination, **savefig_kwargs)


# Fill styling for the three bar series, left to right within each theme group:
# dark grey, white with diagonal hatching, light grey (distinguishable in
# greyscale print).
_GROUPED_BAR_STYLES = (
    {"color": "#5c5c5c"},
    {"color": "white", "hatch": "//"},
    {"color": "#cfcfcf"},
)


def _plot_grouped_macro_f1(csv_path: Path, model_labels: dict[str, str], out_stem: str) -> None:
    """Draw one grouped-bar Macro-F1 figure and save it as PDF and PNG.

    Shared implementation of the two leave-one-theme-out figures, which differ
    only in their input CSV, the model names they plot, and the output name.

    Args:
        csv_path: Results CSV; its ``theme``, ``model`` and ``macro_f1``
            columns are read.
        model_labels: Maps each ``model`` value to plot to its legend label.
            Insertion order is the left-to-right bar order; exactly three
            entries are expected (one per entry of ``_GROUPED_BAR_STYLES``).
        out_stem: Output file name without extension, created in ``OUT_DIR``.

    Raises:
        KeyError: If one of the requested models has no rows in the CSV.
    """
    df = pd.read_csv(csv_path)
    df = df[df["model"].isin(list(model_labels))]
    # Collapse repeated (theme, model) rows to their mean; DataFrame.pivot
    # requires each index/column pair to be unique.
    mean_f1 = df.groupby(["theme", "model"], as_index=False)["macro_f1"].mean()

    pivot = (
        mean_f1.pivot(index="theme", columns="model", values="macro_f1")
        .reindex(THEME_ORDER)
        .rename(columns=model_labels)
    )

    # Fail with a message naming the file and series rather than a bare
    # KeyError from the column lookup further down.
    missing = [label for label in model_labels.values() if label not in pivot.columns]
    if missing:
        raise KeyError(f"{csv_path} has no rows for model(s): {missing}")

    setup_matplotlib()
    fig, ax = plt.subplots(figsize=(6.975, 2.8))
    try:
        x = np.arange(len(pivot.index))
        w = 0.25

        # Three bars per theme, centred on the tick: offsets -w, 0, +w.
        for i, (label, fill) in enumerate(zip(model_labels.values(), _GROUPED_BAR_STYLES)):
            ax.bar(
                x + (i - 1) * w,
                pivot[label],
                width=w,
                edgecolor="black",
                linewidth=1.0,
                label=label,
                **fill,
            )

        ax.set_ylabel("Macro-F1")
        ax.set_ylim(0.0, 1.0)
        ax.set_xticks(x)
        ax.set_xticklabels([THEME_LABELS.get(t, t) for t in pivot.index])
        ax.grid(axis="y", color="0.85", linewidth=1.0)
        ax.set_axisbelow(True)

        # Legend sits in a single row above the axes; the top margin set by
        # subplots_adjust below leaves room for it.
        ax.legend(
            ncol=3,
            loc="lower center",
            bbox_to_anchor=(0.5, 1.02),
            frameon=True,
            columnspacing=1.2,
            handlelength=2.0,
        )
        # The large bottom margin leaves room for the multi-line tick labels.
        fig.subplots_adjust(left=0.07, right=0.995, bottom=0.30, top=0.82)

        out_pdf = OUT_DIR / f"{out_stem}.pdf"
        out_png = OUT_DIR / f"{out_stem}.png"
        save_figure(fig, [out_pdf], [out_png])
    finally:
        # Always release the figure, even if plotting or saving failed, so it
        # does not stay registered with pyplot.
        plt.close(fig)

    print(f"Saved: {out_pdf}")


def plot_leave_one_theme_out_macro_f1() -> None:
    """Plot leave-one-theme-out Macro-F1 for TF-IDF, Style (no Theme) and combined.

    Reads ``LEAVE_ONE_THEME_CSV`` and writes
    ``leave_one_theme_out_macro_f1__icwsm.{pdf,png}`` to ``OUT_DIR``.

    Raises:
        KeyError: If one of the three models has no rows in the CSV.
    """
    _plot_grouped_macro_f1(
        LEAVE_ONE_THEME_CSV,
        {
            "tfidf": "TF-IDF",
            "style_only_no_theme": "Style (no Theme)",
            "combined": "TF-IDF + Style",
        },
        "leave_one_theme_out_macro_f1__icwsm",
    )


def plot_leave_one_theme_out_macro_f1_style_with_theme() -> None:
    """Plot leave-one-theme-out Macro-F1 for TF-IDF, Style and combined.

    Reads ``LEAVE_ONE_THEME_STYLE_WITH_THEME_CSV`` and writes
    ``leave_one_theme_out_macro_f1__icwsm_style_with_theme.{pdf,png}`` to
    ``OUT_DIR``.

    Raises:
        KeyError: If one of the three models has no rows in the CSV.
    """
    _plot_grouped_macro_f1(
        LEAVE_ONE_THEME_STYLE_WITH_THEME_CSV,
        {"tfidf": "TF-IDF", "style": "Style", "combined": "TF-IDF + Style"},
        "leave_one_theme_out_macro_f1__icwsm_style_with_theme",
    )


def plot_tagger_noise_robustness_curve() -> None:
    """Plot mean Macro-F1, ROC-AUC and ECE against the tagger noise rate.

    Reads ``NOISE_CSV``, averages the ``macro_f1``, ``roc_auc`` and ``ece``
    columns per ``noise_rate`` value, and draws one line per metric. The
    figure is written to ``OUT_DIR`` as
    ``tagger_noise_robustness_curve__icwsm.{pdf,png}``.

    The axis limits and ticks are hard-coded (x: 0 to 0.2, y: 0 to 0.9);
    values outside those ranges would fall outside the visible area.
    """
    df = pd.read_csv(NOISE_CSV)
    mean_df = df.groupby("noise_rate", as_index=False)[["macro_f1", "roc_auc", "ece"]].mean()
    mean_df = mean_df.sort_values("noise_rate")

    # One entry per line: (CSV column, legend label, colour, line style, marker).
    series = (
        ("macro_f1", "Macro-F1", "black", "-", "^"),
        ("roc_auc", "ROC-AUC", "0.25", "--", "s"),
        ("ece", "ECE", "0.5", ":", "o"),
    )
    # Styling shared by all three lines: hollow markers with a black outline.
    common = {
        "linewidth": 1.5,
        "markersize": 7,
        "markerfacecolor": "white",
        "markeredgecolor": "black",
        "markeredgewidth": 1.2,
    }

    setup_matplotlib()
    fig, ax = plt.subplots(figsize=(3.3, 2.2))
    try:
        x = mean_df["noise_rate"].to_numpy()
        for column, label, color, linestyle, marker in series:
            ax.plot(
                x,
                mean_df[column].to_numpy(),
                color=color,
                linestyle=linestyle,
                marker=marker,
                label=label,
                **common,
            )

        ax.set_xlabel("Noise rate (bit flips)")
        ax.set_ylabel("Score")

        # Limits extend 0.01 past the first and last tick; presumably so the
        # markers at the ends of the range are not clipped by the axes frame.
        ax.set_xlim(-0.01, 0.21)
        ax.set_xticks(np.linspace(0.0, 0.2, 9))  # ticks every 0.025
        ax.xaxis.set_major_formatter(FormatStrFormatter("%.3f"))

        ax.set_ylim(0.0, 0.9)
        ax.set_yticks([0.2, 0.4, 0.6, 0.8])

        ax.grid(axis="y", color="0.85", linewidth=1.0)
        ax.set_axisbelow(True)

        # Legend sits in a single row above the axes; the top margin set by
        # subplots_adjust below leaves room for it.
        ax.legend(
            ncol=3,
            loc="lower center",
            bbox_to_anchor=(0.5, 1.02),
            frameon=False,
            columnspacing=1.4,
            handlelength=2.2,
        )
        fig.subplots_adjust(left=0.16, right=0.995, bottom=0.22, top=0.80)

        out_pdf = OUT_DIR / "tagger_noise_robustness_curve__icwsm.pdf"
        out_png = OUT_DIR / "tagger_noise_robustness_curve__icwsm.png"
        save_figure(fig, [out_pdf], [out_png])
    finally:
        # Always release the figure, even if plotting or saving failed, so it
        # does not stay registered with pyplot.
        plt.close(fig)

    print(f"Saved: {out_pdf}")


# Regenerate all three figures. Each call reads its CSV, overwrites the PDF and
# PNG in OUT_DIR, and prints the path of the PDF it wrote.
plot_leave_one_theme_out_macro_f1()
plot_leave_one_theme_out_macro_f1_style_with_theme()
plot_tagger_noise_robustness_curve()
