# Graphical Analysis of Structured Experiment Data

Visualizes analysis metrics from CSVs in `data/structured_analysis/` using `src/graphical_analysis` utilities.

- Exemplary single-experiment trajectories with all metrics overlaid.
- Aggregated similarity trajectories across experiments with 95% CIs.



In [None]:
# Setup: import path, plotting stack, project helpers, and discovery of input CSVs.
import sys
from pathlib import Path

# Add the relative directory "src" to the import search path; it is resolved
# against the current working directory. Presumably this is where the
# `graphical_analysis` module imported below lives.
# NOTE(review): DATA_ROOT further down is "../data/...", i.e. one level up,
# while this entry is "src" with no "../" - confirm the working directory
# from which the notebook is meant to run makes both paths valid.
sys.path.append("src")

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides warnings raised by pandas/matplotlib/seaborn.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Project-local helpers; this import must stay after the sys.path change above.
from graphical_analysis import (
    ANALYSIS_METRICS,
    list_experiment_csvs,
    load_experiment_rows,
    merge_metric_across_experiments,
    merge_metric_aligned_from_first_llm,
    aggregate_metric_across_experiments,
)

# Apply the seaborn "whitegrid" theme to all subsequent figures.
sns.set_theme(style="whitegrid")

# Root directory of the structured-analysis CSVs, relative to the working directory.
DATA_ROOT = Path("../data/structured_analysis")
# Fail fast when the data directory is missing.
assert DATA_ROOT.exists(), f"Missing data dir: {DATA_ROOT}"

# Collect the experiment CSV paths; the later cells iterate over `csv_paths`.
csv_paths = list_experiment_csvs(str(DATA_ROOT))
print(f"Found {len(csv_paths)} experiment CSVs")
# Stop here when nothing was found, since the plotting cells need at least one CSV.
assert csv_paths, "No experiment CSVs found in data/structured_analysis"


In [None]:
# Exemplary single-experiment trajectories (separate scales)
# One row of two side-by-side subplots per experiment:
#   left  - every metric in ANALYSIS_METRICS that is not a similarity metric
#   right - the similarity metrics on a fixed [0, 1] axis
# A dashed vertical line marks the iteration whose rel_iter_from_llm is 0.

import re
from graphical_analysis import compute_relative_iteration_from_first_llm

MAX_EXPERIMENTS = 3
SIMILARITY_METRICS = [
    "lexical_similarity",
    "semantic_similarity",
    "lexical_similarity_window",
    "semantic_similarity_window",
]

# Extracts the model name from an experiment directory name; compiled once
# because it is applied to every CSV path.
_MODEL_PATTERN = re.compile(r"single_agent_(.+?)_pivot")


def _plot_metric_lines(ax, iterations, frame, metrics):
    """Plot one line per usable metric on *ax* and return how many were drawn.

    A metric is skipped when *frame* has no column of that name or when the
    column holds only NaN values.
    """
    drawn = 0
    for metric in metrics:
        if metric not in frame.columns:
            continue
        vals = frame[metric].astype(float).to_numpy()
        if np.all(np.isnan(vals)):
            continue
        ax.plot(
            iterations,
            vals,
            marker="o",
            linewidth=1.6,
            markersize=3,
            label=metric,
            alpha=0.9,
        )
        drawn += 1
    return drawn


# Pick the first CSV of up to MAX_EXPERIMENTS distinct models. The directory
# name is used as the key when it does not match the expected pattern.
model_to_csv = {}
for p in csv_paths:
    exp_name = Path(p).parent.name
    match = _MODEL_PATTERN.search(exp_name)
    model_key = match.group(1) if match else exp_name
    if model_key not in model_to_csv:
        model_to_csv[model_key] = p
    if len(model_to_csv) >= MAX_EXPERIMENTS:
        break

# Load rows per selected experiment, dropping experiments without any rows.
exp_to_rows = {}
for model_key, p in model_to_csv.items():
    rows = load_experiment_rows(p)
    if rows:
        exp_to_rows[f"{Path(p).parent.name} (model={model_key})"] = rows

num_plots = len(exp_to_rows)
if num_plots == 0:
    # plt.subplots rejects a grid with zero rows, so report and skip plotting.
    print("No experiment rows loaded; nothing to plot")
else:
    # squeeze=False always yields a 2-D axes array, also for a single row.
    fig, axes = plt.subplots(
        num_plots,
        2,
        figsize=(14, 4.5 * num_plots),
        sharex="col",
        squeeze=False,
    )
    non_similarity_metrics = [
        metric for metric in ANALYSIS_METRICS if metric not in SIMILARITY_METRICS
    ]

    for row_idx, (exp, rows) in enumerate(exp_to_rows.items()):
        ax_left, ax_right = axes[row_idx]

        df = pd.DataFrame([vars(r) for r in rows])
        it = df["iteration"].to_numpy()

        # Absolute iteration of the first row whose relative index is 0;
        # stays None when no such row exists (including an all-NaN column).
        df_rel = compute_relative_iteration_from_first_llm(rows)
        candidates = df_rel[df_rel["rel_iter_from_llm"] == 0]
        x0 = None if candidates.empty else int(candidates.iloc[0]["iteration"])

        # Non-similarity metrics
        left_count = _plot_metric_lines(ax_left, it, df, non_similarity_metrics)
        ax_left.set_title(f"{exp}\nnon-similarity", wrap=True)
        ax_left.set_xlabel("iteration")
        ax_left.set_ylabel("value")
        if x0 is not None:
            ax_left.axvline(x=x0, color="#444", linestyle="--", linewidth=1.2, alpha=0.8)

        # Similarity metrics
        right_count = _plot_metric_lines(ax_right, it, df, SIMILARITY_METRICS)
        ax_right.set_title(f"{exp} — similarity", wrap=True)
        ax_right.set_xlabel("iteration")
        ax_right.set_ylabel("similarity [0, 1]")
        # Pad the labelled [0, 1] range on both sides; the previous lower
        # bound of 0.05 cut off every value below 0.05.
        # NOTE(review): if a similarity metric can be negative, widen this
        # range and update the axis label accordingly.
        ax_right.set_ylim(-0.05, 1.05)
        if x0 is not None:
            ax_right.axvline(x=x0, color="#444", linestyle="--", linewidth=1.2, alpha=0.8)

        # Only add a legend where at least one labelled line exists.
        if left_count:
            ax_left.legend(loc="upper left", fontsize=8, frameon=False)
        if right_count:
            ax_right.legend(loc="upper left", fontsize=8, frameon=False)

    plt.tight_layout()
    plt.show()


In [None]:
# Aggregated similarity trajectories, aligned on the first LLM-only generation.
# The x-axis is `rel_iter_from_llm` rather than the absolute iteration.
# Per metric, one figure shows the mean line, the std line (dashed) and a
# shaded band between ci_low and ci_high.

SIMILARITY_METRICS = [
    "lexical_similarity",
    "semantic_similarity",
]

WINDOW = 1

for metric in SIMILARITY_METRICS:
    merged_rel = merge_metric_aligned_from_first_llm(csv_paths, metric)
    if not merged_rel.empty:
        # The aggregator is given the relative index under the column name "iteration".
        aligned = merged_rel.rename(columns={"rel_iter_from_llm": "iteration"})
        agg = aggregate_metric_across_experiments(aligned, window=WINDOW)
        rel_iter = agg["iteration"]

        plt.figure(figsize=(10, 4))
        plt.plot(rel_iter, agg["mean"], color="C1", label=f"mean {metric}")
        plt.plot(rel_iter, agg["std"], color="C2", linestyle="--", label=f"std {metric}")
        plt.fill_between(rel_iter, agg["ci_low"], agg["ci_high"], color="C1", alpha=0.2)

        heading = (
            f"Aggregated {metric} across experiments aligned to first LLM-only gen (window={WINDOW})"
        )
        plt.title(heading)
        plt.xlabel("relative iteration from first LLM-only generation")
        plt.ylabel(metric)
        plt.legend(loc="best")
        plt.tight_layout()
        plt.show()
    else:
        # Nothing to aggregate for this metric; move on to the next one.
        print(f"No data for metric: {metric}")
