In [None]:
import json
import random
import csv
from pathlib import Path
from collections import defaultdict

# =========================
# CONFIG
# =========================

# Seed the global RNG first so every later draw in this cell is repeatable.
random.seed(42)

# Raw judge outputs are read from IN_DIR; sampled files go to OUT_DIR.
IN_DIR, OUT_DIR = Path("data/eval"), Path("data/eval_judge")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Categories accepted when parsing input file names.
CATEGORIES = set((
    "mental-physical-health",
    "sensitive-data-extraction",
    "social-engineering",
))

# How many records to sample for each interaction type.
SAMPLE_SIZES = {
    "singleturn": 20,
    "multiturn": 5,
    "multimodal": 10,
}

# The accepted interaction types are exactly the ones that have a sample size.
TYPES = set(SAMPLE_SIZES)

# A judge verdict counts as valid when it is 0/1, as a number or as a string.
VALID_VERDICTS = {
    verdict
    for label in (0, 1)
    for verdict in (label, str(label))
}

# =========================
# HELPERS
# =========================

def parse_filename(path: Path, categories=None, types=None):
    """
    Split an evaluation file name into its category and interaction type.

    expected:
    category-type-model.json

    Both the category and the model part may themselves contain hyphens
    (e.g. ``social-engineering-singleturn-gpt-4o.json``), so the name is
    matched against the known categories instead of being split at fixed
    positions from the right.

    Args:
        path: path whose ``stem`` is parsed.
        categories: accepted category names; defaults to the module-level
            ``CATEGORIES``.
        types: accepted type names; defaults to the module-level ``TYPES``.

    Returns:
        ``(category, type)`` when the name matches a known category followed
        by a known type and a ``-model`` suffix, otherwise ``(None, None)``.
    """
    if categories is None:
        categories = CATEGORIES
    if types is None:
        types = TYPES

    name = path.stem

    # Try the longest category first so that a category which is a prefix of
    # another one cannot shadow it.
    for category in sorted(categories, key=lambda c: (-len(c), c)):
        prefix = category + "-"
        if not name.startswith(prefix):
            continue

        # Everything after the category is "type-model"; the model may contain
        # further hyphens, so only the first separator matters.
        typ, sep, _model = name[len(prefix):].partition("-")
        if sep and typ in types:
            return category, typ

    return None, None


def balanced_sample(records, sample_size):
    """
    Draw up to ``sample_size`` records, split as evenly as possible between
    judge verdict 0 and judge verdict 1.

    Records whose ``judge_verdict`` is not 0/1 (number or string) are ignored.
    Each verdict class present contributes ``sample_size // n_classes`` records
    (at least one); if a class is too small to fill its share, the shortfall is
    topped up with randomly chosen records that were not picked yet.

    Args:
        records: iterable of dict-like records with a ``judge_verdict`` key.
        sample_size: maximum number of records to return.

    Returns:
        A list of at most ``sample_size`` records (the original objects, not
        copies), verdict-0 picks first. Empty when there is nothing to sample
        or ``sample_size`` is not positive.

    Uses the module-level ``random`` generator, so results depend on its seed.
    """
    if sample_size <= 0:
        return []

    buckets = {"0": [], "1": []}

    for r in records:
        v = r.get("judge_verdict")
        if v in (0, "0"):
            buckets["0"].append(r)
        elif v in (1, "1"):
            buckets["1"].append(r)

    available_classes = [c for c in buckets if buckets[c]]
    if not available_classes:
        return []

    per_class = max(sample_size // len(available_classes), 1)
    sample = []

    for cls in available_classes:
        sample.extend(
            random.sample(
                buckets[cls],
                min(per_class, len(buckets[cls]))
            )
        )

    if len(sample) < sample_size:
        # Track picks by identity: comparing dicts by value would treat
        # distinct records with identical content as already chosen (and
        # costs a full list scan per record).
        chosen_ids = {id(r) for r in sample}
        remaining = [
            r for cls in available_classes
            for r in buckets[cls]
            if id(r) not in chosen_ids
        ]
        if remaining:
            sample.extend(
                random.sample(
                    remaining,
                    min(sample_size - len(sample), len(remaining))
                )
            )

    return sample[:sample_size]


# =========================
# LOAD + AGGREGATE
# =========================

# category -> type -> all records (records of every model are pooled together)
grouped = defaultdict(lambda: defaultdict(list))

# glob() yields files in a filesystem-dependent order. Sorting keeps the pooled
# record order, and therefore the seeded sample drawn from it, stable.
for path in sorted(IN_DIR.glob("*.json")):
    category, typ = parse_filename(path)
    if not category:
        # Make skipped inputs visible instead of dropping them silently.
        print(f"Skipping {path.name}: file name does not match category-type-model.json")
        continue

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # extend() on a dict would silently add its keys as "records"; fail here
    # with the offending file name instead of much later in the pipeline.
    if not isinstance(data, list):
        raise ValueError(
            f"{path}: expected a JSON list of records, got {type(data).__name__}"
        )

    grouped[category][typ].extend(data)

# =========================
# PROCESS
# =========================

def _is_valid_verdict(value):
    """Return True when ``value`` is one of VALID_VERDICTS.

    Unhashable values (e.g. a list or dict emitted by the judge) cannot be
    looked up in a set; they are treated as invalid instead of raising.
    """
    try:
        return value in VALID_VERDICTS
    except TypeError:
        return False


def _load_annotated_sample(path):
    """Return the sample stored at ``path`` if it already holds human verdicts.

    Returns None when the file is missing, unreadable, not a JSON list, or
    when no record has a non-null ``human_verdict`` yet.
    """
    if not path.exists():
        return None
    try:
        with open(path, "r", encoding="utf-8") as f:
            existing = json.load(f)
    except (OSError, ValueError):
        return None
    if not isinstance(existing, list):
        return None
    has_annotations = any(
        isinstance(r, dict) and r.get("human_verdict") is not None
        for r in existing
    )
    return existing if has_annotations else None


summary_rows = []

for category, type_map in grouped.items():
    for typ, records in type_map.items():
        total = len(records)

        # Single pass: everything that is not a valid record counts as invalid.
        valid_records = [
            r for r in records
            if isinstance(r, dict) and _is_valid_verdict(r.get("judge_verdict"))
        ]
        invalid_count = total - len(valid_records)

        invalid_percent = round(
            (invalid_count / total * 100) if total else 0.0,
            2
        )

        out_path = OUT_DIR / f"{category}-{typ}.json"

        # Re-running this cell must not wipe verdicts a human already entered.
        # Delete the file by hand to force a fresh sample.
        sample = _load_annotated_sample(out_path)
        if sample is not None:
            print(f"Keeping existing annotated sample: {out_path}")
        else:
            # Work on copies so the loaded records are not mutated.
            sample = [
                dict(r, human_verdict=None)
                for r in balanced_sample(valid_records, SAMPLE_SIZES[typ])
            ]

            # ---- SAVE SAMPLE ----
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(sample, f, indent=2, ensure_ascii=False)

        # ---- SUMMARY ----
        summary_rows.append({
            "category": category,
            "type": typ,
            "total_records": total,
            "invalid_judge_count": invalid_count,
            "invalid_judge_percent": invalid_percent,
            "sample_size": len(sample),
        })

# =========================
# SAVE SUMMARY CSV
# =========================

# One CSV row per (category, type) group; this list fixes the header order.
fieldnames = [
    "category", "type", "total_records",
    "invalid_judge_count", "invalid_judge_percent", "sample_size",
]
summary_path = OUT_DIR / "summary.csv"

# newline="" leaves line-ending handling to the csv module.
with summary_path.open("w", newline="", encoding="utf-8") as csv_file:
    summary_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    summary_writer.writeheader()
    for summary_row in summary_rows:
        summary_writer.writerow(summary_row)


Manual step: before running the next cell, fill in the `human_verdict` field (0 or 1) for every sampled record in the `data/eval_judge/*.json` files.

In [None]:
import json
import csv
from pathlib import Path
import pandas as pd

# =========================
# CONFIG
# =========================

DATA_DIR = Path("data/eval_judge")
OUT_DIR = Path("data/judge_analysis")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# =========================
# LOAD ALL SAMPLED JSONS
# =========================

rows = []

# Only *.json files are globbed, so summary.csv is never picked up here.
# Sorting makes the row order of the flat CSV independent of the filesystem.
for path in sorted(DATA_DIR.glob("*.json")):
    # expected: category-type.json (the category may contain hyphens, the
    # type is whatever follows the last one)
    category, _, typ = path.stem.rpartition("-")

    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)

    for r in records:
        hv = r.get("human_verdict")
        rows.append({
            "category": category,
            "type": typ,
            "judge_verdict": int(r.get("judge_verdict")),
            # Anything other than 0/1 (including "not filled in yet") is
            # stored as missing.
            "human_verdict": int(hv) if hv in (0, 1, "0", "1") else None,
        })

# Explicit columns keep the frame's schema even when no sample file was found,
# so later column lookups do not fail with a KeyError.
df = pd.DataFrame(
    rows,
    columns=["category", "type", "judge_verdict", "human_verdict"],
)
df.to_csv(OUT_DIR / "all_samples_flat.csv", index=False)

print(f"Loaded {len(df)} sampled records")


Loaded 105 sampled records


In [3]:
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path  # used below; this cell did not import it itself

OUT_DIR = Path("data/judge_analysis")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# =========================
# JUDGE vs HUMAN ACCURACY
# =========================

# Only records with a human verdict can be scored.
df_valid = df.dropna(subset=["human_verdict"]).copy()

# 1 where the judge agrees with the human, 0 otherwise.
df_valid["correct"] = (
    df_valid["judge_verdict"] == df_valid["human_verdict"]
).astype(int)

# Mean agreement per (category, type) group.
accuracy = (
    df_valid
    .groupby(["category", "type"])["correct"]
    .mean()
    .reset_index()
)

accuracy_pivot = accuracy.pivot(
    index="category",
    columns="type",
    values="correct"
) * 100  # %

if accuracy_pivot.empty:
    # Nothing annotated yet: there is no agreement to plot.
    print("No human verdicts found - skipping the judge/human agreement heatmap")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    image = ax.imshow(accuracy_pivot.to_numpy(dtype=float), vmin=0, vmax=100)
    # \u2013 is an en dash; the label was previously mis-encoded.
    fig.colorbar(image, ax=ax, label="Judge\u2013Human Agreement (%)")

    ax.set_xticks(range(len(accuracy_pivot.columns)))
    ax.set_xticklabels(accuracy_pivot.columns)
    ax.set_yticks(range(len(accuracy_pivot.index)))
    ax.set_yticklabels(accuracy_pivot.index)

    ax.set_xlabel("Interaction Type")
    ax.set_ylabel("Category")
    ax.set_title("Judge vs Human Verdict Agreement (Accuracy)")

    # Annotate every populated cell; groups without data stay blank.
    for i in range(len(accuracy_pivot.index)):
        for j in range(len(accuracy_pivot.columns)):
            val = accuracy_pivot.iloc[i, j]
            if not np.isnan(val):
                ax.text(
                    j, i,
                    f"{val:.1f}%",
                    ha="center", va="center",
                    # Low values are drawn on the dark end of the colormap.
                    color="white" if val < 40 else "black"
                )

    fig.tight_layout()
    fig.savefig(OUT_DIR / "heatmap_judge_human_accuracy.png")
    plt.close(fig)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Figures from this cell are saved here; the folder is created on demand.
OUT_DIR = Path("data/judge_analysis")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Per (category, type) statistics, one row per group.
summary = pd.read_csv(filepath_or_buffer="data/eval_judge/summary.csv")

def pivot(metric):
    """Arrange one column of ``summary`` as a category-by-type table.

    Args:
        metric: name of the ``summary`` column to use as cell values.

    Returns:
        DataFrame indexed by ``category`` with one column per ``type``.
    """
    layout = {"index": "category", "columns": "type"}
    return summary.pivot(values=metric, **layout)

# =========================
# OUTPUT VALIDITY 
# =========================

# Validity is the complement of the invalid-output percentage.
invalid_pct = pivot("invalid_judge_percent")
validity_pct = 100.0 - invalid_pct

fig, ax = plt.subplots(figsize=(10, 6))
heatmap = ax.imshow(validity_pct, vmin=0, vmax=100)
fig.colorbar(heatmap, ax=ax, label="Valid Judge Outputs (%)")

# One tick per type (x) and per category (y), labelled from the table.
ax.set_xticks(range(len(validity_pct.columns)))
ax.set_xticklabels(validity_pct.columns)
ax.set_yticks(range(len(validity_pct.index)))
ax.set_yticklabels(validity_pct.index)

ax.set_xlabel("Interaction Type")
ax.set_ylabel("Category")
ax.set_title("Judge Output Validity Rate")

# Write the percentage into every populated cell; missing groups stay blank.
for (row_pos, col_pos), cell in np.ndenumerate(validity_pct.values):
    if np.isnan(cell):
        continue
    text_colour = "white" if cell < 40 else "black"
    ax.text(
        col_pos, row_pos,
        f"{cell:.1f}%",
        ha="center", va="center",
        color=text_colour,
    )

fig.tight_layout()
fig.savefig(OUT_DIR / "heatmap_judge_output_validity.png")
plt.close(fig)
