In [None]:
import os
import json
import numpy as np
from sklearn.metrics import roc_auc_score

def compute_ece(confidences, accuracies, n_bins=10):
    """Return the Expected Calibration Error (ECE) of a set of predictions.

    The interval [0, 1] is split into ``n_bins`` equal-width bins. For every
    non-empty bin the absolute gap between the mean accuracy and the mean
    confidence is weighted by the fraction of all samples that fall in the
    bin, and the weighted gaps are summed.

    Args:
        confidences: Array-like of predicted confidences.
        accuracies: Array-like of per-sample correctness values, same shape
            as ``confidences``.
        n_bins: Number of equal-width bins over [0, 1].

    Returns:
        The ECE as a float; ``0.0`` when there are no samples.

    Raises:
        ValueError: If ``confidences`` and ``accuracies`` differ in shape.
    """
    confidences = np.asarray(confidences, dtype=float)
    accuracies = np.asarray(accuracies, dtype=float)
    if confidences.shape != accuracies.shape:
        raise ValueError(
            "confidences and accuracies must have the same shape, got "
            f"{confidences.shape} and {accuracies.shape}"
        )

    edges = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        lower, upper = edges[i], edges[i + 1]
        in_bin = confidences >= lower
        if i == n_bins - 1:
            # The last bin is closed on the right so that a confidence of
            # exactly 1.0 is counted instead of falling outside every bin.
            in_bin &= confidences <= upper
        else:
            in_bin &= confidences < upper
        if np.any(in_bin):
            acc_bin = np.mean(accuracies[in_bin])
            conf_bin = np.mean(confidences[in_bin])
            # np.mean of the boolean mask is the fraction of samples in the bin.
            ece += np.abs(acc_bin - conf_bin) * np.mean(in_bin)
    return ece

def _mean_or_nan(values):
    """Return the mean of a NumPy array, or NaN when the array is empty."""
    return values.mean() if values.size else np.nan


def analyze_log_folder(folder="log_llm"):
    """Compute confidence/accuracy statistics for every JSON log in a folder.

    Each ``*.json`` file in ``folder`` is loaded and its ``"each_roll"`` list
    is read. Only rolls that carry both a non-null ``"confidence"`` and a
    non-null ``"accuracy"`` are used, so the two value arrays always stay
    aligned. Files without any usable roll are skipped.

    Args:
        folder: Directory containing the JSON log files.

    Returns:
        A ``(results, global_stats)`` tuple. ``results`` is a list with one
        dict per processed file (keys ``file``, ``mean_conf_correct``,
        ``mean_conf_wrong``, ``delta``, ``acc_rate``), ordered by file name.
        ``global_stats`` is a dict of the same kind of metrics pooled over all
        files, plus AUC, ECE and Brier score. Metrics that cannot be computed
        (e.g. no correct or no wrong samples) are NaN.
    """
    all_confs, all_accs = [], []

    results = []
    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
            data = json.load(f)

        # Filter both fields together: filtering them independently would
        # shift the arrays against each other when a roll lacks only one.
        pairs = [
            (r["confidence"], r["accuracy"])
            for r in data.get("each_roll", [])
            if r.get("confidence") is not None and r.get("accuracy") is not None
        ]
        if not pairs:
            continue

        confs = np.array([conf for conf, _ in pairs])
        accs = np.array([acc for _, acc in pairs])

        mean_conf_correct = _mean_or_nan(confs[accs == 1])
        mean_conf_wrong = _mean_or_nan(confs[accs == 0])
        # NaN propagates through the subtraction when either side is missing.
        delta = mean_conf_correct - mean_conf_wrong

        # Accumulate into the global pool.
        all_confs.extend(confs)
        all_accs.extend(accs)

        results.append({
            "file": filename,
            "mean_conf_correct": mean_conf_correct,
            "mean_conf_wrong": mean_conf_wrong,
            "delta": delta,
            "acc_rate": np.mean(accs),
        })

    # Global metrics.
    all_confs = np.array(all_confs)
    all_accs = np.array(all_accs)

    global_conf_correct = _mean_or_nan(all_confs[all_accs == 1])
    global_conf_wrong = _mean_or_nan(all_confs[all_accs == 0])

    global_stats = {
        "global_mean_conf_correct": global_conf_correct,
        "global_mean_conf_wrong": global_conf_wrong,
        "global_delta": global_conf_correct - global_conf_wrong,
        # AUC is only computed when more than one distinct label is present.
        "global_auc": roc_auc_score(all_accs, all_confs) if len(np.unique(all_accs)) > 1 else np.nan,
        "global_ece": compute_ece(all_confs, all_accs),
        "global_brier": _mean_or_nan((all_confs - all_accs) ** 2),
        "global_acc": _mean_or_nan(all_accs),
    }

    return results, global_stats

# Run the analysis on the "log_llm" folder; analyze_log_folder returns the
# per-file result dicts and the pooled global statistics.
results, global_stats = analyze_log_folder("log_llm")

# Print one line per processed file (each entry is a dict of metrics).
print("=== 每个文件的统计 ===")
for r in results:
    print(r)

# Print every global metric with four decimal places.
print("\n=== 全局统计 ===")
for k, v in global_stats.items():
    print(f"{k}: {v:.4f}")
