In [None]:
import json
import sys
import pandas as pd
import matplotlib.pyplot as plt


def load_results(path: str) -> dict:
    """Load an lm-evaluation-harness results file and return the parsed JSON.

    Args:
        path: Location of the JSON file on disk.

    Returns:
        The decoded top-level JSON document, unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; pin the encoding so non-ASCII content
    # is decoded the same way regardless of the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def build_dataframe(results: dict) -> pd.DataFrame:
    """Convert a per-task results mapping into a tidy pandas DataFrame.

    Args:
        results: Mapping of task name -> metrics mapping. The metrics are
            looked up under the keys "acc,none", "acc_stderr,none",
            "acc_norm,none" and "acc_norm_stderr,none"; absent keys are
            treated as missing values.

    Returns:
        A DataFrame with one row per task and the columns
        Task, Accuracy, Acc_StdErr, Norm_Acc, Norm_Acc_StdErr (in that
        order). The metric columns are always float64, with NaN marking
        values that are missing or not numeric.
    """
    # Output column -> key used in the per-task metrics mapping.
    metric_keys = {
        "Accuracy": "acc,none",
        "Acc_StdErr": "acc_stderr,none",
        "Norm_Acc": "acc_norm,none",
        "Norm_Acc_StdErr": "acc_norm_stderr,none",
    }
    rows = [
        {"Task": task, **{col: metrics.get(key) for col, key in metric_keys.items()}}
        for task, metrics in results.items()
    ]
    # Passing the column list explicitly keeps the schema stable even when
    # `results` is empty (otherwise the frame would have no columns at all).
    frame = pd.DataFrame(rows, columns=["Task", *metric_keys])
    # A column whose values are all None would stay `object` dtype, and
    # non-numeric placeholders would poison aggregations; coerce so that
    # downstream mean() calls always see floats / NaN.
    for col in metric_keys:
        frame[col] = pd.to_numeric(frame[col], errors="coerce").astype("float64")
    return frame


# Read the harness output and keep only its per-task "results" section.
json_path = "results_last_ckpt.json"
data = load_results(json_path)
results = data["results"]

# One row per task, then dump the whole table as plain text.
df = build_dataframe(results)
print("\n=== Evaluation Results Summary ===")
print(df.to_string(index=False))

# Unweighted means across all tasks for the two accuracy columns.
avg_acc, avg_norm_acc = (df[column].mean() for column in ("Accuracy", "Norm_Acc"))
print(
    f"\nAverage Accuracy: {avg_acc:.4f}",
    f"Average Normalized Accuracy: {avg_norm_acc:.4f}",
    sep="\n",
)

# # Plot Accuracy with error bars
# plt.figure(figsize=(10, 6))
# plt.barh(df["Task"], df["Accuracy"], 
#          xerr=df["Acc_StdErr"], capsize=5, color="skyblue")
# plt.xlabel("Accuracy")
# plt.title("LM Evaluation Harness Results (Accuracy with StdErr)")
# plt.tight_layout()
# plt.show()

In [3]:
import json
import sys
import pandas as pd
from collections import defaultdict


def load_results(path: str) -> dict:
    """Load an lm-evaluation-harness results file and return its "results" section.

    Args:
        path: Location of the JSON file on disk.

    Returns:
        The value stored under the top-level "results" key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no top-level "results" key.
    """
    # JSON is UTF-8 by specification; pin the encoding so non-ASCII content
    # is decoded the same way regardless of the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["results"]


def extract_prefix(task: str) -> str:
    """
    Return the task's prefix (e.g. mmlu_physics -> mmlu).

    Rule: everything before the first "_"; a name that contains no "_"
    is returned unchanged.
    """
    # partition() yields the whole string as the head when the separator
    # is absent, so no separate "no underscore" branch is needed.
    head, _sep, _tail = task.partition("_")
    return head


def build_dataframe(results: dict) -> pd.DataFrame:
    """Convert a per-task results mapping into a tidy pandas DataFrame.

    Args:
        results: Mapping of task name -> metrics mapping. The metrics are
            looked up under the keys "acc,none" and "acc_norm,none";
            absent keys are treated as missing values.

    Returns:
        A DataFrame with one row per task and the columns
        Task, Prefix, Accuracy, Norm_Acc (in that order). Prefix is
        computed by extract_prefix(). The metric columns are always
        float64, with NaN marking values that are missing or not numeric.
    """
    # Output column -> key used in the per-task metrics mapping.
    metric_keys = {
        "Accuracy": "acc,none",
        "Norm_Acc": "acc_norm,none",
    }
    rows = [
        {
            "Task": task,
            "Prefix": extract_prefix(task),
            **{col: metrics.get(key) for col, key in metric_keys.items()},
        }
        for task, metrics in results.items()
    ]
    # Passing the column list explicitly keeps the schema stable even when
    # `results` is empty (otherwise the frame would have no columns at all).
    frame = pd.DataFrame(rows, columns=["Task", "Prefix", *metric_keys])
    # A metric that no task reports would otherwise be an all-None `object`
    # column, which numeric-only aggregations silently drop; coerce so the
    # column survives as float NaN.
    for col in metric_keys:
        frame[col] = pd.to_numeric(frame[col], errors="coerce").astype("float64")
    return frame


print("\n=== benchmark ===")
json_path = "results_last_ckpt_benchmark.json"
results = load_results(json_path)
df = build_dataframe(results)

# print("\n=== All Tasks ===")
# print(df.to_string(index=False))

# Average every numeric metric within each task-name prefix.
grouped = (
    df.groupby("Prefix")
    .mean(numeric_only=True)
    .reset_index()
)

print("\n=== Grouped Results by Prefix ===")
print(grouped.to_string(index=False))

# Unweighted mean of the per-prefix means. Series.mean() skips NaN by
# default, so each average only covers the prefixes that report that metric.
avg_acc, avg_norm_acc = (grouped[column].mean() for column in ("Accuracy", "Norm_Acc"))
print(
    f"\nAverage Accuracy: {avg_acc:.4f}",
    f"Average Normalized Accuracy: {avg_norm_acc:.4f}",
    sep="\n",
)


=== benchmark ===

=== Grouped Results by Prefix ===
         Prefix  Accuracy  Norm_Acc
            arc  0.336122  0.334288
      hellaswag  0.280920  0.298048
     humanities  0.236132       NaN
        lambada  0.203377       NaN
           mmlu  0.237545  0.266861
     openbookqa  0.178000  0.288000
          other  0.274541       NaN
           piqa  0.605550  0.603917
social sciences  0.269418       NaN
           stem  0.225182       NaN
     winogrande  0.498816       NaN

Average Accuracy: 0.3041
Average Normalized Accuracy: 0.3582


In [4]:
print("\n=== our model ===")
json_path = "results_last_ckpt.json"
results = load_results(json_path)
df = build_dataframe(results)

# print("\n=== All Tasks ===")
# print(df.to_string(index=False))

# Average every numeric metric within each task-name prefix.
grouped = (
    df.groupby("Prefix")
    .mean(numeric_only=True)
    .reset_index()
)

print("\n=== Grouped Results by Prefix ===")
print(grouped.to_string(index=False))

# Unweighted mean of the per-prefix means. Series.mean() skips NaN by
# default, so each average only covers the prefixes that report that metric.
avg_acc, avg_norm_acc = (grouped[column].mean() for column in ("Accuracy", "Norm_Acc"))
print(
    f"\nAverage Accuracy: {avg_acc:.4f}",
    f"Average Normalized Accuracy: {avg_norm_acc:.4f}",
    sep="\n",
)


=== our model ===

=== Grouped Results by Prefix ===
         Prefix  Accuracy  Norm_Acc
            arc  0.335719  0.332581
      hellaswag  0.282016  0.296355
     humanities  0.234644       NaN
        lambada  0.202989       NaN
           mmlu  0.239920  0.262708
     openbookqa  0.184000  0.292000
          other  0.274220       NaN
           piqa  0.605005  0.605550
social sciences  0.278193       NaN
           stem  0.226134       NaN
     winogrande  0.496448       NaN

Average Accuracy: 0.3054
Average Normalized Accuracy: 0.3578
