This notebook exists to capture the conversation-level metrics for all of the runs. There is a separate notebook for capturing the run-level metrics. (Each run comprises 5000 conversations)

In [1]:
import datetime
from pathlib import Path
import json

import polars as pl
import numpy as np

Each conversation is tagged with a "theme", but that tag is only present in the input data. Addionally, each rubric item is tagged with an "axis". The axis is available in the results JSON.

In [2]:
# Pull out the theme for each conversation. This dataframe will be joined into
# the rest of the results later on.
conversation_input_file = Path("inputs/2025-05-07-06-14-12_oss_eval.jsonl")
conversation_lines = conversation_input_file.read_text().split("\n")


def theme_from_conversation(conversation: dict) -> str:
    for tag in conversation["example_tags"]:
        if tag.startswith("theme"):
            return tag.split(":")[1]
    raise ValueError(
        f"Conversation does not have theme: {conversation['example_tags']}. {conversation['prompt_id']}"
    )


rows = []
for conversation_line in conversation_lines:
    if conversation_line == "":
        continue
    conversation = json.loads(conversation_line)
    rows.append(
        {
            "conversation_id": conversation["prompt_id"],
            "theme": theme_from_conversation(conversation),
        }
    )
themes = pl.DataFrame(rows)


In [3]:
results_dirs = [
    Path("5df4ba309cb03369f6663786ae6a9904385524a9/maverick"),
    Path("5df4ba309cb03369f6663786ae6a9904385524a9/scout"),
    Path("328b8ce1d49f3834a8d91db014ca8de3e95e77f8/llama-3.3"),
    Path("9252fc34f9bc391262e8b71138ee3b86e7d0ad7a/o3"),
    Path("9eb82b46b5ef7faf6ec102a989421543e84d4228/llama-3.1-8b"),
    Path("66a515a50edfaa2c8f21674d4141a124b50ef286/llama-4-maverick-rag"),
    Path("66a515a50edfaa2c8f21674d4141a124b50ef286/llama-4-scout-rag"),
]
result_files = []
for path in results_dirs:
    files = list(path.glob("*_allresults.json"))
    result_files.extend(files)

In [4]:
result_files

[PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_153644_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_211736_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_165626_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251023_212754_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251026_154748_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251024_144601_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251027_104623_allresults.json'),
 PosixPath('5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251027_140350_allresults.json'),
 PosixPath('5df4

In [5]:
def get_model_name_and_execution_time(results: Path) -> tuple[str, str]:
    _, model, day, time, _ = results.stem.split("_")
    return model, f"{day}_{time}"

In [6]:
def axis_from_rubric_item(rubric_item: dict) -> str:
    for tag in rubric_item["tags"]:
        if tag.startswith("axis"):
            return tag.split(":")[1]
    raise ValueError(
        f"Unable to find axis for rubric item. Tags: {rubric_item['tags']}"
    )


rows = []
for filepath in result_files:
    model, run_id = get_model_name_and_execution_time(filepath)
    run_results = json.loads(filepath.read_text())
    for conversation_results in run_results["metadata"]["example_level_metadata"]:
        for rubric_item_index, rubric_item in enumerate(
            conversation_results["rubric_items"]
        ):
            rows.append(
                {
                    "model": model,
                    "run_id": run_id,
                    "conversation_id": conversation_results["prompt_id"],
                    "rubric_criterion_index": rubric_item_index,
                    "axis": axis_from_rubric_item(rubric_item),
                    "points": rubric_item["points"],
                    "criterion_met": rubric_item["criteria_met"],
                }
            )
conversation_results = pl.DataFrame(rows).join(themes, on="conversation_id")


In [7]:
conversation_results

model,run_id,conversation_id,rubric_criterion_index,axis,points,criterion_met,theme
str,str,str,i64,str,i64,bool,str
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",0,"""completeness""",10,true,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",1,"""completeness""",9,false,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",2,"""context_awareness""",7,false,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",3,"""accuracy""",-10,false,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",4,"""completeness""",8,true,"""context_seeking"""
…,…,…,…,…,…,…,…
"""llama-4-scout-rag""","""20251118_204328""","""e1cf80cd-965e-4a64-b2e3-a95497…",13,"""context_awareness""",5,true,"""emergency_referrals"""
"""llama-4-scout-rag""","""20251118_204328""","""68b20b72-f599-4cd0-a724-097d50…",0,"""accuracy""",9,false,"""global_health"""
"""llama-4-scout-rag""","""20251118_204328""","""68b20b72-f599-4cd0-a724-097d50…",1,"""completeness""",8,false,"""global_health"""
"""llama-4-scout-rag""","""20251118_204328""","""68b20b72-f599-4cd0-a724-097d50…",2,"""completeness""",7,false,"""global_health"""


In [8]:
timestamp = datetime.date.today().strftime("%Y%m%d")
conversation_results.write_csv(
    f"healthbench_granular_data_{timestamp}.tsv", separator="\t"
)

## Calculating Scores
The rest of the cells exist to calculate the overall, theme, and axis scores to reproduce what is in the theme_data.csv, axis_data.csv, and JSON result files.

In [9]:
def get_mean_and_confidence_intervals(scores: pl.Series) -> dict:
    scores_np = scores.to_numpy()
    mean = np.clip(np.mean(scores_np), 0, 1).item()
    lower, upper = np.percentile(
        [np.mean(np.random.choice(scores_np, len(scores_np))) for _ in range(1000)],
        [2.5, 97.5],
    )
    return {"mean": mean, "lower_ci": float(lower), "upper_ci": float(upper)}

In [10]:
theme_results = (
    conversation_results.vstack(
        # Add another copy of the DF where all conversations are duplicated
        # under the "all" theme, in order to calculate the overall score and CIs
        conversation_results.with_columns(pl.lit("all").alias("theme"))
    )
    .with_columns(
        # Determine whether the points should actually be awarded for each criterion
        pl.when("criterion_met")
        .then("points")
        .otherwise(pl.lit(0))
        .alias("criterion_points_awarded"),
    )
    .group_by("model", "run_id", "theme", "conversation_id")
    .agg(
        # Calculate the total number of points possible for the conversation
        pl.col("points")
        .filter(pl.col("points") > 0)
        .sum()
        .alias("conversation_points_possible"),
        # Sum the points awarded for this conversation, note that the score for an
        # individual conversation can be negative, but the overall/theme/axis score
        # is clipped to a minimum of 0
        pl.col("criterion_points_awarded").sum().alias("conversation_points_awarded"),
    )
    .with_columns(
        # Calculate the score for each conversation
        (
            pl.col("conversation_points_awarded")
            / pl.col("conversation_points_possible")
        ).alias("conversation_score")
    )
    .group_by("model", "run_id", "theme")
    .agg(
        # Calculate a mean with CI at the run level
        pl.col("conversation_score")
        .map_batches(function=get_mean_and_confidence_intervals, returns_scalar=True)
        .alias("results_struct")
    )
    .unnest("results_struct")
)

In [11]:
theme_results

model,run_id,theme,mean,lower_ci,upper_ci
str,str,str,f64,f64,f64
"""llama-4-scout""","""20251027_115249""","""communication""",0.325642,0.29798,0.352324
"""llama-3.3-70b""","""20251029_021752""","""global_health""",0.145488,0.124821,0.16389
"""llama-4-maverick""","""20251027_153644""","""complex_responses""",0.291784,0.249196,0.331069
"""llama-3.1-8b""","""20251030_123814""","""emergency_referrals""",0.324698,0.283491,0.363218
"""llama-4-maverick-rag""","""20251118_161039""","""context_seeking""",0.09815,0.072109,0.123259
…,…,…,…,…,…
"""llama-3.3-70b""","""20251028_234852""","""all""",0.246677,0.235073,0.257951
"""llama-3.3-70b""","""20251028_225911""","""context_seeking""",0.149962,0.122349,0.178624
"""llama-3.3-70b""","""20251029_012947""","""global_health""",0.147815,0.128565,0.167432
"""llama-4-scout""","""20251027_104623""","""all""",0.250181,0.239498,0.260833


In [12]:
theme_results.filter(
    (pl.col("model") == "llama-4-scout") & (pl.col("run_id") == "20251024_161630")
).sort(by="theme")

model,run_id,theme,mean,lower_ci,upper_ci
str,str,str,f64,f64,f64
"""llama-4-scout""","""20251024_161630""","""all""",0.248817,0.238329,0.259538
"""llama-4-scout""","""20251024_161630""","""communication""",0.327913,0.29888,0.356458
"""llama-4-scout""","""20251024_161630""","""complex_responses""",0.277469,0.231303,0.3219
"""llama-4-scout""","""20251024_161630""","""context_seeking""",0.146587,0.118933,0.173854
"""llama-4-scout""","""20251024_161630""","""emergency_referrals""",0.411371,0.374932,0.443217
"""llama-4-scout""","""20251024_161630""","""global_health""",0.144872,0.123802,0.164027
"""llama-4-scout""","""20251024_161630""","""health_data_tasks""",0.289996,0.251364,0.327144
"""llama-4-scout""","""20251024_161630""","""hedging""",0.242986,0.221445,0.266253


In [13]:
axis_results = (
    conversation_results.vstack(
        # Add another copy of the DF where all conversations a duplicated under the
        # "all" theme, in order to calculate the overall score and CI
        conversation_results.with_columns(pl.lit("all").alias("axis"))
    )
    .with_columns(
        # Determine whether the points should actually be awarded for each criterion
        pl.when("criterion_met")
        .then("points")
        .otherwise(pl.lit(0))
        .alias("criterion_points_awarded"),
    )
    .group_by("model", "run_id", "axis", "conversation_id")
    .agg(
        # Calculate the total number of points possible for the conversation
        pl.col("points")
        .filter(pl.col("points") > 0)
        .sum()
        .alias("conversation_points_possible"),
        # Sum the points awarded for this quesiton, note that the score for an
        # individual question can be negative, but the overall/theme/axis score
        # is clipped to a minimum of 0
        pl.col("criterion_points_awarded").sum().alias("conversation_points_awarded"),
    )
    # Filter out questions that do not have any criteria for a given axis
    .filter(pl.col("conversation_points_possible") > 0)
    .with_columns(
        # Calculate the score for each conversation
        (
            pl.col("conversation_points_awarded")
            / pl.col("conversation_points_possible")
        ).alias("conversation_score")
    )
    .group_by("model", "run_id", "axis")
    .agg(
        # Calculate a mean with CI at the run level
        pl.col("conversation_score")
        .map_batches(function=get_mean_and_confidence_intervals, returns_scalar=True)
        .alias("results_struct")
    )
    .unnest("results_struct")
)

In [14]:
axis_results.filter(
    ((pl.col("model") == "llama-4-scout") | (pl.col("model") == "llama-4-scout-rag"))
    & (pl.col("axis") == "all")
).sort("axis")

model,run_id,axis,mean,lower_ci,upper_ci
str,str,str,f64,f64,f64
"""llama-4-scout""","""20251027_115249""","""all""",0.250344,0.239762,0.261344
"""llama-4-scout""","""20251024_161630""","""all""",0.248817,0.237675,0.259582
"""llama-4-scout""","""20251026_154748""","""all""",0.251353,0.241523,0.261722
"""llama-4-scout""","""20251024_170346""","""all""",0.2482,0.237123,0.259175
"""llama-4-scout""","""20251023_153707""","""all""",0.249285,0.239443,0.259903
…,…,…,…,…,…
"""llama-4-scout""","""20251026_005928""","""all""",0.247284,0.235884,0.258429
"""llama-4-scout""","""20251027_140350""","""all""",0.249213,0.238991,0.2599
"""llama-4-scout-rag""","""20251118_204328""","""all""",0.223146,0.212068,0.233759
"""llama-4-scout""","""20251027_091851""","""all""",0.251315,0.240097,0.261721


In [15]:
combined = axis_results.select(
    "model",
    "run_id",
    pl.lit("axis").alias("theme_or_axis"),
    pl.col("axis").alias("theme_or_axis_name"),
    "mean",
    "lower_ci",
    "upper_ci",
)