This notebook exists to capture the conversation-level metrics for all of the runs. There is a separate notebook for capturing the run-level metrics. (Each run comprises 5000 conversations)

In [16]:
from pathlib import Path
import json
import polars as pl
import numpy as np

Each conversation is tagged with a "theme", but that tag is only present in the input data. Addionally, each rubric item is tagged with an "axis". The axis is available in the results JSON.

In [None]:
# Pull out the theme for each conversation. This dataframe will be joined into
# the rest of the results later on.
conversation_input_file = Path("inputs/2025-05-07-06-14-12_oss_eval.jsonl")
conversation_lines = conversation_input_file.read_text().split("\n")


def theme_from_conversation(conversation: dict) -> str:
    for tag in conversation["example_tags"]:
        if tag.startswith("theme"):
            return tag.split(":")[1]
    raise ValueError(
        f"Conversation does not have theme: {conversation['example_tags']}. {conversation['prompt_id']}"
    )


rows = []
for conversation_line in conversation_lines:
    if conversation_line == "":
        continue
    conversation = json.loads(conversation_line)
    rows.append(
        {
            "conversation_id": conversation["prompt_id"],
            "theme": theme_from_conversation(conversation),
        }
    )
themes = pl.DataFrame(rows)


In [None]:
results_dirs = [
    Path("5df4ba309cb03369f6663786ae6a9904385524a9/maverick"),
    Path("5df4ba309cb03369f6663786ae6a9904385524a9/scout"),
    Path("328b8ce1d49f3834a8d91db014ca8de3e95e77f8/llama-3.3"),
    Path("9252fc34f9bc391262e8b71138ee3b86e7d0ad7a/o3"),
    Path("9eb82b46b5ef7faf6ec102a989421543e84d4228/llama-3.1-8b"),
]
result_files = []
for path in results_dirs:
    files = list(path.glob("*_allresults.json"))
    result_files.extend(files)

In [3]:
result_files

[PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_153644_allresults.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_211736_allresults.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_165626_allresults.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251023_212754_allresults.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251026_154748_allresults.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251024_

In [9]:
def get_model_name_and_execution_time(results: Path) -> tuple[str, str]:
    _, model, day, time, _ = results.stem.split("_")
    return model, f"{day}_{time}"

In [101]:
def axis_from_rubric_item(rubric_item: dict) -> str:
    for tag in rubric_item["tags"]:
        if tag.startswith("axis"):
            return tag.split(":")[1]
    raise ValueError(
        f"Unable to find axis for rubric item. Tags: {rubric_item['tags']}"
    )


rows = []
for filepath in result_files:
    model, run_id = get_model_name_and_execution_time(filepath)
    run_results = json.loads(filepath.read_text())
    for conversation_results in run_results["metadata"]["example_level_metadata"]:
        for rubric_item_index, rubric_item in enumerate(
            conversation_results["rubric_items"]
        ):
            rows.append(
                {
                    "model": model,
                    "run_id": run_id,
                    "conversation_id": conversation_results["prompt_id"],
                    "rubric_criterion_index": rubric_item_index,
                    "axis": axis_from_rubric_item(rubric_item),
                    "points": rubric_item["points"],
                    "criterion_met": rubric_item["criteria_met"],
                }
            )
conversation_results = pl.DataFrame(rows).join(themes, on="conversation_id")


In [102]:
conversation_results

model,run_id,conversation_id,rubric_criterion_index,axis,points,criterion_met,theme
str,str,str,i64,str,i64,bool,str
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",0,"""completeness""",10,true,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",1,"""completeness""",9,false,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",2,"""context_awareness""",7,false,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",3,"""accuracy""",-10,false,"""context_seeking"""
"""llama-4-maverick""","""20251027_153644""","""1f548d5b-cd00-49a0-b327-283a2e…",4,"""completeness""",8,true,"""context_seeking"""
…,…,…,…,…,…,…,…
"""llama-3.1-8b""","""20251030_123814""","""e1cf80cd-965e-4a64-b2e3-a95497…",13,"""context_awareness""",5,true,"""emergency_referrals"""
"""llama-3.1-8b""","""20251030_123814""","""68b20b72-f599-4cd0-a724-097d50…",0,"""accuracy""",9,false,"""global_health"""
"""llama-3.1-8b""","""20251030_123814""","""68b20b72-f599-4cd0-a724-097d50…",1,"""completeness""",8,false,"""global_health"""
"""llama-3.1-8b""","""20251030_123814""","""68b20b72-f599-4cd0-a724-097d50…",2,"""completeness""",7,false,"""global_health"""


In [103]:
conversation_results.write_csv("healthbench_granular_data.tsv", separator="\t")

## Calculating Scores
The rest of the cells exist to calculate the overall, theme, and axis scores to reproduce what is in the theme_data.csv, axis_data.csv, and JSON result files.

In [None]:
def get_mean_and_confidence_intervals(scores: pl.Series) -> dict:
    scores_np = scores.to_numpy()
    mean = np.clip(np.mean(scores_np), 0, 1).item()
    upper, lower = np.percentile(
        [np.mean(np.random.choice(scores_np, len(scores_np))) for _ in range(1000)],
        [2.5, 97.5],
    )
    return {"mean": mean, "lower_ci": float(lower), "upper_ci": float(upper)}

In [None]:
theme_results = (
    conversation_results.vstack(
        # Add another copy of the DF where all conversations are duplicated
        # under the "all" theme, in order to calculate the overall score and CIs
        conversation_results.with_columns(pl.lit("all").alias("theme"))
    )
    .with_columns(
        # Determine whether the points should actually be awarded for each criterion
        pl.when("criterion_met")
        .then("points")
        .otherwise(pl.lit(0))
        .alias("criterion_points_awarded"),
    )
    .group_by("model", "run_id", "theme", "conversation_id")
    .agg(
        # Calculate the total number of points possible for the conversation
        pl.col("points")
        .filter(pl.col("points") > 0)
        .sum()
        .alias("conversation_points_possible"),
        # Sum the points awarded for this conversation, note that the score for an
        # individual conversation can be negative, but the overall/theme/axis score
        # is clipped to a minimum of 0
        pl.col("criterion_points_awarded").sum().alias("conversation_points_awarded"),
    )
    .with_columns(
        # Calculate the score for each conversation
        (
            pl.col("conversation_points_awarded")
            / pl.col("conversation_points_possible")
        ).alias("conversation_score")
    )
    .group_by("model", "run_id", "theme")
    .agg(
        # Calculate a mean with CI at the run level
        pl.col("conversation_score")
        .map_batches(function=get_mean_and_confidence_intervals, returns_scalar=True)
        .alias("results_struct")
    )
    .unnest("results_struct")
)

In [None]:
theme_results

model,run_id,theme,conversation_id,conversation_points_awarded,conversation_points_possible,conversation_score
str,str,str,str,i64,i64,f64
"""llama-3.3-70b""","""20251028_205737""","""hedging""","""472251ad-dd7a-4ea9-965a-37dc12…",5,38,0.131579
"""llama-3.3-70b""","""20251028_205737""","""context_seeking""","""041e7d49-6aa1-4c56-af79-051d01…",32,48,0.666667
"""llama-3.3-70b""","""20251029_004114""","""context_seeking""","""4f097549-151a-4844-899a-3c3db5…",0,23,0.0
"""llama-4-maverick""","""20251027_165626""","""all""","""bcda6dfc-b4fa-42f1-96af-2e751f…",0,51,0.0
"""llama-3.3-70b""","""20251029_004114""","""hedging""","""e59dc979-71b3-4702-90f8-d62d1e…",45,81,0.555556
…,…,…,…,…,…,…
"""llama-4-scout""","""20251026_005928""","""all""","""eefd92ef-79df-4b4b-bfdc-fa62bd…",12,65,0.184615
"""llama-4-maverick""","""20251027_165626""","""health_data_tasks""","""6314e6ed-65e4-475e-a4ab-fd141d…",0,37,0.0
"""llama-4-scout""","""20251026_154748""","""all""","""2ffec207-51b1-439d-8404-165ce4…",0,76,0.0
"""llama-3.3-70b""","""20251028_213700""","""health_data_tasks""","""6726cc55-d3d3-41bd-95a5-ec1cec…",19,47,0.404255


In [96]:
theme_results.filter(
    (pl.col("model") == "llama-4-scout") & (pl.col("run_id") == "20251024_161630")
).sort(by="theme")

model,run_id,theme,mean,lower_ci,upper_ci
str,str,str,f64,f64,f64
"""llama-4-scout""","""20251024_161630""","""all""",0.248817,0.259437,0.238414
"""llama-4-scout""","""20251024_161630""","""communication""",0.327913,0.354857,0.300975
"""llama-4-scout""","""20251024_161630""","""complex_responses""",0.277469,0.32157,0.232058
"""llama-4-scout""","""20251024_161630""","""context_seeking""",0.146587,0.174102,0.119266
"""llama-4-scout""","""20251024_161630""","""emergency_referrals""",0.411371,0.449624,0.374191
"""llama-4-scout""","""20251024_161630""","""global_health""",0.144872,0.164754,0.125622
"""llama-4-scout""","""20251024_161630""","""health_data_tasks""",0.289996,0.327951,0.25115
"""llama-4-scout""","""20251024_161630""","""hedging""",0.242986,0.262826,0.220866


In [92]:
axis_results = (
    conversation_results.vstack(
        # Add another copy of the DF where all conversations a duplicated under the
        # "all" theme, in order to calculate the overall score and CI
        conversation_results.with_columns(pl.lit("all").alias("axis"))
    )
    .with_columns(
        # Determine whether the points should actually be awarded for each criterion
        pl.when("criterion_met")
        .then("points")
        .otherwise(pl.lit(0))
        .alias("criterion_points_awarded"),
    )
    .group_by("model", "run_id", "axis", "conversation_id")
    .agg(
        # Calculate the total number of points possible for the conversation
        pl.col("points")
        .filter(pl.col("points") > 0)
        .sum()
        .alias("conversation_points_possible"),
        # Sum the points awarded for this quesiton, note that the score for an
        # individual question can be negative, but the overall/theme/axis score
        # is clipped to a minimum of 0
        pl.col("criterion_points_awarded").sum().alias("conversation_points_awarded"),
    )
    # Filter out questions that do not have any criteria for a given axis
    .filter(pl.col("conversation_points_possible") > 0)
    .with_columns(
        # Calculate the score for each conversation
        (
            pl.col("conversation_points_awarded")
            / pl.col("conversation_points_possible")
        ).alias("conversation_score")
    )
    .group_by("model", "run_id", "axis")
    .agg(
        # Calculate a mean with CI at the run level
        pl.col("conversation_score")
        .map_batches(function=get_mean_and_confidence_intervals, returns_scalar=True)
        .alias("results_struct")
    )
    .unnest("results_struct")
)

In [93]:
axis_results.filter(
    (pl.col("model") == "llama-4-scout") & (pl.col("run_id") == "20251024_161630")
).sort("axis")

model,run_id,axis,mean,lower_ci,upper_ci
str,str,str,f64,f64,f64
"""llama-4-scout""","""20251024_161630""","""accuracy""",0.380528,0.372354,0.388701
"""llama-4-scout""","""20251024_161630""","""all""",0.248817,0.243258,0.254377
"""llama-4-scout""","""20251024_161630""","""communication_quality""",0.655078,0.643244,0.666912
"""llama-4-scout""","""20251024_161630""","""completeness""",0.114009,0.105132,0.122887
"""llama-4-scout""","""20251024_161630""","""context_awareness""",0.248394,0.236761,0.260027
"""llama-4-scout""","""20251024_161630""","""instruction_following""",0.477346,0.459662,0.495031


In [None]:
combined = axis_results.select(
    "model",
    "run_id",
    pl.lit("axis").alias("theme_or_axis"),
    pl.col("axis").alias("theme_or_axis_name"),
    "mean",
    "lower_ci",
    "upper_ci",
)

model,run_id,theme_or_axis,theme_or_axis_name,mean,lower_ci,upper_ci
str,str,str,str,f64,f64,f64
"""llama-3.3-70b""","""20251029_012947""","""axis""","""all""",0.246996,0.24163,0.252362
"""llama-4-scout""","""20251024_161630""","""axis""","""all""",0.248817,0.243258,0.254377
"""llama-3.3-70b""","""20251028_205737""","""axis""","""accuracy""",0.3697,0.361509,0.377891
"""llama-3.3-70b""","""20251029_030629""","""axis""","""accuracy""",0.376129,0.367866,0.384392
"""llama-4-scout""","""20251024_144601""","""axis""","""instruction_following""",0.472583,0.454868,0.490298
…,…,…,…,…,…,…
"""llama-4-maverick""","""20251027_165626""","""axis""","""completeness""",0.096537,0.087927,0.105146
"""llama-4-scout""","""20251027_104623""","""axis""","""completeness""",0.117148,0.107631,0.126666
"""llama-3.3-70b""","""20251028_213700""","""axis""","""all""",0.245781,0.240343,0.25122
"""llama-4-maverick""","""20251027_165626""","""axis""","""communication_quality""",0.682917,0.671113,0.69472
