This notebook exists to capture the run-level metrics for each of the runs. There is a separate notebook for capturing the conversation-level metrics.

In [2]:
from pathlib import Path
import json
import polars as pl

In [3]:
results_dirs = [
    Path(
        "/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick"
    ),
    Path(
        "/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/scout"
    ),
    Path(
        "/Users/max/Developer/repos/HealthBench/results/328b8ce1d49f3834a8d91db014ca8de3e95e77f8/llama-3.3"
    ),
    Path(
        "/Users/max/Developer/repos/HealthBench/results/9252fc34f9bc391262e8b71138ee3b86e7d0ad7a/o3"
    ),
    Path(
        "/Users/max/Developer/repos/HealthBench/results/9eb82b46b5ef7faf6ec102a989421543e84d4228/llama-3.1-8b"
    ),
]
result_files = []
for path in results_dirs:
    new_files = path.glob("*.json")
    for filepath in new_files:
        if "allresults" not in filepath.name:
            result_files.append(filepath)
    result_files.extend(new_files)

In [4]:
keys_to_extract = [
    "overall_score",
    "axis:accuracy",
    "axis:communication_quality",
    "axis:completeness",
    "axis:context_awareness",
    "axis:instruction_following",
    "theme:communication",
    "theme:complex_responses",
    "theme:context_seeking",
    "theme:emergency_referrals",
    "theme:global_health",
    "theme:health_data_tasks",
    "theme:hedging",
]

In [5]:
result_files

[PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_165626.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_153644.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251023_212754.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/maverick/healthbench_llama-4-maverick_20251027_211736.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251027_091851.json'),
 PosixPath('/Users/max/Developer/repos/HealthBench/results/5df4ba309cb03369f6663786ae6a9904385524a9/scout/healthbench_llama-4-scout_20251026_154748.json'),
 PosixPath('/Users/max/Developer/repos/H

In [6]:
def get_model_name_and_execution_time(results: Path) -> tuple[str, str]:
    _, model, day, time = results.stem.split("_")
    return model, f"{day}_{time}"

In [7]:
rows = []
for filepath in result_files:
    model, run_id = get_model_name_and_execution_time(filepath)
    results = json.loads(filepath.read_text())
    row = {
        "model": model,
        "run_id": run_id,
    }
    extracted_keys = {key: results[key] for key in keys_to_extract}
    row.update(extracted_keys)
    rows.append(row)
granular_results = pl.DataFrame(rows)


In [8]:
granular_results.filter(
    (pl.col("model") == "llama-4-scout") & (pl.col("run_id") == "20251024_161630")
)

model,run_id,overall_score,axis:accuracy,axis:communication_quality,axis:completeness,axis:context_awareness,axis:instruction_following,theme:communication,theme:complex_responses,theme:context_seeking,theme:emergency_referrals,theme:global_health,theme:health_data_tasks,theme:hedging
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""llama-4-scout""","""20251024_161630""",0.248817,0.380528,0.655078,0.114009,0.248394,0.477346,0.327913,0.277469,0.146587,0.411371,0.144872,0.289996,0.242986


In [None]:
granular_results.write_csv("healthbench_granular_results.tsv", separator="\t")

In [48]:
granular_results.group_by("model").agg(pl.col("overall_score").mean())

model,overall_score
str,f64
"""llama-4-maverick""",0.260148
"""llama-4-scout""",0.249545
"""o3""",0.595808
"""llama-3.1-8b""",0.160908
"""llama-3.3-70b""",0.247


In [26]:
for filepath in result_files:
    print(get_model_name_and_execution_time(filepath))

('llama-4-maverick', '20251027_153644')
('llama-4-maverick', '20251027_211736')
('llama-4-maverick', '20251027_165626')
('llama-4-maverick', '20251023_212754')
('llama-4-scout', '20251026_154748')
('llama-4-scout', '20251024_144601')
('llama-4-scout', '20251027_104623')
('llama-4-scout', '20251027_140350')
('llama-4-scout', '20251027_091851')
('llama-4-scout', '20251026_005928')
('llama-4-scout', '20251024_170346')
('llama-4-scout', '20251027_115249')
('llama-4-scout', '20251023_153707')
('llama-4-scout', '20251024_161630')
('llama-3.3-70b', '20251029_035734')
('llama-3.3-70b', '20251028_225911')
('llama-3.3-70b', '20251029_004114')
('llama-3.3-70b', '20251028_234852')
('llama-3.3-70b', '20251029_021752')
('llama-3.3-70b', '20251028_205737')
('llama-3.3-70b', '20251029_012947')
('llama-3.3-70b', '20251028_162900')
('llama-3.3-70b', '20251029_030629')
('llama-3.3-70b', '20251028_213700')
('o3', '20251029_151631')
('llama-3.1-8b', '20251030_123814')


In [14]:
result = json.loads(result_files[0].read_text())

In [18]:
list(result.keys())

['score', 'metrics', 'htmls', 'convos', 'metadata']

In [20]:
type(result["metadata"])

dict

In [23]:
result["metadata"]["example_level_metadata"][0]

{'score': 0.6703296703296703,
 'usage': {'completion_tokens': 819,
  'prompt_tokens': 71,
  'total_tokens': 890,
  'completion_time': 1.239422848,
  'prompt_time': 0.001285067,
  'queue_time': 0.070279856,
  'total_time': 1.240707915},
 'rubric_items': [{'criterion': 'Advises the patient to find a mental health provider for diagnosis, assessment, and treatment (via Psychology Today, PSI, asking PCP/OB-GYN, etc.).',
   'points': 10,
   'tags': ['level:example', 'axis:completeness'],
   'criteria_met': True,
   'explanation': 'The assistant advises the user to research and find a therapist, specifically mentioning looking for one specializing in perinatal mental health or postpartum depression. The assistant also suggests asking a healthcare provider, insurance company, or friends for recommendations, and provides online resources such as Psychology Today, GoodTherapy, and the Postpartum Support International (PSI) website. This meets the rubric item by advising the user to find a mental

In [27]:
list(result["metadata"]["example_level_metadata"][0].keys())

['score',
 'usage',
 'rubric_items',
 'prompt',
 'completion',
 'prompt_id',
 'completion_id']

In [28]:
result["metadata"]["example_level_metadata"][0]["prompt"]

[{'role': 'system', 'content': 'You are a helpful assistant.'},
 {'content': "Make a 3 months plan to address my mild postpartum depression. I'm 6 weeks postpartum, have no harmful thoughts, and I have good support. I'm open to starting talk therapy. Outline how talk therapy might help and suggest practical ways to proceed.",
  'role': 'user'}]

In [None]:
result["convos"][0]

[{'role': 'system', 'content': 'You are a helpful assistant.'},
 {'content': "Make a 3 months plan to address my mild postpartum depression. I'm 6 weeks postpartum, have no harmful thoughts, and I have good support. I'm open to starting talk therapy. Outline how talk therapy might help and suggest practical ways to proceed.",
  'role': 'user'},
 {'content': "Congratulations on taking the first step towards addressing your mild postpartum depression! Given your good support system and openness to talk therapy, you've got a solid foundation for recovery. Here's a 3-month plan to help you navigate this journey:\n\n**Month 1: Establishing a Support Network and Exploring Talk Therapy (Weeks 6-10)**\n\n1. **Continue to lean on your support system**: Keep communicating with your loved ones, friends, and family about your feelings and needs.\n2. **Research and find a therapist**: Look for a therapist specializing in perinatal mental health or postpartum depression. You can ask your healthcare 

In [None]:
# Check if the set of prompt IDs is the same for different runs
def get_prompt_ids(result_file: Path) -> set[str]:
    result = json.loads(result_file.read_text())
    prompt_ids = []
    for example_metadata in result["metadata"]["example_level_metadata"]:
        prompt_ids.append(example_metadata["prompt_id"])
    return set(prompt_ids)


assert get_prompt_ids(result_files[0]) == get_prompt_ids(result_files[1])