In [1]:
import pandas as pd
import numpy as np
import os

  from pandas.core import (


In [2]:
def bootstrap_ci(scores, n_bootstrap=10000, confidence_level=95, random_state=None):
    """Bootstrap the mean of ``scores`` and return a percentile confidence interval.

    Each of the ``n_bootstrap`` replicates draws ``len(scores)`` values from
    ``scores`` with replacement and records the mean of that resample. The
    summary statistics are then computed over the replicate means.

    Parameters
    ----------
    scores : array-like
        One-dimensional collection of numeric scores (list, ndarray, Series).
    n_bootstrap : int, default 10000
        Number of bootstrap replicates.
    confidence_level : float, default 95
        Confidence level in percent; the interval spans the
        ``(100 - confidence_level) / 2`` and ``100 - (100 - confidence_level) / 2``
        percentiles of the replicate means.
    random_state : None, int, or numpy.random.Generator, optional
        Seed (or generator) used for resampling so that results can be
        reproduced. With the default ``None`` the global ``np.random`` state is
        used, which is the behaviour this function had before the parameter
        existed (so an external ``np.random.seed(...)`` still takes effect).

    Returns
    -------
    tuple
        ``(mean_score, std_score, ci_lower, ci_upper)`` where ``mean_score`` and
        ``std_score`` are the mean and standard deviation of the bootstrap
        means, and ``ci_lower`` / ``ci_upper`` are the percentile bounds.
    """
    # Convert once up front instead of letting every `choice` call re-convert
    # a list / Series into an array.
    values = np.asarray(scores)
    n_samples = len(values)

    # Both the `np.random` module and a `Generator` expose a compatible
    # `choice(a, size=..., replace=...)`, so either can serve as the sampler.
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    bootstrap_means = np.array([
        np.mean(rng.choice(values, size=n_samples, replace=True))
        for _ in range(n_bootstrap)
    ])

    mean_score = np.mean(bootstrap_means)
    std_score = np.std(bootstrap_means)

    # Symmetric two-sided interval: split the excluded mass equally between tails.
    lower_percentile = (100 - confidence_level) / 2
    upper_percentile = 100 - lower_percentile
    ci_lower = np.percentile(bootstrap_means, lower_percentile)
    ci_upper = np.percentile(bootstrap_means, upper_percentile)
    return mean_score, std_score, ci_lower, ci_upper

## Table S2: Prompt Engineering and Temperature Ablations

In [5]:
# Shell escape: list the contents of the scores results directory (output shown below).
!ls /mnt/sohn2022/Adrian/rad-llm-pmhx/experiments/results/scores

llm_development_dataset


In [6]:
# Directory under which score CSVs are looked up as
# {BASEPATH}/{MODEL}/{experiment}/results_{NUM_SAMPLES}.csv
BASEPATH = (
    "/mnt/sohn2022/Adrian/rad-llm-pmhx/experiments/results/scores"
    "/llm_development_dataset"
)
MODEL = "gpt4o"
NUM_SAMPLES = 100  # appears in the results file name

# Every combination of shot count and prompt variant, in table order.
PROMPT_ENGINEERING_EXPERIMENTS = [
    f"{shots}_shot_{variant}"
    for shots in ("zero", "one")
    for variant in ("standard", "augmented")
]

# One experiment directory per sampling temperature.
TEMPERATURE_EXPERIMENTS = [f"temp_{temperature:.1f}" for temperature in (0.0, 0.5, 1.0)]

# Metric column names read from each results CSV.
METRICS = ["rouge", "medcon", "radgraph", "bertscore"]

In [21]:
def _scale_metrics_inplace(df, metric_names):
    """Multiply all metrics by 100."""
    for m in metric_names:
        df[m] = pd.to_numeric(df[m], errors="coerce") * 100

def _fmt_stats(mu, lo, hi, signed=False):
    if signed:
        return f"{mu:+.2f} [{lo:+.2f},{hi:+.2f}]"
    return f"{mu:.2f} [{lo:.2f},{hi:.2f}]"

# Seed the global NumPy RNG so the bootstrap intervals below are identical on
# every re-run of this cell. Without a seed the intervals drift slightly
# between runs, so values may differ in the last digits from earlier
# unseeded outputs.
BOOTSTRAP_SEED = 0
np.random.seed(BOOTSTRAP_SEED)


def _ci_rows(experiments):
    """Return one formatted table row (dict) per experiment name.

    For each experiment the results CSV is read from
    ``{BASEPATH}/{MODEL}/{experiment}/results_{NUM_SAMPLES}.csv``, the metric
    columns are scaled by 100, and every metric is summarised as
    ``"mean [ci_lower,ci_upper]"`` using ``bootstrap_ci``.
    """
    rows = []
    for experiment in experiments:
        exp_scores = pd.read_csv(f"{BASEPATH}/{MODEL}/{experiment}/results_{NUM_SAMPLES}.csv")
        _scale_metrics_inplace(exp_scores, METRICS)

        row = {"Condition": experiment}
        for m in METRICS:
            # The bootstrap standard deviation is not shown in the table.
            mu, _std, lo, hi = bootstrap_ci(exp_scores[m])
            row[m] = _fmt_stats(mu, lo, hi, signed=False)
        rows.append(row)
    return rows


# ----- Prompt Engineering -----
# NOTE(review): baseline_scores is not used anywhere in this cell; it is kept
# in case other cells rely on it — confirm before removing.
baseline_scores = pd.read_csv(f"{BASEPATH}/{MODEL}/zero_shot_standard/results_{NUM_SAMPLES}.csv")
_scale_metrics_inplace(baseline_scores, METRICS)

results = _ci_rows(PROMPT_ENGINEERING_EXPERIMENTS)

ordered_cols = ["Condition"] + METRICS
table_df = pd.DataFrame(results)[ordered_cols]
print(table_df)

# ----- Temperature Ablations -----
temp_rows = _ci_rows(TEMPERATURE_EXPERIMENTS)

table_df_temp = pd.DataFrame(temp_rows)[ordered_cols]
print(table_df_temp)

             Condition                rouge               medcon  \
0   zero_shot_standard    9.47 [8.08,11.03]  13.30 [10.99,15.73]   
1  zero_shot_augmented  20.07 [17.55,22.68]  23.91 [19.53,28.41]   
2    one_shot_standard  14.37 [12.39,16.45]  17.03 [14.06,20.10]   
3   one_shot_augmented  21.12 [18.33,23.98]  27.46 [22.69,32.30]   

             radgraph            bertscore  
0    4.57 [3.01,6.34]  18.56 [15.83,21.25]  
1  10.05 [6.62,13.91]  31.75 [28.70,34.68]  
2   8.73 [6.23,11.53]  26.23 [23.31,29.14]  
3   9.65 [6.53,12.98]  34.30 [31.12,37.43]  
  Condition              rouge               medcon          radgraph  \
0  temp_0.0  9.48 [8.02,11.08]  13.23 [10.95,15.64]  5.07 [3.50,6.84]   
1  temp_0.5  9.38 [8.00,10.87]  13.56 [11.29,15.96]  5.11 [3.33,7.13]   
2  temp_1.0  9.13 [7.93,10.37]  13.19 [10.89,15.82]  5.30 [3.18,7.91]   

             bertscore  
0  18.55 [15.91,21.18]  
1  19.04 [16.24,21.80]  
2  19.00 [16.27,21.74]  
