In [2]:
import wandb
import pandas as pd

In [40]:
# W&B runs to compare. Every run lives under the same project, so the list is
# assembled from the shared URL prefix plus each run's ID.
_RUNS_URL_PREFIX = "https://wandb.ai/ai2-llm/olmo-ladder-benb/runs/"
run_urls = [
    _RUNS_URL_PREFIX + run_key
    for run_key in (
        "tedq3zes",
        "nyry8k36",
        "4gzo1e35",
        "eewei3q1",
        "9a4sn074",
        "ms3sd05k",
        "89cgvukx",
        "5sxy1ux0",
        "mlulk77o",
    )
]

In [43]:
# Fetch the downstream-eval summary metrics for every run in `run_urls` and
# collect them into `data` (a list of dicts, one per run) for tabulation.
api = wandb.Api(timeout=30)
wandb_entity = "ai2-llm"
wandb_project = "olmo-ladder-benb"

# Metric names to pull from each run's summary. The commented-out entries are
# currently excluded from the table.
metrics = [
    "arc_challenge_len_norm",
    "arc_easy_acc",
    "basic_arithmetic_acc",
    # "boolq_acc",
    "commonsense_qa_len_norm",
    # "copa_acc",
    "hellaswag_len_norm",
    "mmlu_humanities_var_len_norm",
    "mmlu_other_var_len_norm",
    "mmlu_social_sciences_var_len_norm",
    "mmlu_stem_var_len_norm",
    "openbook_qa_len_norm",
    "piqa_len_norm",
    "sciq_acc",
    "social_iqa_len_norm",
    "winogrande_acc"
]

# Each metric is stored in the run summary under "eval/downstream/<metric>".
metric_key_prefix = "eval/downstream/"

# One row per run: {"run_name": <name>, <metric>: <summary value>, ...}
data = []

# Loop through each run URL and retrieve the metrics
for run_url in run_urls:
    # The run ID is the last path segment. Strip a "?query" suffix and any
    # trailing "/" first, otherwise such URLs would produce an empty or
    # garbled ID and the lookup below would fail confusingly.
    run_id = run_url.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1]
    run = api.run(f"{wandb_entity}/{wandb_project}/{run_id}")

    row = {"run_name": run.name}
    missing_keys = []
    for metric in metrics:
        summary_key = f"{metric_key_prefix}{metric}"
        try:
            row[metric] = run.summary[summary_key]
        except KeyError:
            # Keep going so the error below reports every missing metric at once.
            missing_keys.append(summary_key)

    if missing_keys:
        # Same exception type as a plain summary lookup, but it names the run
        # and lists all absent keys instead of only the first one.
        raise KeyError(
            f"run {run.name!r} ({run_id}) has no summary value for: {missing_keys}"
        )
    data.append(row)


In [44]:

# Build the comparison table from the per-run rows, adding an "average"
# column holding the row-wise mean of all numeric columns.
scores_with_avg = pd.DataFrame(data).assign(
    average=lambda frame: frame.select_dtypes(include='number').mean(axis=1)
)

# Column order: first original column (run_name), then 'average', then the
# remaining columns in their original order.
other_cols = [name for name in scores_with_avg.columns if name != 'average']
cols = other_cols[:1] + ['average'] + other_cols[1:]

# Best-scoring run first.
final_df = scores_with_avg[cols].sort_values(by="average", ascending=False)

final_df

Unnamed: 0,run_name,average,arc_challenge_len_norm,arc_easy_acc,basic_arithmetic_acc,commonsense_qa_len_norm,hellaswag_len_norm,mmlu_humanities_var_len_norm,mmlu_other_var_len_norm,mmlu_social_sciences_var_len_norm,mmlu_stem_var_len_norm,openbook_qa_len_norm,piqa_len_norm,sciq_acc,social_iqa_len_norm,winogrande_acc
1,pos_eli5_oh_neg_dclm_refinedweb_steps_2000_lr3...,0.519302,0.411371,0.72807,0.405667,0.47502,0.613822,0.327082,0.431791,0.406952,0.316422,0.4,0.76333,0.926,0.458547,0.606156
3,pos_eli5_oh_neg_dclm_refinedweb_steps_2000_lr3...,0.514043,0.444816,0.719298,0.434333,0.461916,0.594503,0.32791,0.42173,0.40568,0.312417,0.386,0.730141,0.92,0.449846,0.588003
0,regression_synthetic_20epochs_bs640_lf1_lre35_...,0.512562,0.408027,0.74386,0.307333,0.443898,0.642601,0.322945,0.44668,0.385333,0.301291,0.422,0.764418,0.927,0.459058,0.601421
2,regression_synthetic_20epochs_bs640_lf1_lre35_...,0.512163,0.428094,0.745614,0.352667,0.403767,0.639713,0.315775,0.451911,0.383637,0.332443,0.4,0.767138,0.925,0.446776,0.577743
4,falcon_and_cc_og_eli5_oh_top10p-1B-5xC-2,0.508023,0.428094,0.708772,0.363,0.445536,0.629058,0.317154,0.44668,0.360746,0.296395,0.392,0.754081,0.91,0.457011,0.603788
8,fineweb_edu_dedup-1B-5xC,0.505339,0.408027,0.750877,0.305667,0.438165,0.587134,0.339217,0.457545,0.401441,0.286159,0.406,0.736126,0.921,0.447799,0.589582
6,fineweb_edu_dedup-1B-5xC-2,0.498841,0.367893,0.764912,0.297,0.425061,0.583051,0.32791,0.441046,0.394235,0.279484,0.412,0.737758,0.918,0.442682,0.592739
7,prox_fineweb_pro-1B-5xC-2,0.488075,0.364549,0.694737,0.274667,0.413595,0.62388,0.316878,0.432998,0.365409,0.269248,0.402,0.745375,0.902,0.445241,0.582478
5,baseline-1B-5xC-2,0.474626,0.29097,0.607018,0.325,0.488124,0.599881,0.281302,0.403219,0.305638,0.267468,0.346,0.754081,0.91,0.485159,0.5809


In [50]:
# Render the table as a pipe-style markdown table (row index omitted) and
# print the raw text so it can be copied elsewhere.
markdown_table = final_df.to_markdown(index=False)
print(markdown_table)

| run_name                                                       |   average |   arc_challenge_len_norm |   arc_easy_acc |   basic_arithmetic_acc |   commonsense_qa_len_norm |   hellaswag_len_norm |   mmlu_humanities_var_len_norm |   mmlu_other_var_len_norm |   mmlu_social_sciences_var_len_norm |   mmlu_stem_var_len_norm |   openbook_qa_len_norm |   piqa_len_norm |   sciq_acc |   social_iqa_len_norm |   winogrande_acc |
|:---------------------------------------------------------------|----------:|-------------------------:|---------------:|-----------------------:|--------------------------:|---------------------:|-------------------------------:|--------------------------:|------------------------------------:|-------------------------:|-----------------------:|----------------:|-----------:|----------------------:|-----------------:|
| pos_eli5_oh_neg_dclm_refinedweb_steps_2000_lr3e4_top20p-1B-5xC |  0.519302 |                 0.411371 |       0.72807  |               0.405667 |     