# Tabulate the `experiments.csv` files from each eval

In [1]:
import os
import json
import pandas as pd

In [2]:
# get the dir of this script
cwd = os.getcwd()

eval_dir = os.path.join(cwd, "eval")
print(f"eval_dir: {eval_dir} exists: {os.path.exists(eval_dir)}")

all_evals = os.listdir(eval_dir)

eval_dir: /scratch/eb3174/workspace/mllm_eval_hpc/eval exists: True


In [3]:
# experiment_csv_fname = "experiments.csv"
experiment_csv_fname = "experiments_rerun.csv"

evals_order = [
    ## llava
    # 'vqav2',
    'gqa',
    'vizwiz',
    'scienceqa',
    'textvqa',
    'pope',
    'mme',
    'mmbench_en',
    'mmbench_cn',
    'seed',
    # 'llava_w',
    # 'mmvet', # submission
    ## Addtl
    'mmmu',
    'mathvista',
    'ai2d',
    'chartqa',
    # 'docvqa', # submission
    # 'infovqa', # submission
    # 'stvqa', # submission
    'ocrbench',
    'mmstar',
    'realworldqa',
    'qbench',
    'blink',
    'mmvp',
    'vstar',
    'ade',
    'omni',
    'coco'
    # 'synthdog', # seems broken?
]
default_col = "accuracy"

evals_col_overrides = {
    'scienceqa': '100x_multimodal_acc',
    'mme': "Perception",
    'mmbench_en': "100x_circular_accuracy",
    'mmbench_cn': "100x_circular_accuracy",
    'seed': "100x_accuracy",
    ## Addtl
    'mmmu': "100x_accuracy",
    'mathvista': "100x_accuracy",
    'ocrbench': "total_accuracy['accuracy']",
    'qbench': "100x_accuracy",
    'blink': "100x_accuracy",
    'ade': "100x_accuracy",
    'omni': "100x_accuracy",
    'coco': "100x_accuracy",
}

dfs = []
for eval_name in evals_order:
    # print(f"\nProcessing {eval_name}")
    results_path = os.path.join(eval_dir, eval_name, experiment_csv_fname)
    if not os.path.exists(results_path):
        print(f"Skipping {eval_name} as no results file found")
        continue
    df = pd.read_csv(results_path)
    # print(f"Loaded {len(df)} results")
    # print(f"Columns: {df.columns}")
    if eval_name in evals_col_overrides:
        override = evals_col_overrides[eval_name]
        # if override == "100x_accuracy":
        if override.startswith("100x_"):
            override = override[5:]
            df["accuracy"] = df[override] * 100
        elif override == "total_accuracy['accuracy']":
            df["accuracy"] = df["total_accuracy"].apply(lambda x: json.loads(x.replace("'", '"'))["accuracy"])
        else:
            df["accuracy"] = df[override]
    df["eval_name"] = eval_name

    # drop duplicates, use latest "time"
    df = df.sort_values("time")
    df = df.drop_duplicates("model", keep="last")

    # only keep relevant columns
    df = df[["time", "eval_name", "model", "accuracy"]]
    dfs.append(df)

In [4]:
# combine all evals
all_results = pd.concat(dfs)
all_results

Unnamed: 0,time,eval_name,model,accuracy
0,2024-05-17 03:48:21,gqa,llava-vicuna-7b-iJEPA-vit-h-14-737k-bs512,55.374463
1,2024-05-17 07:25:38,gqa,llava-vicuna-7b-DFN-CLIP-737k-bs512,50.478992
2,2024-05-17 12:35:12,gqa,llava-2stage-ft-vicuna-7b-SigLIP-vit-l-737k-bs512,62.561616
3,2024-05-17 15:46:20,gqa,llava-2stage-ft-vicuna-7b-clip-convnext-XXL-re...,62.728574
4,2024-05-17 20:49:31,gqa,llava-vicuna-7b-clip-convnext-XXL-res1024-inte...,55.088249
...,...,...,...,...
1,2024-06-16 20:24:52,coco,llava-2stage_pt0.5M-ft-vicuna-7b-clip-convnext...,55.403727
2,2024-06-17 19:39:53,coco,llava-2stage-ft-vicuna-7b-MOCO-v3-vit-l-16-737...,40.745342
3,2024-06-17 19:46:53,coco,llava-vicuna-7b-MOCO-v3-vit-l-16-737k-bs512,35.155280
4,2024-06-17 20:04:27,coco,llava-2stage-ft-vicuna-7b-MOCO-v3-vit-l-16-737...,41.118012


In [5]:
all_results.sort_values("time").to_csv("all_results.csv", index=False)

In [6]:
# I want model on the rows, eval_name on the columns
pivot = all_results.pivot(index="model", columns="eval_name", values="accuracy")
# reorder the evals in the "evals_order" order
pivot = pivot[evals_order]
pivot

eval_name,gqa,vizwiz,scienceqa,textvqa,pope,mme,mmbench_en,mmbench_cn,seed,mmmu,...,ocrbench,mmstar,realworldqa,qbench,blink,mmvp,vstar,ade,omni,coco
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
llava-2stage-ft-vicuna-7b-DFN-CLIP-vit-l-737k-bs512,60.748927,53.086363,66.955875,52.845,85.322222,1341.135954,56.680851,49.483649,63.742711,33.798604,...,23.2,33.6,51.437908,59.063545,43.820933,26.666667,41.884817,47.077409,52.083333,
llava-2stage-ft-vicuna-7b-DFN-CLIP-vit-l-737k-bs512-mmlr1e-5-unfreeze,59.937987,52.954388,67.278136,52.228,83.822222,1364.911565,55.148936,49.827883,63.057683,35.59322,...,23.4,33.466667,50.849673,57.792642,43.631778,22.666667,39.790576,47.551343,53.5,59.627329
llava-2stage-ft-vicuna-7b-EVACLIP-737k-bs512,,,,,,,,,67.526172,,...,,,,60.535117,44.388398,,,,,
llava-2stage-ft-vicuna-7b-EVACLIP-737k-bs512-mmlr1e-5-unfreeze,65.177294,55.823107,68.616758,56.868,87.622222,1492.354242,65.531915,57.314974,69.75339,34.995015,...,29.9,35.733333,58.954248,59.264214,46.027743,44.666667,49.21466,,,
llava-2stage-ft-vicuna-7b-MAE-vit-l-16-737k-bs512,57.417713,52.095392,66.038671,44.532,81.755556,1132.80122,43.404255,35.197935,55.673435,35.59322,...,3.3,29.133333,47.712418,57.190635,41.740227,16.0,35.078534,53.129549,56.75,
llava-2stage-ft-vicuna-7b-MAE-vit-l-16-737k-bs512-mmlr1e-5-unfreeze,58.753379,50.145867,64.650471,44.674,83.588889,1181.508804,45.531915,38.296041,58.919413,34.995015,...,3.1,30.0,49.673203,58.394649,42.118537,18.666667,40.837696,54.294032,56.833333,
llava-2stage-ft-vicuna-7b-MOCO-v3-vit-l-16-737k-bs512,52.631579,50.173651,65.493307,43.57,79.688889,1015.195378,37.276596,29.948365,48.310265,34.49651,...,3.1,28.8,45.490196,56.38796,40.100883,14.666667,34.554974,41.232227,52.916667,40.745342
llava-2stage-ft-vicuna-7b-MOCO-v3-vit-l-16-737k-bs512-mmlr1e-5-unfreeze,53.649229,51.403103,65.493307,44.214,80.477778,1075.386555,39.148936,31.927711,50.137006,34.596211,...,3.0,28.733333,44.575163,56.32107,39.722573,14.666667,37.172775,40.758294,53.0,41.118012
llava-2stage-ft-vicuna-7b-SigLIP-5186k-bs512-mmlr1e-5-unfreeze,,,,,,1557.594738,,,,,...,,,,,,,,,,
llava-2stage-ft-vicuna-7b-SigLIP-737k-bs512,,,,,,,,,,,...,,,,,45.271122,,,,,


In [7]:
# pivot.to_excel("all_results.xlsx")
pivot.to_excel("all_results_rerun.xlsx")