In [30]:
from bench_lib.evaluation import load_ai_labels, load_human_labels
from bench_lib.evaluation import compute_agreement_score


# Load the human annotations (requested with long=True) together with the
# question list and comment columns that are forwarded to the AI-label loader.
human_labels_long, questions, comment_cols = load_human_labels(long=True)
human_agreement = compute_agreement_score(human_labels_long)

# Hugging Face run: keep only the rows whose "Model ID" starts with "Qwen".
hf_labels_long = load_ai_labels(
    folders=["qwen-2.5-vl"],
    questions=questions,
    comment_cols=comment_cols,
).loc[lambda labels: labels["Model ID"].str.startswith("Qwen")]

# vLLM run: restrict to the posts and models that also appear in the HF labels
# so that both backends are compared on the same subset.
hf_post_ids = hf_labels_long["post_id"].unique()
hf_model_ids = hf_labels_long["Model ID"].unique()
vllm_labels_long = load_ai_labels(
    folders=["vllm-qwen2.5-vl"],
    questions=questions,
    comment_cols=comment_cols,
).loc[
    lambda labels: labels["post_id"].isin(hf_post_ids)
    & labels["Model ID"].isin(hf_model_ids)
]


In [31]:
import pandas as pd


# Ground truth = human labels restricted to the (post_id, variable) pairs that
# pass the `full_agreement` filter. `pd.merge` defaults to an inner join, so
# human-label rows without a matching agreement row are dropped.
ground_truth_long = pd.merge(
    human_labels_long,
    human_agreement.query("full_agreement"),
    # `on` is documented as "label or list"; pass a list rather than a tuple
    # so the two join keys are unambiguous and within the documented contract.
    on=["post_id", "variable"],
)

In [None]:
from bench_lib.evaluation import compute_ai_perfs

# Score each inference backend against the same ground truth and tag the
# resulting rows with the backend name in a `type` column.
labels_by_backend = {"hf": hf_labels_long, "vllm": vllm_labels_long}
perfs = [
    compute_ai_perfs(ground_truth_long, backend_labels, questions).assign(type=backend)
    for backend, backend_labels in labels_by_backend.items()
]
# Stack the per-backend results into a single frame for plotting.
perfs_df = pd.concat(perfs)

In [None]:
import seaborn as sns

# Observation: vLLM appears to score slightly higher than HF.
# Identical results are not expected, because vLLM's multimodal
# pre-processing apparently differs from the HF one.
sns.catplot(
    data=perfs_df,
    kind="bar",
    col="Model ID",
    x="type",
    y="f1",
)