In [122]:
# Reload imported modules automatically before each cell is executed, so that
# edits to library code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [123]:
from bench_lib.utils import enable_info_logs

# Configure logging once, before any data is loaded.
# NOTE(review): presumably this sets the log level to INFO, judging by the
# helper's name — confirm in bench_lib.utils.
enable_info_logs()

In [124]:
import pandas as pd

In [None]:
from bench_lib.evaluation import load_human_labels


# The loader returns three values. `questions` is used below to select columns
# of `human_labels`; `comment_cols` is only passed on to `load_ai_labels`.
human_labels, questions, comment_cols = load_human_labels()
human_labels.head()

# Dataset Balance

In [None]:
# Relative frequency of every answer, computed separately for each question
# column and rounded to three decimals.
label_balance = (
    human_labels[questions]
    .apply(pd.Series.value_counts, normalize=True)
    .round(3)
)
label_balance


In [None]:
from bench_lib.evaluation import plot_scalars_for_questions

# Largest answer share per question, i.e. the frequency of its majority class.
majority_class_share = label_balance.max(axis=0)
plot_scalars_for_questions(
    majority_class_share,
    questions,
    "Majority Class %",
    x_reversed=True,
)

# Percentage Agreement


In [None]:
# Long format: one row per (classification_by, post_id, question), with the
# question name in "variable" and the given answer in "value".
human_labels_long = human_labels.melt(
    id_vars=["classification_by", "post_id"],
    value_vars=questions,
)
human_labels_long.head(2)

In [None]:
from bench_lib.evaluation import compute_agreement_score

# Agreement scores for the human labels, computed from the long-format table.
# Later cells rely on the result exposing the columns "post_id", "variable",
# "agreement_score" and "full_agreement".
human_agreement = compute_agreement_score(human_labels_long)
human_agreement.head(2)

In [None]:
# Average both agreement measures per question and order the questions from
# the lowest to the highest share of full agreement. The sort is chained rather
# than applied with inplace=True, so the frame is never mutated after it has
# been created.
agreement_by_question = (
    human_agreement.groupby("variable", as_index=False)
    .agg(
        avg_agreement_score=("agreement_score", "mean"),
        avg_full_agreement=("full_agreement", "mean"),
    )
    .sort_values("avg_full_agreement")
)
agreement_by_question


In [None]:
# Plot, per question, the mean of the "full_agreement" column.
# NOTE(review): that mean is presumably a fraction in [0, 1] while the axis
# label says "[%]" — confirm whether plot_scalars_for_questions rescales it.
plot_scalars_for_questions(
    agreement_by_question["avg_full_agreement"],
    agreement_by_question["variable"],
    "Posts with Full Agreement [%]",
)

# Krippendorff's alpha

In [None]:
import matplotlib.pyplot as plt

from bench_lib.evaluation import krippendorf_alpha


# One alpha per question, computed from the post ids and that question's labels.
post_ids = human_labels["post_id"]
alphas = [krippendorf_alpha(post_ids, human_labels[name]) for name in questions]
fig = plot_scalars_for_questions(
    alphas,
    questions,
    "Krippendorff's alpha",
    x_reversed=True,
)
# Optional export of the figure:
# fig.savefig("imgs/krippendorffs_alpha.pdf")
fig

# How does AI perform on the easy posts?
We define the easy posts as those on which all human annotators agree.
For these posts we take the human label as ground truth; it is unique by construction, because only posts with full agreement are kept.

In [None]:
# Keep only the human labels of (post_id, variable) pairs with full agreement;
# these rows are used as ground truth for the AI evaluation below.
# The join keys are passed as a list: that is the documented form of `on` and
# cannot be mistaken for a single tuple-valued column label.
# NOTE(review): the rows still carry "classification_by" from
# human_labels_long, so drop_duplicates() presumably leaves one row per
# annotator rather than one per (post_id, variable) — confirm that this is
# what compute_ai_perfs expects.
ground_truth_long = pd.merge(
    human_labels_long,
    human_agreement.query("full_agreement"),
    on=["post_id", "variable"],
).drop_duplicates()
ground_truth_long

In [None]:
from bench_lib.evaluation import load_ai_labels, compute_ai_perfs


# Folder names handed to load_ai_labels: three Gemma 3 entries plus one
# Gemini 2.5 Pro entry.
gemma3_folders = [
    "gemma-3-4b-it",
    "gemma-3-12b-it",
    "gemma-3-27b-it_00",
    "gemini-2.5-pro-noschema",
]
gemma3_ai_labels_long = load_ai_labels(
    folders=gemma3_folders,
    questions=questions,
    comment_cols=comment_cols,
)
# Compare the AI labels against the full-agreement human labels.
gemma3_ai_perfs = compute_ai_perfs(
    ground_truth_long,
    gemma3_ai_labels_long,
    questions,
)

In [None]:
# Number of ground-truth rows available for each question.
gt_sizes = (
    ground_truth_long.groupby("variable")
    .size()
    .reset_index(name="size")
)
plot_scalars_for_questions(
    gt_sizes["size"],
    gt_sizes["variable"],
    "Ground truth size",
)

In [136]:
from bench_lib.evaluation import plot_ai_perfs


# Model IDs in plotting order: the Gemma 3 models by increasing size, then Gemini.
g3_order = [
    *(f"google/gemma-3-{size}b-it" for size in (4, 12, 27)),
    "google/gemini-2.5-pro-preview-03-25",
]
metrics = ["accuracy", "precision", "recall", "f1"]
# Draw one figure per metric and write each to its own PDF under imgs/.
for metric in metrics:
    fig = plot_ai_perfs(
        gemma3_ai_perfs,
        g3_order,
        list(reversed(questions)),
        y=metric,
    )
    pdf_path = f"imgs/gemma3_ai_perfs_{metric}.pdf"
    fig.savefig(pdf_path, bbox_inches="tight")

In [None]:
# Load the Qwen 2.5-VL labels together with the Gemini labels and keep only
# rows whose "Model ID" mentions "Qwen" or "gemini".
# The filter is chained instead of using inplace=True, so the frame returned by
# the loader is never mutated. engine="python" is requested explicitly because
# the expression calls Series string methods, which the numexpr engine may not
# be able to evaluate.
qwen_ai_labels = load_ai_labels(
    folders=["qwen-2.5-vl", "gemini-2.5-pro-noschema"],
    questions=questions,
    comment_cols=comment_cols,
).query(
    "`Model ID`.str.contains('Qwen') or `Model ID`.str.contains('gemini')",
    engine="python",
)
qwen_ai_perfs = compute_ai_perfs(ground_truth_long, qwen_ai_labels, questions)
# Model IDs in plotting order: the Qwen models by increasing size, then Gemini.
order = [f"Qwen/Qwen2.5-VL-{n}B-Instruct" for n in [3, 7, 72]] + [
    "google/gemini-2.5-pro-preview-03-25"
]
# Draw one figure per metric and write each to its own PDF under imgs/.
for y in metrics:
    fig = plot_ai_perfs(qwen_ai_perfs, order, x_order=list(reversed(questions)), y=y)
    fig.savefig(f"imgs/qwen_ai_perfs_{y}.pdf", bbox_inches="tight")

# How does AI perform on the hard posts?
The hard posts cannot be evaluated against a single-label ground truth like the easy posts.
Instead, we measure how often the AI changes its mind, i.e. how self-consistent its answers are.

In [None]:
# Load the AI labels stored under the "self-consistency" folder; they feed the
# agreement computation in the next cell.
folders = ["self-consistency"]
ai_labels = load_ai_labels(folders, questions, comment_cols)

In [None]:
# Agreement of the AI answers, grouped per model, post and question.
ai_consistency_df = compute_agreement_score(
    ai_labels,
    groupby=["Model ID", "post_id", "variable"],
)

# Put AI and human agreement side by side for every (post_id, variable) pair
# present in both tables; columns shared by the two inputs receive the
# "_ai" / "_human" suffixes.
joint_consistency_df = ai_consistency_df.merge(
    human_agreement,
    on=["post_id", "variable"],
    suffixes=("_ai", "_human"),
)
joint_consistency_df

# Regression
* Detecting Hedonic entertainment and Intolerance significantly decreases the self-consistency of the AI
* Against my expectations, there is no significant effect of human consistency on AI consistency (the hypothesis was that humans are inconsistent on difficult posts, and so the AI would be too)


In [None]:
# Show the joined AI/human consistency table that the regression below uses.
joint_consistency_df

In [141]:
import numpy as np

# Derive model-level regressors from the "Model ID" string.
model_data = joint_consistency_df.assign(
    model_id=lambda df: df["Model ID"],
    # Integer taken from the "-<digits>b-" token of the lower-cased ID
    # (presumably the parameter count in billions). expand=False makes
    # str.extract return a Series instead of a one-column DataFrame, which is
    # the proper value for a single new column. astype(int) raises if an ID
    # contains no such token.
    model_size=lambda df: (
        df["Model ID"]
        .str.lower()
        .str.extract(r"-(\d+)b-", expand=False)
        .astype(int)
    ),
    # Every ID that does not contain "Qwen" is labelled "Gemma3".
    model_family=lambda df: np.where(
        df["Model ID"].str.contains("Qwen"), "Qwen2.5", "Gemma3"
    ),
)

In [None]:
# Perform linear regression to analyze how model size, model family, variable
# type and human consistency affect AI consistency
import statsmodels.formula.api as smf

ols_model = smf.ols(
    "agreement_score_ai ~ model_size + C(model_family) + C(variable) + agreement_score_human",
    data=model_data,
)
result = ols_model.fit()
print("Linear Regression Results:")
print(result.summary())

# Display coefficients
print("\nCoefficients:")
print(result.params)

import seaborn as sns

# lmplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (one would only leave an empty extra figure
# behind). `data` is passed by keyword, which works whether or not the
# installed seaborn accepts it positionally.
# One panel per (Model ID, variable) pair; jitter separates overlapping points.
fig = sns.lmplot(
    data=joint_consistency_df,
    x="agreement_score_human",
    y="agreement_score_ai",
    hue="variable",
    row="Model ID",
    col="variable",
    y_jitter=0.02,
    x_jitter=0.02,
)
fig.savefig("imgs/ai_consistency_by_human_consistency.pdf", bbox_inches="tight")


# BERTScore

In [None]:
from bench_lib.evaluation import bertscore_alignment, plot_alignment_table

# Read the labels from "model_labels.csv" (path relative to the notebook's
# working directory), compute the BERTScore alignment and show it as a table
# figure.
df = pd.read_csv("model_labels.csv")
bertscore_df = bertscore_alignment(df)
fig = plot_alignment_table(bertscore_df)
fig