In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [None]:
import os
from pathlib import Path


# The labelled data lives below the directory named by the DSS_HOME environment variable.
fpath = (
    Path(os.environ["DSS_HOME"])
    / "toxicainment/2025-02-07-saxony-labeled-data/human-labels.csv"
)
# One label submission is identified by the post and the rater who classified it.
label_keys = ["post_id", "classification_by"]

# post_id is read as a string (presumably to keep the ids exact -- confirm).
human_labels = pd.read_csv(fpath, dtype={"post_id": str})
human_labels = (
    # The column name was wrong while saving, but the UI correctly displayed
    # that it does not HAVE to be politics in saxony.
    human_labels.rename(
        columns={
            "is_saxony_election": "is_saxony",
            "is_saxony_election_comment": "is_saxony_comment",
        }
    )
    # Rows without a post or rater cannot be attributed (groupby dropped them too).
    .dropna(subset=label_keys)
    # mergesort is stable, so ties in the timestamp keep their file order.
    .sort_values(by="timestamp", kind="mergesort")
    # Keep the most recent submission per (post, rater) as a whole row.
    # GroupBy.last() would instead pick the last *non-null* value column by
    # column and could mix fields from different submissions.
    .drop_duplicates(subset=label_keys, keep="last")
    .sort_values(by=label_keys)
    .reset_index(drop=True)
    # Same column layout as before: key columns first, then the rest.
    .pipe(
        lambda df: df[label_keys + [c for c in df.columns if c not in label_keys]]
    )
)
print(len(human_labels))
# join num raters per post
nraters_by_post = (
    human_labels.groupby("post_id")["classification_by"]
    .nunique()
    .rename("nraters")
    .reset_index()
)
human_labels = pd.merge(human_labels, nraters_by_post, on="post_id", how="left")
human_labels.head()

In [4]:
# Label columns analysed throughout the notebook, one per annotation question.
questions = [
    "is_political",
    "is_saxony",
    "is_intolerant",
    "is_hedonic_entertainment",
    "is_eudaimonic_entertainment",
]
# Companion column of every question: the question name plus a "_comment" suffix.
comment_cols = [question + "_comment" for question in questions]


# Dataset Balance

In [None]:
# Relative frequency of every answer, computed separately for each question
# column and rounded to three decimals.
label_balance = (
    human_labels.loc[:, questions]
    .apply(pd.Series.value_counts, normalize=True)
    .round(3)
)
label_balance


In [None]:
from bench_lib.evaluation import plot_scalars_for_questions

# The column-wise maximum of the balance table is the share of the most
# frequent answer for each question.
plot_scalars_for_questions(
    label_balance.max(),
    questions,
    "Majority Class %",
)

# Percentage Agreement


In [None]:
def compute_agreement(human_labels: pd.DataFrame, question: str) -> float:
    """Return the share of posts on which all raters gave the same label.

    Args:
        human_labels: Long-format labels, one row per rating. Must contain a
            ``post_id`` column and a column named ``question``.
        question: Name of the label column to evaluate.

    Returns:
        Fraction (between 0 and 1) of posts whose non-missing labels for
        ``question`` are all identical. Posts with a single rating count as
        agreeing. ``nan`` when no post has a label for ``question``.
    """
    # Rows: posts, columns: label values, cells: how often the label was given.
    label_counts = pd.crosstab(human_labels["post_id"], human_labels[question])
    # Full agreement means exactly one distinct label was used for the post.
    # Counting the used labels, rather than indexing "yes"/"no" columns, also
    # works when a label never occurs or when further label values exist.
    full_agreement = label_counts.gt(0).sum(axis=1).eq(1)
    return float(full_agreement.mean())


# Share of fully agreed posts per question, in the order of `questions`.
agreements = [
    compute_agreement(human_labels, question) for question in questions
]
print(agreements)
plot_scalars_for_questions(
    agreements,
    questions,
    "Percentage Agreement",
    x_reversed=True,
)

# Krippendorff's alpha

In [None]:
import matplotlib.pyplot as plt

from bench_lib.evaluation import krippendorf_alpha


# One alpha value per question, computed from the post ids and the labels
# given for that question.
alphas = [
    krippendorf_alpha(human_labels["post_id"], human_labels[question])
    for question in questions
]
fig = plot_scalars_for_questions(
    alphas,
    questions,
    "Krippendorff's alpha",
    x_reversed=True,
)
# Uncomment to export the figure:
# fig.savefig("imgs/krippendorffs_alpha.pdf")
fig

# Fleiss' Kappa

In [None]:
from bench_lib.evaluation import fleiss_kappa

n_raters = 3
q = "is_intolerant"
# Only posts labelled by exactly `n_raters` distinct raters are used
# (presumably so that every row of the count table has the same total -- confirm).
fk_human_labels = human_labels.query("nraters == @n_raters")
# Count table: one row per post, one column per answer to question `q`.
table = pd.crosstab(fk_human_labels["post_id"], fk_human_labels[q]).values
# The second return value gets its own name so it does not overwrite the
# per-question `agreements` list computed in the percentage-agreement cell.
# NOTE(review): presumably one agreement value per post -- confirm in bench_lib.
kp, fleiss_agreements = fleiss_kappa(table, method="fleiss")
ag_counts = pd.Series(fleiss_agreements).value_counts()
print(kp)
fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(ag_counts.index, ag_counts.values, width=0.05)
ax.set_xlim(-0.1, 1.1)
ax.set_xlabel("Agreement")
ax.set_ylabel("Count")
ax.set_title(f"Agreement for '{q}' with {n_raters} raters. Fleiss Kappa: {kp:.2f}")
fig.tight_layout()
plt.show()

# Human Consistency

In [None]:
# Long table with one row per (post, question): `human_consistent` is True
# when the post has exactly one distinct (non-missing) human answer for that
# question.
human_constistent_df = (
    human_labels.groupby("post_id")[questions]
    .nunique()
    .eq(1)
    .reset_index()
    .melt(
        id_vars="post_id",
        value_vars=questions,
        value_name="human_consistent",
    )
)
human_constistent_df

# How does AI perform on the easy posts?
Easy posts are assumed to be those on which the human raters are consistent.
The ground truth is taken to be the human label, which is unique for these posts by construction.

In [None]:
# One row per (post, rating, question); the question name ends up in `variable`
# and the answer in `value`.
human_labels_long = pd.melt(human_labels, id_vars="post_id", value_vars=questions)
ground_truth_long = pd.merge(
    # Missing answers are ignored when consistency is determined (nunique skips
    # NaN), so they must not show up as an extra "ground truth" row either.
    human_labels_long.dropna(subset=["value"]),
    human_constistent_df.query("human_consistent"),
    on=["post_id", "variable"],
).drop_duplicates()
# Consistent posts have a single distinct answer, hence exactly one row each.
assert not ground_truth_long.duplicated(subset=["post_id", "variable"]).any()
ground_truth_long

In [None]:
from bench_lib.evaluation import load_ai_labels, compute_ai_perfs


# Result folders to load: three Gemma 3 sizes plus Gemini 2.0 Flash.
gemma3_folders = [
    "gemma-3-4b-it",
    "gemma-3-12b-it",
    "gemma-3-27b-it_00",
    "gemini-2.0-flash-001",
]
gemma3_ai_labels_long = load_ai_labels(
    folders=gemma3_folders,
    questions=questions,
    comment_cols=comment_cols,
    long=True,
)
# Score the AI labels against the human ground truth of the consistent posts.
gemma3_ai_perfs = compute_ai_perfs(
    ground_truth_long,
    gemma3_ai_labels_long,
    questions,
)

In [None]:
# Number of ground-truth rows available for each question.
gt_sizes = ground_truth_long.groupby("variable").size().reset_index(name="size")
plot_scalars_for_questions(
    gt_sizes["size"],
    gt_sizes["variable"],
    "Ground truth size",
)

In [None]:
from pathlib import Path

from bench_lib.evaluation import plot_ai_perfs


# Model ids in plotting order: the three Gemma 3 sizes, then Gemini.
g3_order = [f"google/gemma-3-{n}b-it" for n in [4, 12, 27]] + [
    "google/gemini-2.0-flash-001"
]
fig = plot_ai_perfs(gemma3_ai_perfs, g3_order, list(reversed(questions)))
# Create the output directory first; savefig raises FileNotFoundError when
# the target directory is missing (e.g. on a fresh checkout).
g3_fig_path = Path("imgs/gemma3_ai_perfs.pdf")
g3_fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(g3_fig_path, bbox_inches="tight")
fig

In [None]:
# Load the Qwen and Gemini results and keep only rows whose model id mentions
# "Qwen" or "gemini". A boolean mask is used instead of
# `query(..., inplace=True)`: it does not mutate the loaded frame in place and
# does not depend on the query engine supporting `.str` methods. `na=False`
# drops rows without a model id instead of failing on them.
qwen_ai_labels = load_ai_labels(
    folders=["qwen-2.5-vl", "gemini-2.0-flash-001"],
    questions=questions,
    comment_cols=comment_cols,
    long=True,
).loc[lambda labels: labels["Model ID"].str.contains("Qwen|gemini", na=False)]
qwen_ai_perfs = compute_ai_perfs(ground_truth_long, qwen_ai_labels, questions)
# Model ids in plotting order: the three Qwen sizes, then Gemini.
order = [f"Qwen/Qwen2.5-VL-{n}B-Instruct" for n in [3, 7, 72]] + [
    "google/gemini-2.0-flash-001"
]
plot_ai_perfs(qwen_ai_perfs, order, questions)

In [None]:
# Plotting order for all models, interleaved by the size given in the model
# names, with Gemini last.
full_order = [
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "google/gemma-3-4b-it",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "google/gemma-3-12b-it",
    "google/gemma-3-27b-it",
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "google/gemini-2.0-flash-001",
]
# Gemini was loaded in both runs above, so its rows occur twice after stacking.
full_ai_perfs = pd.concat(
    [gemma3_ai_perfs, qwen_ai_perfs],
).drop_duplicates()
plot_ai_perfs(full_ai_perfs, full_order, questions)

# How does AI perform on the hard posts?
The hard posts cannot be evaluated against a single-label ground truth like the easy posts.
Instead, we compare how often the AI changes its mind, i.e. how self-consistent its answers are.

In [None]:
# Three result folders of the same model: gemma-3-27b-it_00 ... gemma-3-27b-it_02.
folders = [f"gemma-3-27b-it_{run:02d}" for run in range(3)]
ai_labels = load_ai_labels(
    folders=folders,
    questions=questions,
    comment_cols=comment_cols,
)

In [None]:
# Build the table before summarising it: the summary used to sit in a cell
# above this definition and failed with NameError on a fresh kernel.
#
# Per (post, question): number of AI answers across the loaded runs and
# number of distinct answers; the AI is consistent when all answers agree.
ai_answers_hard = (
    pd.melt(ai_labels, id_vars="post_id", value_vars=questions)
    .groupby(["post_id", "variable"], as_index=False)
    .agg(n_answers=("value", "count"), n_unique=("value", "nunique"))
    .assign(ai_consistent=lambda df: df["n_unique"] == 1)
)
# Attach the human consistency flag of the same (post, question).
ai_answers_hard = pd.merge(
    ai_answers_hard, human_constistent_df, on=["post_id", "variable"]
)
display(ai_answers_hard)

# Share of consistent (post, question) pairs per question, for AI and humans.
ai_answers_hard.groupby("variable", as_index=False).agg(
    ai_consistent=("ai_consistent", "mean"),
    human_consistent=("human_consistent", "mean"),
)

In [None]:
# Within each (question, human_consistent) group: relative frequency of the
# AI being consistent vs. inconsistent.
(
    ai_answers_hard
    .groupby(["variable", "human_consistent"])["ai_consistent"]
    .value_counts(normalize=True)
)

# Regression
* Detecting hedonic entertainment and intolerance significantly decreases the self-consistency of the AI
* Against my expectations, there is no significant effect of human consistency on AI consistency (the hypothesis was that humans are inconsistent on difficult posts, and so will be the AI)


In [None]:
# Logistic regression of AI consistency on the question (`variable`) and on
# whether the human raters were consistent for the same post and question.
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

# Prepare the regression frame: the question name becomes a categorical
# column, both consistency flags become 0/1 integers.
model_data = ai_answers_hard.astype(
    {
        "variable": "category",
        "human_consistent": int,
        "ai_consistent": int,
    }
)

# Fit the model; both predictors enter the formula as categorical terms.
logit_model = smf.logit(
    formula="ai_consistent ~ C(variable) + C(human_consistent)",
    data=model_data,
)
result = logit_model.fit()
print("Logistic Regression Results:")
print(result.summary())

# Exponentiated coefficients are the odds ratios.
print("\nOdds Ratios:")
print(np.exp(result.params))

# When is AI inconsistent?

In [None]:
# Position (among the AI-inconsistent rows) of the example to inspect.
idx = 2
row = ai_answers_hard.loc[~ai_answers_hard["ai_consistent"]].iloc[idx]
# All AI answers and comments for that post and question, one dict per row.
ai_labels.loc[
    ai_labels["post_id"] == row["post_id"],
    [row["variable"], f"{row['variable']}_comment"],
].to_dict(orient="records")