In [1]:
import pandas as pd

In [None]:
# Load the human annotations. `post_id` is forced to str (presumably so that long numeric
# IDs are kept verbatim instead of being parsed as numbers — TODO confirm).
human_labels_raw = pd.read_csv("../dataset/saxony-labeled-data/human-labels.csv", dtype={"post_id": str})

# Keep exactly one row per (post, rater): the one with the latest timestamp.
#
# This deliberately does NOT use groupby(...).last(): that takes the last *non-null* value
# of every column separately, so an older submission's values (e.g. a comment) would be
# stitched onto the newest submission whenever the newest one left that field empty.
# drop_duplicates(keep="last") keeps the newest row as a whole.
label_keys = ["post_id", "classification_by"]
human_labels = (
    human_labels_raw
    # The column name was wrong while saving, but the UI correctly displayed that it does
    # not HAVE to be politics in saxony.
    .rename(columns={"is_saxony_election": "is_saxony", "is_saxony_election_comment": "is_saxony_comment"})
    # Rows without a key cannot be attributed to a post/rater (groupby dropped them too).
    .dropna(subset=label_keys)
    # Stable sort so that rows sharing a timestamp keep their file order.
    .sort_values(by="timestamp", kind="stable")
    .drop_duplicates(subset=label_keys, keep="last")
    # Same presentation as before: ordered by key, key columns first, fresh 0..n-1 index.
    .sort_values(by=label_keys, kind="stable")
    .pipe(lambda df: df[label_keys + [col for col in df.columns if col not in label_keys]])
    .reset_index(drop=True)
)
print(len(human_labels))
human_labels.head()


In [3]:
def to_binary_series(s: pd.Series) -> pd.Series:
    """Convert a Series of "yes"/"no" answers into 1/0.

    Args:
        s: Series whose values must all be exactly "yes" or "no".

    Returns:
        A Series with the same index as `s`, with "yes" mapped to 1 and "no" to 0.

    Raises:
        AssertionError: if any value (including a missing one) is neither "yes" nor
            "no". The message lists up to ten of the offending distinct values.
    """
    is_valid = s.isin(["yes", "no"])
    if not is_valid.all():
        # Raised explicitly instead of via `assert` so the check still runs under
        # `python -O` (where unmapped values would otherwise silently become NaN) and
        # says what was wrong. AssertionError is kept so existing behaviour is unchanged.
        unexpected = list(s[~is_valid].unique())
        raise AssertionError(f"expected only 'yes'/'no' answers, found: {unexpected[:10]}")
    return s.map({"yes": 1, "no": 0})

def get_means(labels_df: pd.DataFrame, question: str):
    """Average the binarised answers to one question for every post.

    Args:
        labels_df: frame with a "post_id" column and a `question` column holding
            "yes"/"no" answers (validated by `to_binary_series`).
        question: name of the answer column to average.

    Returns:
        Series named "question_binary", indexed by post_id, holding the mean of the
        0/1 answers of each post.
    """
    binary_answers = to_binary_series(labels_df[question])
    with_binary = labels_df.assign(question_binary=binary_answers)
    answers_by_post = with_binary.groupby("post_id")["question_binary"]
    return answers_by_post.mean()

# Answer columns analysed throughout the notebook.
questions = [
    "is_political",
    "is_saxony",
    "is_intolerant",
    "is_hedonic_entertainment",
    "is_eudaimonic_entertainment",
]
# Each question has a companion "<question>_comment" column.
comment_cols = [question + "_comment" for question in questions]
# Per-post mean of the binarised human answers, keyed by question.
q_mens = dict((name, get_means(human_labels, question=name)) for name in questions)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# For each question: which per-post mean values occur, and in what share of the posts.
mean_value_shares = {
    question: means.value_counts(normalize=True) for question, means in q_mens.items()
}
# Stored as (x positions, bar heights) pairs, the format plot_question_hists expects.
human_hist_data = {
    question: (shares.index, shares.values) for question, shares in mean_value_shares.items()
}

def plot_question_hists(hist_data: dict[str, tuple[np.ndarray, np.ndarray]]):
    """Draw one small bar chart per question, side by side in a single row.

    Args:
        hist_data: maps a question name to an `(x_vals, y_vals)` pair; `x_vals` are the
            bar positions and `y_vals` the bar heights. The y-axis is fixed to [0, 1],
            so heights are expected to be fractions.

    Returns:
        None. The figure is left to the active matplotlib backend to display.
        (Nothing is returned on purpose: a returned figure would be rendered a second
        time when the call is the last expression of a notebook cell.)
    """
    n_panels = len(hist_data)
    if n_panels == 0:
        # Nothing to draw; plt.subplots cannot create a grid with zero columns.
        return

    # One panel per question instead of a fixed five, so no question is silently
    # dropped and no panel stays blank. 3.2 in per panel keeps the former 16 in
    # width for five questions. squeeze=False keeps `axes` 2-D even for one panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(3.2 * n_panels, 2.5), squeeze=False)

    for ax, (question, (x_vals, y_vals)) in zip(axes[0], hist_data.items()):
        ax.bar(x_vals, y_vals, width=0.05)
        ax.set_title(question)
        ax.set_ylabel("Fraction")
        ax.set_xlabel("No=0 or Yes=1")
        ax.set_ylim(0, 1)
        ax.grid(alpha=0.5)

    # tight_layout only adjusts for what exists at call time, so it has to run after
    # the titles and axis labels have been added.
    fig.tight_layout(pad=3.0)

# Show, per question, how the per-post mean human answers are distributed.
plot_question_hists(human_hist_data)

In [None]:
def difficulty_score(df: pd.DataFrame):
    """Score each row by how close its values are to 0.5.

    Every cell is scored as `1 - |value - 0.5| / 0.5`, so a value of 0.5 scores 1
    and a value of 0 or 1 scores 0. A row's score is the mean of its cell scores.

    Args:
        df: numeric frame; each row is scored across its columns.

    Returns:
        Series of row scores, sorted from highest to lowest.
    """
    distance_from_half = df.sub(0.5).abs()
    cell_scores = 1 - distance_from_half.div(0.5)
    row_scores = cell_scores.mean(axis=1)
    return row_scores.sort_values(ascending=False)

# One row per post, one column per question, holding the mean human answer.
mean_answers_by_post = pd.DataFrame(q_mens)
difficulty = difficulty_score(mean_answers_by_post)
difficulty

In [None]:
# Number of posts at each distinct difficulty score.
diff_counts = difficulty.value_counts()
print(diff_counts)

score_values = diff_counts.index
post_counts = diff_counts.values
plt.bar(x=score_values, height=post_counts, width=0.03)

In [None]:
# For every post: the largest number of distinct answers given to any one question.
distinct_answers = human_labels.groupby("post_id")[questions].nunique()
unique_answers_by_post = distinct_answers.max(axis=1)

# "Hard" posts have more than one distinct answer on at least one question;
# "easy" posts have exactly one answer on every question.
# NOTE(review): a post labelled by a single rater always lands in "easy" — confirm
# that this is intended.
hard_posts = unique_answers_by_post.loc[lambda n_answers: n_answers > 1]
easy_posts = unique_answers_by_post.loc[lambda n_answers: n_answers == 1]
print(len(hard_posts), len(easy_posts))

# How does AI perform on the easy posts?
Here we treat the human labels as the ground truth. On the easy posts they are unambiguous by construction: every question has exactly one distinct human answer.

In [None]:
# Alternative label file (currently disabled):
#   ../dataset/generated-ai-labels/model_labels_openbmb_MiniCPM-V-2_6_03_17_25.csv
# Disabled post-ID normalisation, presumably for sources whose IDs carry a prefix
# (TODO confirm):
#   ai_labels["post_id"] = ai_labels["post_id"].str[-19:]
ai_labels = pd.read_csv("../results-gemma-3-12b-it/model_labels.csv", dtype={"post_id": str})
print(len(ai_labels.columns))

# Keep only the ID, the answers and their comments; drop rows missing any answer.
ai_labels = ai_labels[["post_id", *questions, *comment_cols]]
complete_ai_labels = ai_labels.dropna(subset=questions)
print(len(complete_ai_labels))

# Human answers for the easy posts, collapsed to one row per distinct answer set.
is_easy_post = human_labels["post_id"].isin(easy_posts.index)
human_labels_easy_set = human_labels.loc[is_easy_post, ["post_id", *questions]].drop_duplicates()

# Long format (one row per post and question) so both sides can be matched per question.
human_long = human_labels_easy_set.melt(id_vars="post_id", value_vars=questions)
ai_long = complete_ai_labels.melt(id_vars="post_id", value_vars=questions)
joined = human_long.merge(
    ai_long,
    on=["post_id", "variable"],
    suffixes=("_human", "_ai"),
    how="inner",
)
joined = joined.assign(is_correct=joined["value_human"].eq(joined["value_ai"]))

# Share of matching answers per question.
joined.groupby("variable")["is_correct"].mean()

In [None]:
# Show wide frames without truncation: up to 100 columns, no fixed display width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", None)

# Notes from looking at individual failures in detail:
#
# is_saxony — why so bad (pretty much random)?
#   The model always erred on "yes" while the human labels were "no". It mixes up
#   Germany with Saxony, so it answers "yes" whenever it finds a mention of, or an
#   association with, Germany.
#
# hedonic_entertainment — why so bad?
#   The model does not seem to code music (most likely because it cannot process audio!).
is_wrong = ~joined["is_correct"]
is_selected_question = joined["variable"].eq("is_intolerant")
ai_bad = joined[is_wrong & is_selected_question]
ai_bad


In [None]:
# Share of 0 ("no") and 1 ("yes") answers the model gave to each question.
ai_answer_shares = {
    question: to_binary_series(complete_ai_labels[question]).value_counts(normalize=True)
    for question in questions
}
ai_hist_data = {
    question: (shares.index, shares.values) for question, shares in ai_answer_shares.items()
}

# Model distribution first, human distribution second, for visual comparison.
plot_question_hists(ai_hist_data)
plot_question_hists(human_hist_data)

In [None]:
# Print the model's explanation and answer for every post listed in `ai_bad`.
q_col = "is_intolerant"
q_col_comment = f"{q_col}_comment"

in_failures = complete_ai_labels["post_id"].isin(ai_bad["post_id"])
failed_rows = complete_ai_labels.loc[in_failures, ["post_id", q_col_comment, q_col]]

for failed_id, model_comment, model_answer in zip(
    failed_rows["post_id"], failed_rows[q_col_comment], failed_rows[q_col]
):
    print(failed_id)
    print(model_comment)
    print(model_answer)
    print()
