In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [None]:
# Load the raw human annotations. post_id is read as a string so the IDs are
# kept verbatim instead of being parsed as numbers.
human_labels_raw = pd.read_csv(
    "../dataset/saxony-labeled-data/human-labels.csv", dtype={"post_id": str}
)

# One label set is kept per (post, annotator) pair: the most recent one.
label_key = ["post_id", "classification_by"]

human_labels = (
    human_labels_raw.rename(
        # The column name was wrong while saving, but the UI correctly displayed
        # that it does not HAVE to be politics in saxony.
        columns={
            "is_saxony_election": "is_saxony",
            "is_saxony_election_comment": "is_saxony_comment",
        }
    )
    # groupby() used to drop rows with a missing key implicitly; keep that.
    .dropna(subset=label_key)
    # Stable sort so that rows sharing a timestamp keep their file order.
    .sort_values(by="timestamp", kind="stable")
    # Keep the latest *row* per key. GroupBy.last() is not equivalent: it takes
    # the last non-null value of each column separately, so a re-submission
    # with an empty comment/answer would inherit the value of an older one.
    .drop_duplicates(subset=label_key, keep="last")
    # Same row order that groupby(...).last().reset_index() produced.
    .sort_values(by=label_key)
    .reset_index(drop=True)
)
print(len(human_labels))
human_labels.head()


In [4]:
from evaluation_lib import difficulty_score, get_means


# Label columns evaluated throughout the notebook.
questions = [
    "is_political",
    "is_saxony",
    "is_intolerant",
    "is_hedonic_entertainment",
    "is_eudaimonic_entertainment",
]
# Each question has a companion free-text column named "<question>_comment".
comment_cols = [name + "_comment" for name in questions]

# Result of get_means for every question, keyed by question name.
q_mens = {name: get_means(human_labels, question=name) for name in questions}

# Combine the per-question results into one frame (one column per question)
# and hand it to difficulty_score.
means_by_question = pd.DataFrame(q_mens)
difficulty = difficulty_score(means_by_question)

In [None]:
from matplotlib import pyplot as plt


# How often each distinct difficulty score occurs.
diff_counts = difficulty.value_counts()

# Explicit figure/axes so the plot carries its own labels and can be read
# without the surrounding code.
fig, ax = plt.subplots()
# width is given in x-axis units, i.e. in units of the difficulty score.
ax.bar(diff_counts.index, diff_counts.values, width=0.03)
ax.set_xlabel("Difficulty score")
ax.set_ylabel("Count")
ax.set_title("Distribution of difficulty scores")
plt.show()

In [None]:
# For every post: number of distinct answers given per question, then the
# maximum of those counts across all questions.
# NOTE(review): nunique() ignores missing values by default, so a missing
# answer does not count as a disagreement here - confirm this is intended.
distinct_answers = human_labels.groupby("post_id")[questions].nunique()
unique_answers_by_post = distinct_answers.max(axis=1)

# "Hard": at least one question received more than one distinct answer.
hard_posts = unique_answers_by_post.loc[unique_answers_by_post.gt(1)]
# "Easy": no question received more than one distinct answer.
easy_posts = unique_answers_by_post.loc[unique_answers_by_post.eq(1)]
print(len(hard_posts), len(easy_posts))

# How does AI perform on the easy posts?
Here we treat the human labels as the ground truth. On the easy posts they are unambiguous by construction: no question received more than one distinct answer.

In [None]:
from evaluation_lib import load_ai_labels, performance_by_category


# Reference answers for the easy posts. Identical label rows from different
# annotators collapse into a single row per post.
human_labels_easy_set = human_labels.query("post_id in @easy_posts.index")[
    ["post_id", *questions]
].drop_duplicates()
# Guard the "one reference row per post" invariant the evaluation relies on.
# It can break when one annotator left an answer empty that another filled in,
# because such rows are not exact duplicates of each other.
assert human_labels_easy_set["post_id"].is_unique, (
    "easy set contains more than one distinct label row for some post_id"
)

models = ["gemma-3-4b-it", "gemma-3-12b-it", "gemma-3-27b-it", "MiniCPM-V-2.6"]
all_ai_labels = load_ai_labels(models, questions, comment_cols)

# Evaluate every model separately against the reference answers.
all_ai_perfs = (
    all_ai_labels.groupby("model_id")
    .apply(
        performance_by_category,
        ref_wide=human_labels_easy_set,
        questions=questions,
        include_groups=False,  # do not pass the grouping column to the callback
    )
    .reset_index()
    # "level_1" is the unnamed inner index level of the apply() result.
    .drop(columns=["level_1"])
)

In [None]:
import plotly.express as px

# Grouped bars: one group per value of "variable", one bar per model.
perf_fig = px.bar(
    all_ai_perfs,
    x="variable",
    y="is_correct",
    color="model_id",
    barmode="group",
)
perf_fig

In [None]:
from evaluation_lib import join_wides


# Widen the DataFrame display so the joined frame is not truncated.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", None)

# Model and question to inspect in detail.
model_id = "MiniCPM-V-2.6"
q_col = "is_hedonic_entertainment"

# Join this model's labels with the human reference answers.
selected_model_labels = all_ai_labels.query("model_id == @model_id")
joined = join_wides(selected_model_labels, human_labels_easy_set, questions)

# Rows for the chosen question that are not marked as correct.
ai_bad = joined.query("not is_correct and variable == @q_col")
ai_bad.head()


In [None]:
from evaluation_lib import to_binary_series

from evaluation_lib import plot_question_hists

def _value_shares(series):
    """Return (distinct values, relative frequencies) of ``series``."""
    shares = series.value_counts(normalize=True)
    return shares.index, shares.values


# Human side: relative frequency of each value in the per-question results.
human_hist_data = {}
for question, means in q_mens.items():
    human_hist_data[question] = _value_shares(means)

# AI side: the same statistic for the answers of the selected model, after
# passing them through to_binary_series.
model_answers = all_ai_labels.query("model_id == @model_id")
ai_hist_data = {}
for question in questions:
    ai_hist_data[question] = _value_shares(
        to_binary_series(model_answers[question])
    )

plot_question_hists(ai_hist_data, title=model_id)
plot_question_hists(human_hist_data, title="Human")

In [None]:
# Print the selected model's comment and answer for every post that appears
# in ai_bad, one blank-line-separated record per post.
q_col_comment = q_col + "_comment"
bad_rows = all_ai_labels.query(
    "model_id == @model_id and post_id.isin(@ai_bad.post_id)"
)[["post_id", q_col_comment, q_col]]

for post_id, comment, answer in bad_rows.itertuples(index=False, name=None):
    # The trailing empty string produces the blank separator line.
    print(post_id, comment, answer, "", sep="\n")
