In [None]:
# Mount Google Drive at /content/drive (requires the Colab-only `google.colab`
# package); the annotation files read by later cells are opened from paths
# under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
from sklearn.metrics import cohen_kappa_score

def load_llm_labels(path):
    """Load single-choice LLM annotations from a JSON results file.

    The file is expected to have the form
    ``{"results": [{"sample_id": ..., "choices": [label], ...}, ...]}``.

    Args:
        path: Location of the JSON file (read as UTF-8).

    Returns:
        dict mapping each ``sample_id`` to its single label. Entries that have
        no ``sample_id``, or whose ``choices`` is missing, null, not a list, or
        does not hold exactly one element, are treated as malformed and
        skipped. If a ``sample_id`` occurs more than once, the last valid
        occurrence wins.

    Raises:
        KeyError: if the top-level ``"results"`` key is absent.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    labels = {}
    for item in data["results"]:
        choices = item.get("choices")

        # Skip malformed entries instead of raising part-way through the file.
        if "sample_id" not in item:
            continue
        if not isinstance(choices, list) or len(choices) != 1:
            continue

        labels[item["sample_id"]] = choices[0]

    return labels

# Annotation files written by each model.
MISTRAL_JSON = "/content/drive/MyDrive/nlp/output_mistral_annotations.json"
QWEN_JSON = "/content/drive/MyDrive/nlp/output_qwen.json"

mistral_labels = load_llm_labels(MISTRAL_JSON)
qwen_labels = load_llm_labels(QWEN_JSON)

# Only samples labelled by both models can be compared; sorting the shared
# ids gives both label lists the same, reproducible order.
common_ids = sorted(mistral_labels.keys() & qwen_labels.keys())

paired_labels = [(mistral_labels[key], qwen_labels[key]) for key in common_ids]
y_mistral = [m_label for m_label, _ in paired_labels]
y_qwen = [q_label for _, q_label in paired_labels]

# Unweighted Cohen's kappa between the two models
kappa = cohen_kappa_score(y_mistral, y_qwen)

print(f"Cohen’s κ (Mistral vs Qwen): {kappa:.3f}")


Cohen’s κ (Mistral vs Qwen): 0.111


In [None]:
import json
from sklearn.metrics import cohen_kappa_score

def load_llm_labels(path):
    """
    Loads files of the form:
    { "results": [ {sample_id, choices:[x], ...}, ... ] }
    Returns {sample_id: rating}

    The file is read as UTF-8. An entry is kept only when it has a
    "sample_id" key and its "choices" value is a list with exactly one
    element; every other entry (missing keys, null or non-list "choices",
    zero or several choices) is skipped. When a sample_id appears more than
    once, the last kept entry wins.

    Raises KeyError if the top-level "results" key is absent.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    labels = {}
    for item in data["results"]:
        choices = item.get("choices")
        # The isinstance check keeps a null "choices" from crashing len().
        if "sample_id" in item and isinstance(choices, list) and len(choices) == 1:
            labels[item["sample_id"]] = choices[0]
    return labels


In [None]:
# Annotation files, one per (dataset, model) pair.
MISTRAL_A_JSON = "/content/drive/MyDrive/nlp/mistral_annotated_results.json"
QWEN_A_JSON = "/content/drive/MyDrive/nlp/qwen_annotated_results.json"
MISTRAL_B_JSON = "/content/drive/MyDrive/nlp/output_mistral_annotations.json"
QWEN_B_JSON = "/content/drive/MyDrive/nlp/output_qwen.json"

# Dataset A: Human + LLM
mistral_A = load_llm_labels(MISTRAL_A_JSON)
qwen_A = load_llm_labels(QWEN_A_JSON)

# Dataset B: Synthetic-only
mistral_B = load_llm_labels(MISTRAL_B_JSON)
qwen_B = load_llm_labels(QWEN_B_JSON)


In [None]:
# Merge datasets A and B per model. Where a sample_id exists in both, the
# dataset-B label replaces the dataset-A label.
mistral_all = dict(mistral_A)
mistral_all.update(mistral_B)

qwen_all = dict(qwen_A)
qwen_all.update(qwen_B)

# Keep only ids labelled by both models, in sorted order so that the two
# label lists below line up element by element.
common_ids = sorted(mistral_all.keys() & qwen_all.keys())

print("Total combined examples:", len(common_ids))

y_mistral = [mistral_all[key] for key in common_ids]
y_qwen = [qwen_all[key] for key in common_ids]

Total combined examples: 3072


In [None]:
# Score the same pair of label lists twice: once with the default
# (unweighted) kappa and once with scikit-learn's "quadratic" weighting.
kappa = cohen_kappa_score(y_qwen, y_mistral)
kappa_weighted = cohen_kappa_score(y_qwen, y_mistral, weights="quadratic")

print(
    f"Cohen’s κ (Qwen vs Mistral, combined): {kappa:.3f}",
    f"Quadratic-weighted κ:                  {kappa_weighted:.3f}",
    sep="\n",
)


Cohen’s κ (Qwen vs Mistral, combined): 0.114
Quadratic-weighted κ:                  0.496


In [None]:
import json
import numpy as np
from collections import Counter

TRAIN_JSON = "/content/drive/MyDrive/nlp/train.json"
NUM_CLASSES = 5        # ratings 1–5
EXPECTED_RATERS = 5    # an item is used only if it has exactly this many ratings

with open(TRAIN_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the items x categories count table:
# ratings[i, c] = number of raters who gave item i the rating c + 1.
ratings = []
skipped = 0   # items without exactly EXPECTED_RATERS ratings
invalid = 0   # items holding a rating outside 1..NUM_CLASSES

for item in data.values():
    # `or []` also covers an explicit null, which would otherwise crash len().
    choices = item.get("choices") or []

    # Only keep items with exactly 5 human raters
    if len(choices) != EXPECTED_RATERS:
        skipped += 1
        continue

    counts = Counter(choices)
    row = [counts.get(c, 0) for c in range(1, NUM_CLASSES + 1)]

    # A rating outside 1..NUM_CLASSES is not counted in `row`, so the row
    # would sum to fewer than n while the normalisation below assumes n
    # ratings per item. Exclude such items rather than skew the statistic.
    if sum(row) != EXPECTED_RATERS:
        invalid += 1
        continue

    ratings.append(row)

# reshape keeps the array two-dimensional even when no item qualified.
ratings = np.array(ratings, dtype=int).reshape(-1, NUM_CLASSES)
N_items = ratings.shape[0]
n = EXPECTED_RATERS
k = NUM_CLASSES

print(f"Items used: {N_items}")
print(f"Items skipped: {skipped}")
if invalid:
    print(f"Items excluded for out-of-range ratings: {invalid}")

if N_items == 0:
    raise ValueError(
        f"No item in {TRAIN_JSON} has exactly {EXPECTED_RATERS} valid ratings; "
        "kappa cannot be computed."
    )

# QUADRATIC DISAGREEMENT WEIGHTS
# W[c, d] = (c - d)^2 / (k - 1)^2: 0 on the diagonal (identical ratings),
# 1 for the two extreme categories.
category_index = np.arange(k)
W = (category_index[:, None] - category_index[None, :]) ** 2 / (k - 1) ** 2

# OBSERVED DISAGREEMENT
# Sum over items of sum_{c,d} W[c, d] * n_ic * n_id. Because W is zero on the
# diagonal only pairs of different categories contribute. The sum is then
# normalised by the N * n * (n - 1) ordered rater pairs.
P_o = ((ratings @ W) * ratings).sum() / (N_items * n * (n - 1))

# EXPECTED DISAGREEMENT
# Same weighting applied to the pooled category proportions.
category_totals = ratings.sum(axis=0)
p = category_totals / (N_items * n)

P_e = p @ W @ p

# WEIGHTED FLEISS' KAPPA
# 1 - observed / expected disagreement. When every rating falls in a single
# category both terms are 0 and the ratio is undefined, so report NaN
# explicitly instead of relying on a 0/0 floating-point warning.
kappa_weighted = 1 - (P_o / P_e) if P_e > 0 else float("nan")

print(f"\nQuadratic-weighted Fleiss’ κ: {kappa_weighted:.4f}")


Items used: 2246
Items skipped: 34

Quadratic-weighted Fleiss’ κ: 0.5022
