In [25]:
import json
import itertools
from typing import List, Dict, Set
from pathlib import Path
from collections import defaultdict
from sklearn.metrics import cohen_kappa_score

### Per-Label Cohen's Kappa Coefficient

In [2]:
# Every label an annotator may assign; the per-label kappa is computed once
# for each entry, in this order.
all_labels = [
    "ok",
    "revise text info",
    "revise text structure",
    "revise picture",
]

In [3]:
def load_data(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

In [4]:
def extract_annotations_per_sample(data):
    """Group the labels of every sample by annotator.

    Args:
        data: Iterable of sample entries. Each entry is read with
            ``entry.get("labels", [])``; every label record must provide the
            keys ``"sentiment"`` and ``"annotator"``.

    Returns:
        A list with one ``defaultdict(set)`` per entry, mapping annotator ->
        set of labels that annotator chose for the sample.
    """

    def _group_by_annotator(entry):
        # One mapping per sample: annotator -> set of chosen labels.
        chosen_by = defaultdict(set)
        for record in entry.get("labels", []):
            choice = record["sentiment"]
            who = record["annotator"]
            if isinstance(choice, str):
                # Single-choice answer stored as a bare string.
                chosen_by[who].add(choice)
                continue
            if isinstance(choice, dict) and "choices" in choice:
                # Multi-choice answer stored as {"choices": [...]}.
                chosen_by[who].update(choice["choices"])
            # Any other payload is ignored and does not register the annotator.
        return chosen_by

    return [_group_by_annotator(entry) for entry in data]

In [5]:
def build_binary_matrix(samples, all_labels):
    """Turn per-sample annotations into paired 0/1 vectors, one pair per label.

    Args:
        samples: Iterable of mappings annotator -> set of labels.
        all_labels: Labels for which a pair of vectors is produced.

    Returns:
        Dict mapping each label to a tuple ``(first, second)`` of equally long
        lists. Position ``i`` holds 1 if the respective annotator used the
        label on the i-th retained sample and 0 otherwise.
    """
    matrix = {}
    for name in all_labels:
        matrix[name] = ([], [])

    for votes in samples:
        # Only samples rated by exactly two annotators can be paired.
        if len(votes) == 2:
            # Sorting the annotator keys fixes who fills which vector.
            first, second = sorted(votes)
            for name in all_labels:
                first_vector, second_vector = matrix[name]
                first_vector.append(int(name in votes[first]))
                second_vector.append(int(name in votes[second]))

    return matrix

In [6]:

def calculate_per_label_kappa(per_label_matrix):
    """Compute Cohen's kappa for every label's pair of rating vectors.

    Args:
        per_label_matrix: Dict mapping label -> ``(ratings_a, ratings_b)``.

    Returns:
        Dict mapping label -> kappa as returned by ``cohen_kappa_score``, or
        ``None`` when the two vectors together hold at most one distinct value
        (no variability).
    """
    results = {}
    for label, ratings in per_label_matrix.items():
        first, second = ratings
        has_variability = len(set(first + second)) > 1
        results[label] = cohen_kappa_score(first, second) if has_variability else None
    return results

### Run to see Results

In [7]:
# Pipeline: load the export, group labels per annotator, build the paired
# 0/1 vectors and score each label.
data = load_data("merged_output.json")  # Replace with your actual file
samples = extract_annotations_per_sample(data)
matrix = build_binary_matrix(samples, all_labels)
kappa_scores = calculate_per_label_kappa(matrix)

# Display results; a score of None means kappa could not be computed.
for label, score in kappa_scores.items():
    print(f"{label}: {'undefined (no variance)' if score is None else score}")

ok: 0.524390243902439
revise text info: 0.5575605979393723
revise text structure: 0.6059624108878807
revise picture: 0.18575029852726554


### Krippendorff’s Alpha with Jaccard Distance


In [18]:
# Load data from a JSON file
def load_annotations(file_path: str) -> List[List[Set[str]]]:
    """Load the annotation export and return one list of label sets per item.

    Args:
        file_path: Path of a UTF-8 encoded JSON file holding a list of items.

    Returns:
        A list with one entry per item, in file order. Each entry is a list
        holding one set of labels per record in the item's ``"labels"`` list.
        An item without a ``"labels"`` key yields an empty list instead of
        raising ``KeyError``, matching ``entry.get("labels", [])`` in the
        per-label kappa code.

    Raises:
        KeyError: If a label record has no ``"sentiment"`` key.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    annotations = []
    for item in data:
        label_sets = []
        # Missing "labels" means nobody annotated the item.
        for label_entry in item.get("labels", []):
            sentiment = label_entry["sentiment"]
            if isinstance(sentiment, dict) and "choices" in sentiment:
                # Multi-choice answer stored as {"choices": [...]}.
                labels = set(sentiment["choices"])
            else:
                # Single answer: wrap the value in a one-element set.
                labels = {sentiment}
            label_sets.append(labels)
        annotations.append(label_sets)
    return annotations

In [19]:
# Jaccard distance between two sets
def jaccard_distance(set1: Set[str], set2: Set[str]) -> float:
    """Return ``1 - |set1 & set2| / |set1 | set2|``.

    Two empty sets are defined to have distance 0.0 (no disagreement), which
    also avoids a division by zero.
    """
    if not (set1 or set2):
        return 0.0
    shared = set1 & set2
    combined = set1 | set2
    return 1 - len(shared) / len(combined)

In [20]:
# Observed disagreement (Do)
def observed_disagreement(units: List[List[Set[str]]]) -> float:
    """Krippendorff's observed disagreement D_o under Jaccard distance.

    Each unit with ``m_u >= 2`` annotations contributes the distances of its
    ordered annotation pairs weighted by ``1 / (m_u - 1)``; the sum is divided
    by the number of pairable annotations ``n = sum(m_u)``. This weighting
    keeps units with many annotators from dominating the result. When every
    unit has exactly two annotations the value equals the plain mean of the
    per-unit distances.

    Args:
        units: One list of label sets per unit (item).

    Returns:
        D_o, or 0.0 when no unit has at least two annotations.
    """
    weighted_total = 0.0
    pairable_values = 0
    for unit in units:
        m_u = len(unit)
        if m_u < 2:
            continue  # A single annotation cannot be paired with anything.
        unit_total = 0.0
        for a, b in itertools.combinations(unit, 2):
            unit_total += jaccard_distance(a, b)
        # Factor 2: each unordered pair stands for two ordered pairs.
        weighted_total += 2.0 * unit_total / (m_u - 1)
        pairable_values += m_u
    return weighted_total / pairable_values if pairable_values > 0 else 0.0


In [21]:
# Expected disagreement (De)
def expected_disagreement(units: List[List[Set[str]]]) -> float:
    """Krippendorff's expected disagreement D_e under Jaccard distance.

    Computes the mean Jaccard distance over all unordered pairs of pairable
    annotations, pooled across units. Annotations from units with fewer than
    two annotations are not pairable and are left out, so D_e is based on the
    same annotations that can contribute to the observed disagreement.

    Args:
        units: One list of label sets per unit (item).

    Returns:
        D_e, or 0.0 when fewer than two pairable annotations exist.
    """
    pairable = [
        annotation
        for unit in units
        if len(unit) >= 2
        for annotation in unit
    ]
    total_disagreement = 0.0
    count = 0
    for a, b in itertools.combinations(pairable, 2):
        total_disagreement += jaccard_distance(a, b)
        count += 1
    return total_disagreement / count if count > 0 else 0.0


In [22]:
# Krippendorff's Alpha using Jaccard distance
def krippendorffs_alpha_jaccard(units: List[List[Set[str]]]) -> float:
    """Return ``1 - D_o / D_e`` for the given units.

    D_o comes from ``observed_disagreement`` and D_e from
    ``expected_disagreement``. When D_e is zero the function reports 1.0
    (treated as perfect agreement) instead of dividing by zero.
    """
    observed = observed_disagreement(units)
    expected = expected_disagreement(units)
    if expected == 0:
        return 1.0
    return 1 - observed / expected


### Run to see Results

In [24]:
# Score the whole export: one unit per item, one label set per annotation.
file_path = "merged_output.json"
units = load_annotations(file_path)
alpha = krippendorffs_alpha_jaccard(units)
print("The krippendorff's alpha: {}".format(alpha))

The krippendorff's alpha: 0.47767136229386586
