In [1]:
from sklearn.metrics import cohen_kappa_score
import numpy as np
from itertools import combinations

### Binary Vector Kappa Coefficient
**Per-Label Cohen's Kappa**:

Let:
- *Po* = Observed agreement (proportion of items where annotators agree on presence/absence of the label)
- *Pe* = Expected agreement by chance

Then:

$$
\kappa = \frac{P_o - P_e}{1 - P_e}
$$


1. It is computed separately for each label.  
2. It does not compare the two annotators' complete label sets; each label only gets its own score.  


In [2]:
#Test
# Convert to binary vectors
def to_binary_matrix(annotation_list, label_set):
    """
    Convert per-sample label collections into a binary indicator matrix.

    annotation_list: iterable of label collections, one entry per sample
    label_set: iterable of all possible labels; its iteration order defines
        the column order, so pass an ordered sequence (e.g. a list) when the
        column positions matter

    Returns a 2-D integer numpy array of shape (n_samples, n_labels) whose
    entry [s, j] is 1 if label j is contained in sample s, else 0. The result
    stays 2-D even when annotation_list is empty, so column slicing such as
    matrix[:, j] always works.
    """
    # Materialise once: a one-shot iterator would otherwise be exhausted
    # after building the first row.
    labels = list(label_set)
    binary_matrix = [
        [1 if label in sample_labels else 0 for label in labels]
        for sample_labels in annotation_list
    ]
    # Explicit reshape: np.array([]) alone would be 1-D with shape (0,)
    return np.array(binary_matrix, dtype=int).reshape(len(binary_matrix), len(labels))

In [3]:
# Label vocabulary; its order fixes the column order of the binary matrices
all_labels = ["ok", "revise text structure", "revise picture"]

# Toy data: the label lists chosen by two annotators for the same 3 samples
annotations_annotator_1 = [["ok"], ["revise picture", "revise text structure"], ["ok"]]
annotations_annotator_2 = [["ok"], ["revise picture"], ["revise text structure"]]

# One row per sample, one 0/1 column per label
A1, A2 = (
    to_binary_matrix(annotations, all_labels)
    for annotations in (annotations_annotator_1, annotations_annotator_2)
)

# Cohen's kappa is computed label by label, i.e. column by column
for label, column_1, column_2 in zip(all_labels, A1.T, A2.T):
    kappa = cohen_kappa_score(column_1, column_2)
    print(f"Kappa for label '{label}': {kappa:.4f}")


Kappa for label 'ok': 0.4000
Kappa for label 'revise text structure': -0.5000
Kappa for label 'revise picture': 1.0000


### Jaccard Index

**Jaccard Index Formula**:

Let:
- *A* = Annotation set by User1
- *B* = Annotation set by User2
  
Then:

$$
J(A, B) = \frac{|A \cap B|}{|A \cup B|}
$$

This measures the overlap between two annotation sets. It does not correct for chance agreement. Unlike the per-label kappa coefficient, it gives a single measure for a pair of annotation sets rather than one value per label.  

### Krippendorff’s Alpha with Jaccard Distance

**Jaccard Similarity** (same as Jaccard Index):

$$
\text{Jaccard}(A, B) = \frac{|A \cap B|}{|A \cup B|}
$$


**Krippendorff's Alpha**:

Let:
- $D_o$ = Observed disagreement
- $D_e$ = Expected disagreement by chance
- $\delta(a, b)$ = Distance function between annotations $a$ and $b$

Then:

$$
\alpha = 1 - \frac{D_o}{D_e}
$$

Where:

$$
D_o = \frac{\sum_{i} \sum_{j > i} w_{ij} \cdot \delta(a_i, a_j)^2}{\sum_{i} \sum_{j > i} w_{ij}}
$$

$$
D_e = \text{Expected disagreement based on observed label distributions}
$$


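1. It gives a single measure for two sets of annotations (not one value per label).  
2. It accounts for both the overlap between label sets (via the Jaccard distance) and agreement expected by chance (via $D_e$).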


In [None]:
def jaccard_distance(a, b):
    """
    Jaccard distance between two label sets: 1 - |a & b| / |a | b|.

    Two empty sets are treated as identical and yield 0.0.
    """
    if a or b:
        shared = len(a & b)
        combined = len(a | b)
        return 1 - shared / combined
    # Both empty: the ratio would be 0/0, so define the distance as zero
    return 0.0

In [9]:
# Krippendorff's alpha with custom distance
def krippendorff_alpha(data, distance_metric):
    """
    Computes Krippendorff's alpha using a custom distance metric.

    data: list of lists, [annotator][item]; use None for a missing annotation
    distance_metric: callable (a, b) -> distance between two annotations;
        the squared distance is used as the disagreement of a pair

    Only items with at least two annotations are "pairable"; all other
    annotations are ignored for both the observed and the expected
    disagreement.

    Returns alpha = 1 - Do / De as a float:
    - 1.0 if all pairable annotations are identical (Do == De == 0)
    - NaN if alpha is undefined (no annotators, no pairable item, or
      De == 0 while Do != 0)

    Raises ValueError if the annotators did not annotate the same number of items.
    """
    if not data:
        return float('nan')  # no annotators, nothing to compare

    n_items = len(data[0])

    # Ensure all annotators rated same number of items
    for row in data:
        if len(row) != n_items:
            raise ValueError("All annotators must have annotated the same number of items")

    # Transpose to [items][annotators], drop missing values and keep only the
    # items that at least two annotators rated.
    pairable_items = []
    for annots in zip(*data):
        present = [a for a in annots if a is not None]
        if len(present) >= 2:
            pairable_items.append(present)

    if not pairable_items:
        return float('nan')  # agreement is undefined without any comparable pair

    # Observed disagreement: weighted mean of squared distances within items.
    # Each pair of an item with m annotations is weighted by 1 / (m - 1) so
    # that an item contributes proportionally to m, not to m * (m - 1) / 2.
    # With the same number of annotations per item this is the plain mean.
    Do_num = 0.0
    Do_den = 0.0
    for present in pairable_items:
        weight = 1.0 / (len(present) - 1)
        for a, b in combinations(present, 2):
            Do_num += weight * distance_metric(a, b) ** 2
            Do_den += weight
    Do = Do_num / Do_den

    # Expected disagreement: mean squared distance over all pairs of pairable
    # annotations, regardless of which item they belong to.
    pooled = [a for present in pairable_items for a in present]
    De_num = 0.0
    De_den = 0
    for a, b in combinations(pooled, 2):
        De_num += distance_metric(a, b) ** 2
        De_den += 1
    De = De_num / De_den  # De_den >= 1 because at least one item has two annotations

    if De == 0:
        return 1.0 if Do == 0 else float('nan')
    return 1 - (Do / De)

In [14]:
# Data format: Each row = one annotator, each column = one item
# Each cell = a set of multilabel annotations
# One list per annotator, items in the same order for both; an empty set
# means the annotator assigned no label to that item.
annotator_1_sets = [{"ok", "revise text"}, {"revise text", "revise image"}, {"ok"}, set()]
annotator_2_sets = [{"ok", "revise text"}, {"revise text"}, {"revise text"}, set()]
data = [annotator_1_sets, annotator_2_sets]

alpha = krippendorff_alpha(data, distance_metric=jaccard_distance)
print(f"Krippendorff's alpha (Jaccard distance): {alpha:.3f}")

Krippendorff's alpha (Jaccard distance): 0.511
