In [1]:
import csv
from pathlib import Path

import jsonlines
from sklearn.metrics import cohen_kappa_score

In [2]:
# Define data paths
data_path = Path(".") / "data"

winomt_ende_annotator1_path = data_path / "en-de.annotator1.jsonl"
winomt_ende_annotator2_path = data_path / "en-de.annotator2.jsonl"
winomt_enru_annotator1_path = data_path / "en-ru.annotator1.jsonl"
winomt_enru_annotator2_path = data_path / "en-ru.annotator2.jsonl"

# Original annotations (source: https://github.com/gabrielStanovsky/mt_gender/tree/master/data/human_annotations)
winomt_ende_original_path = data_path / "en-de.previous-work.csv"
winomt_enru_original_path = data_path / "en-ru.previous-work.csv"

In [3]:
# Load annotator labels

def load_annotator_labels(filepath):
  annotator_labels = {}
  with jsonlines.open(filepath) as f:
    for line in f:
      index = int(line["Index"])
      label = line["label"][0]
      if label == "Male":
        label = 0
      elif label == "Female":
        label = 1
      elif label == "Both / Neutral / Ambiguous":
        label = 2
      else:
        continue
      annotator_labels[index] = label
  return annotator_labels

ende_annotator1_labels = load_annotator_labels(winomt_ende_annotator1_path)
ende_annotator2_labels = load_annotator_labels(winomt_ende_annotator2_path)
enru_annotator1_labels = load_annotator_labels(winomt_enru_annotator1_path)
enru_annotator2_labels = load_annotator_labels(winomt_enru_annotator2_path)

In [4]:
# Load original labels
def load_original_labels(filepath):
  original_labels = {}
  with open(filepath) as f:
    for line in csv.DictReader(f):
      index = int(line["Index"])
      if line["Find entity? [Y/N]"] != "Y":
        continue
      label = line["Gender? [M/F/N]"]
      if label == "M":
        label = 0
      elif label == "F":
        label = 1
      elif label == "N":
        label = 2
      else:
        continue
      original_labels[index] = label
  return original_labels

ende_original_labels = load_original_labels(winomt_ende_original_path)
enru_original_labels = load_original_labels(winomt_enru_original_path)

In [5]:
# Compute inter-annotator agreement
def compute_agreement(labels1, labels2):
  # Remove samples that were only partially annotated
  for key in list(labels1.keys()):
      if key not in labels2:
          del labels1[key]
  for key in list(labels2.keys()):
      if key not in labels1:
          del labels2[key]
  print(f"Comparing {len(labels1)} + {len(labels2)} labels")
  keys = list(labels1.keys())
  kappa = cohen_kappa_score([labels1[key] for key in keys], [labels2[key] for key in keys])
  print(kappa)

print("EN–DE:")
compute_agreement(ende_annotator1_labels, ende_original_labels)
compute_agreement(ende_annotator2_labels, ende_original_labels)

print("EN–RU:")
compute_agreement(enru_annotator1_labels, enru_original_labels)
compute_agreement(enru_annotator2_labels, enru_original_labels)

EN–DE:
Comparing 88 + 88 labels
0.27413876758854927
Comparing 84 + 84 labels
0.29602595296025946
EN–RU:
Comparing 88 + 88 labels
0.5657001850709438
Comparing 83 + 83 labels
0.08398802843247288
