In [1]:
import itertools
from collections import Counter, defaultdict
from pathlib import Path

import jsonlines
from sklearn.metrics import cohen_kappa_score

In [2]:
# Language pair to analyse; it is interpolated into the annotation and
# prediction file names built in the following cell.
language_pair = "zh-en"

In [3]:
# Define data paths
# Both directories are resolved relative to the current working directory.
annotations_path = Path(".").joinpath("annotations")
predictions_path = Path(".").joinpath("predictions")

# One JSONL file per annotator for the selected language pair.
annotator1_path = annotations_path.joinpath(f"{language_pair}.annotator1.jsonl")
annotator2_path = annotations_path.joinpath(f"{language_pair}.annotator2.jsonl")

# One JSONL prediction file per split (dev and test).
dev_path = predictions_path.joinpath(f"{language_pair}.dev.jsonl")
test_path = predictions_path.joinpath(f"{language_pair}.test.jsonl")

In [4]:
# Load annotations
def _load_two_label_annotations(path):
  """Read a JSONL annotation file into a dict.

  Only records whose "label" field has exactly two entries are kept. Records
  are keyed by (seg_id, system, coverage_error_type); if a key occurs more
  than once, the last record read wins.
  """
  loaded = {}
  with jsonlines.open(path) as reader:
    for record in reader:
      if len(record["label"]) != 2:
        continue
      record_key = (record["seg_id"], record["system"], record["coverage_error_type"])
      loaded[record_key] = record
  return loaded


annotations1 = _load_two_label_annotations(annotator1_path)
annotations2 = _load_two_label_annotations(annotator2_path)

# Count samples per coverage error type once per annotator (a missing type counts as 0).
loaded_type_counts1 = Counter(record["coverage_error_type"] for record in annotations1.values())
loaded_type_counts2 = Counter(record["coverage_error_type"] for record in annotations2.values())

print("Annotations 1 total:", len(annotations1))
print("Annotations 1 OT:", loaded_type_counts1["overtranslation"])
print("Annotations 1 UT:", loaded_type_counts1["undertranslation"])
print("Annotations 2 total:", len(annotations2))
print("Annotations 2 OT:", loaded_type_counts2["overtranslation"])
print("Annotations 2 UT:", loaded_type_counts2["undertranslation"])

Annotations 1 total: 409
Annotations 1 OT: 153
Annotations 1 UT: 256
Annotations 2 total: 742
Annotations 2 OT: 255
Annotations 2 UT: 487


In [5]:
# Filter annotations by whether they apply to the final version of the algorithm
# Collect the (seg_id, system, coverage_error_type) keys present in the dev or test predictions.
prediction_keys = set()
for predictions_file in (dev_path, test_path):
  with jsonlines.open(predictions_file) as reader:
    prediction_keys.update(
      (prediction["seg_id"], prediction["system"], prediction["coverage_error_type"])
      for prediction in reader
    )


def _keep_annotation(key, sample):
  """Keep a sample if its key has a prediction and it is not labelled "source-error"."""
  return key in prediction_keys and "source-error" not in sample["label"]


annotations1 = {key: sample for key, sample in annotations1.items() if _keep_annotation(key, sample)}
annotations2 = {key: sample for key, sample in annotations2.items() if _keep_annotation(key, sample)}

# Count remaining samples per coverage error type (a missing type counts as 0).
filtered_type_counts1 = Counter(sample["coverage_error_type"] for sample in annotations1.values())
filtered_type_counts2 = Counter(sample["coverage_error_type"] for sample in annotations2.values())

print("Annotations 1 total:", len(annotations1))
print("Annotations 1 OT:", filtered_type_counts1["overtranslation"])
print("Annotations 1 UT:", filtered_type_counts1["undertranslation"])
print("Annotations 2 total:", len(annotations2))
print("Annotations 2 OT:", filtered_type_counts2["overtranslation"])
print("Annotations 2 UT:", filtered_type_counts2["undertranslation"])

Annotations 1 total: 354
Annotations 1 OT: 149
Annotations 1 UT: 205
Annotations 2 total: 616
Annotations 2 OT: 249
Annotations 2 UT: 367


In [6]:
# Extract overlapping samples for inter-annotator agreement
# Keys annotated by both annotators.
shared_keys = annotations1.keys() & annotations2.keys()

# Iterate each annotator's own dict so its insertion order is preserved.
overlap_samples1 = {key: annotations1[key] for key in annotations1 if key in shared_keys}
overlap_samples2 = {key: annotations2[key] for key in annotations2 if key in shared_keys}

assert len(overlap_samples1) == len(overlap_samples2)
print("Number of overlapping samples: ", len(overlap_samples1))

Number of overlapping samples:  29


In [7]:
# Question 1 inter-annotator agreement
def _is_bad_translation(sample):
  """Return True if "bad-translation" is among the sample's labels."""
  return "bad-translation" in sample["label"]


# Use one shared, sorted key order so both label lists are aligned sample by sample.
question1_keys = sorted(overlap_samples1)
question1_labels1 = [_is_bad_translation(overlap_samples1[key]) for key in question1_keys]
question1_labels2 = [_is_bad_translation(overlap_samples2[key]) for key in question1_keys]

question1_kappa = cohen_kappa_score(question1_labels1, question1_labels2)
print(question1_kappa)

0.4257425742574257


In [8]:
# Question 1+2 inter-annotator agreement
def _label_signature(sample):
  """Return the sample's full label list as one order-independent string category."""
  return str(sorted(sample["label"]))


# Use one shared, sorted key order so both label lists are aligned sample by sample.
question2_keys = sorted(overlap_samples1)
question2_labels1 = [_label_signature(overlap_samples1[key]) for key in question2_keys]
question2_labels2 = [_label_signature(overlap_samples2[key]) for key in question2_keys]

question2_kappa = cohen_kappa_score(question2_labels1, question2_labels2)
print(question2_kappa)

0.22065063649222072


In [9]:
# Tally labels over the test-split samples of both annotators.
#   question1_counter: per coverage error type, how many samples carry each
#                      of "good-translation" / "bad-translation".
#   question2_counter: per coverage error type, how many samples carry exactly
#                      each label pair listed below.
#   span_counters:     for each matched label pair, how often each predicted
#                      span occurs among the matching samples.
question1_counter = Counter()
question2_counter = Counter()
span_counters = defaultdict(Counter)

question1_labels = ["good-translation", "bad-translation"]
question2_label_pairs = [
  ("good-translation", "OT-supported-information"),
  ("good-translation", "OT-fluency"),
  ("good-translation", "UT-redundant-information"),
  ("good-translation", "UT-fluency"),
  ("good-translation", "syntactic-difference"),
  ("good-translation", "unclear"),
  ("bad-translation", "OT-unsupported-information"),
  ("bad-translation", "OT-supported-information"),
  ("bad-translation", "UT-important-information"),
  ("bad-translation", "UT-redundant-information"),
  ("bad-translation", "other-error-accuracy"),
  ("bad-translation", "other-error-fluency"),
]

for sample in itertools.chain(annotations1.values(), annotations2.values()):
  if sample["split"] != "test":
    continue

  error_type = sample["coverage_error_type"]
  # Per-sample values that do not depend on the label pair: compute them once
  # instead of once per label pair.
  sample_labels = set(sample["label"])
  sample_spans = [
    span
    for span in itertools.chain(
      sample["predicted_overtranslation_words"].split(" | "),
      sample["predicted_undertranslation_words"].split(" | "),
    )
    if span.strip()
  ]

  for label in question1_labels:
    # Adding a bool keeps the key present even when the count stays 0.
    question1_counter[error_type + "_" + label] += label in sample_labels

  for label_pair in question2_label_pairs:
    label = error_type + "_" + "+".join(label_pair)
    is_match = set(label_pair) == sample_labels
    # Adding a bool keeps the key present even when the count stays 0.
    question2_counter[label] += is_match
    if is_match:
      # Only touch span_counters for real matches with non-blank spans, so the
      # defaultdict never gains empty entries.
      for span in sample_spans:
        span_counters[label][span] += 1

# Print "<coverage error type>\t<label(s)>\t<count>", sorted by key, Question 1 first.
for counter in (question1_counter, question2_counter):
  for key in sorted(counter):
    print(key.replace("_", "\t") + "\t" + str(counter[key]))

overtranslation	bad-translation	42
overtranslation	good-translation	309
undertranslation	bad-translation	95
undertranslation	good-translation	397
overtranslation	bad-translation+OT-supported-information	0
overtranslation	bad-translation+OT-unsupported-information	4
overtranslation	bad-translation+UT-important-information	0
overtranslation	bad-translation+UT-redundant-information	0
overtranslation	bad-translation+other-error-accuracy	27
overtranslation	bad-translation+other-error-fluency	11
overtranslation	good-translation+OT-fluency	29
overtranslation	good-translation+OT-supported-information	60
overtranslation	good-translation+UT-fluency	0
overtranslation	good-translation+UT-redundant-information	0
overtranslation	good-translation+syntactic-difference	12
overtranslation	good-translation+unclear	208
undertranslation	bad-translation+OT-supported-information	0
undertranslation	bad-translation+OT-unsupported-information	0
undertranslation	bad-translation+UT-important-information	67
undert