In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# The notebook is expected to run from a sub-folder of the ARC-MTQE directory,
# so the project root is the parent of the current working directory.
main_dir, _ = os.path.split(os.getcwd())
data_dir = os.path.join(main_dir, "data")

In [None]:
def print_data_stats(n_rows, n_segments, n_critical_errors):
    """
    Print data summary statistics.

    Parameters
    ----------
    n_rows : int
        Total number of rows in the dataset.
    n_segments : int
        Number of segments; used as the denominator of the percentage.
    n_critical_errors : int
        Number of segments/rows flagged as critical errors.

    Prints four lines: the three counts and the share of critical errors
    expressed as a percentage of `n_segments`.
    """
    print(f"Number of rows: {n_rows}")
    print(f"Number of segments: {n_segments}")
    print(f"Number of critical errors: {n_critical_errors}")
    if n_segments:
        # Scale the ratio by 100 so the value matches the "Percentage" label.
        percentage = 100 * n_critical_errors / n_segments
        print(f"Percentage critical errors: {percentage:.2f}%")
    else:
        # An empty dataset has no meaningful percentage; avoid ZeroDivisionError.
        print("Percentage critical errors: n/a (no segments)")

## WMT 2021 critical errors

In [None]:
# MLQE-PE data root; it holds the train, dev and test data
mlqe_pe_data_dir = os.path.join(data_dir, "mlqe-pe", "data")

language_pairs = ["encs", "ende", "enja", "enzh"]

for lp in language_pairs:
    # The labelled splits and the blind test file share one folder;
    # the test gold labels are stored in a separate folder.
    ced_dir = os.path.join(mlqe_pe_data_dir, "catastrophic_errors")
    path_dev = os.path.join(ced_dir, f"{lp}_majority_dev.tsv")
    path_train = os.path.join(ced_dir, f"{lp}_majority_train.tsv")
    path_test = os.path.join(ced_dir, f"{lp}_majority_test_blind.tsv")
    path_goldlabels = os.path.join(
        mlqe_pe_data_dir,
        "catastrophic_errors_goldlabels",
        f"{lp}_majority_test_goldlabels",
        "goldlabels.txt",
    )

    # None of the files has a header row, so column names are supplied here.
    labelled_columns = ["idx", "source", "target", "annotations", "label"]
    df_dev = pd.read_csv(path_dev, sep="\t", header=None, names=labelled_columns)
    df_train = pd.read_csv(path_train, sep="\t", header=None, names=labelled_columns)
    df_test = pd.read_csv(path_test, sep="\t", header=None, names=["idx", "source", "target"])
    df_labels = pd.read_csv(path_goldlabels, sep="\t", header=None, names=["lang_pair", "ref", "idx", "label"])

    # The blind test file carries no labels; attach the gold labels by `idx`.
    df_test_labelled = df_test.merge(df_labels, on='idx')

    # Aggregate the three splits into one set of totals per language pair.
    splits = (df_train, df_dev, df_test_labelled)
    n_rows = sum(len(split) for split in splits)
    n_segments = sum(split["idx"].nunique() for split in splits)
    n_critical_errors = sum(int((split["label"] == "ERR").sum()) for split in splits)

    print(lp)
    print_data_stats(n_rows, n_segments, n_critical_errors)
    print("\n")


## WMT 2022 critical errors

In [None]:
wmt_data_dir = os.path.join(data_dir, "wmt-qe-2022-data")

language_pairs = ["en-de", "pt-en"]
for lp in language_pairs:
    # Train and dev labels share a parent folder; test gold labels live elsewhere.
    train_dev_dir = os.path.join(wmt_data_dir, "train-dev_data", "task3_ced")
    path_train = os.path.join(train_dev_dir, "train", lp, f"{lp}-train", "train.label")
    path_dev = os.path.join(train_dev_dir, "dev", lp, f"{lp}-dev", "dev.label")
    path_test = os.path.join(wmt_data_dir, "test_data-gold_labels", "task3_ced", lp, f"test.2022.{lp}.label")

    # Each label file is read as a single `label` column.
    df_train_labels = pd.read_csv(path_train, names=["label"])
    df_dev_labels = pd.read_csv(path_dev, names=["label"])
    df_test_labels = pd.read_csv(path_test, names=["label"])

    label_frames = (df_train_labels, df_dev_labels, df_test_labels)

    # Rows and segments are counted identically here (one row per segment).
    n_rows = sum(len(frame) for frame in label_frames)
    n_segments = n_rows
    n_critical_errors = sum(int((frame["label"] == "BAD").sum()) for frame in label_frames)

    print(lp)
    print_data_stats(n_rows, n_segments, n_critical_errors)
    print("\n")

## DEMETR

The numbers below do not match the paper, which lists 10 critical error categories. The dataset, however, contains 12 critical error categories, and one of the baselines is also labelled as a critical error, which brings the total to 13.

In [None]:
demetr_data_dir = os.path.join(data_dir, "demetr", "dataset")

# os.listdir returns entries in arbitrary order and also lists non-data
# entries (e.g. hidden files or checkpoint folders). Keep only the JSON files
# and sort them so the concatenated frame is the same on every run.
demetr_filenames = sorted(
    filename for filename in os.listdir(demetr_data_dir) if filename.endswith(".json")
)
if not demetr_filenames:
    raise FileNotFoundError(f"No .json files found in {demetr_data_dir}")

# Read each perturbation file into its own frame, then stack them.
dfs = []
for filename in demetr_filenames:
    f = os.path.join(demetr_data_dir, filename)
    df = pd.read_json(f)
    dfs.append(df)

demetr_df = pd.concat(dfs)

In [None]:
# Group by language pair once and reuse the grouping for each summary line.
# `.unique()` collapses the per-group values, so a single number in the output
# means every group has the same count.
by_lang = demetr_df.groupby('lang_tag')
segment_ids = by_lang['id']

print("Number of segments per language pair: ", segment_ids.count().unique())
print("Number of unique segments per language pair: ", segment_ids.nunique().unique())
print("Number of error IDs per language pair: ", by_lang['pert_id'].nunique().unique())
print("Number of error names per language pair: ", by_lang['pert_name'].nunique().unique())

# The last summary groups the other way round: language pairs per error category.
by_error = demetr_df.groupby('pert_name')
print("Number of language pairs per error category: ", by_error['lang_tag'].nunique().unique())

The cell below shows that `pert_id` values 5 and 6 are missing. In the corresponding files, `major_id5_pp_removed.json` and `critical_id6_addition.json`, the `pert_id` listed within the files is 8 instead of the expected 5 and 6. However, the `pert_name` column appears to be used correctly in all files.

In [None]:
# Expected perturbation IDs 1..35 that never occur in the `pert_id` column.
expected_pert_ids = set(np.arange(1, 36, 1))
expected_pert_ids.difference(demetr_df['pert_id'].unique())

In [None]:
# Rows and segments are counted identically for DEMETR: both are the frame length.
n_segments = demetr_df.shape[0]
n_rows = n_segments

# The count is hard-coded rather than derived from the data
# (i.e. demetr_df[demetr_df['severity'] == 'critical'].shape[0]).
# note: in the paper, there are 10 critical error categories --> 10,000 segments
n_critical_errors = 10000

print_data_stats(n_rows, n_segments, n_critical_errors)

## Unbabel MQM

In [None]:
# Tab-separated MQM annotation file in the `unbabel` data folder.
file_path = os.path.join(data_dir, "unbabel", "mqm_generalMT2022_enru.tsv")

# index_col=False: do not use the first column as the index.
df = pd.read_csv(file_path, sep="\t", index_col=False)

# Preview the last rows, restricted to the columns used in the cells below.
df.tail()[['seg_id', 'category', 'severity']]

In [None]:
# 1 where the annotation severity is "critical", 0 otherwise.
df['crit_error'] = (df["severity"] == "critical").astype(int)

In [None]:
n_rows = len(df)
n_segments = df['seg_id'].nunique()

# A segment counts as a critical error if at least one of its rows is critical.
critical_rows_per_segment = df.groupby('seg_id')['crit_error'].sum()
n_critical_errors = int((critical_rows_per_segment >= 1).sum())

print_data_stats(n_rows, n_segments, n_critical_errors)