In [1]:
import pandas as pd
import sys
from utils import categories_map


category_replace = {
    "no-error": "No-error",
    "Other/-": "Other",
    "Terminology/Inappropriate for context": "Terminology/Inappropriate",
    "Terminology/Inconsistent use of terminology": "Terminology/Inconsistent",
}


class2id = {
    "Neutral": 1,
    "Minor": 2,
    "Major": 3,
}
id2class = {v: k for k, v in class2id.items()}


def clean_target(text):
    return text.replace("<v>", "").replace("</v>", "")


In [2]:
df_2020 = pd.read_csv(
    "./wmt-mqm-human-evaluation/newstest2020/ende/mqm_newstest2020_ende.tsv", sep="\t"
)

# df_2021 = pd.read_csv(
#     "./wmt-mqm-human-evaluation/newstest2021/ende/mqm-newstest2021_ende.tsv",
#     sep="\t",
#     on_bad_lines="skip",
# )

# df = pd.concat([df_2020, df_2021], ignore_index=True)

df= df_2020


In [3]:
# Replace category column with a dictionary setting
df["category"] = df["category"].replace(category_replace)
df["category"] = df["category"].replace(categories_map)

In [4]:
df.severity = df.severity.replace(
    {
        "no-error": "Neutral",
        "No-error": "Neutral",
        "word order": "Minor",
    }
).values.reshape(-1, 1)


In [5]:
df.shape

(79020, 9)

In [6]:
df.dropna(subset=["severity"], inplace=True)
df.shape

(79020, 9)

In [7]:
print("Severity Levels: ", df.severity.unique())
print("Error Categories: ", df.category.unique())

Severity Levels:  ['Minor' 'Major' 'Neutral']
Error Categories:  ['Other' 'Accuracy' 'No-error' 'Fluency' 'Terminology' 'Locale convention']


In [8]:
df.to_csv("./data/non_reduced_wmt.csv", index=False)

In [9]:
def f(x):
     doc_id = x['doc_id'].value_counts().index[0]
     seg_id = x['seg_id'].value_counts().index[0]
     sample_id = f"{doc_id}_{seg_id}"
     return pd.Series(dict(sample_id=sample_id,
                         source = x['source'].values[0], # takes first item as they are the same
                         target = clean_target(x['target'].values[0]),  # takes first item as they are the same
                         category = list(set(x['category'].values)),  # Create unique set of categories provided by the rater
                         severity = x['severity'].value_counts().index[0], # Takes majority of the values
                         ))


# Grupby system and get majority of severity
df_reduced = df.groupby(["system", "seg_id"]).apply(f)
df_reduced.reset_index(inplace=True)

In [10]:
df_reduced

Unnamed: 0,system,seg_id,sample_id,source,target,category,severity
0,Human-A.0,1,1_1,Michael Jackson wore tape on his nose to get f...,Ehemaliger Bodyguard berichtet: Michael Jackso...,[No-error],Neutral
1,Human-A.0,2,2_2,Michael Jackson's former bodyguard has claimed...,Der ehemalige Bodyguard von Michael Jackson be...,"[Accuracy, Fluency]",Minor
2,Human-A.0,3,3_3,"Matt Fiddes, now a property developer and owne...","Matt Fiddes, jetzt ein Bauträger und Inhaber e...","[Other, Accuracy, Fluency, Terminology]",Minor
3,Human-A.0,4,4_4,"To get front pages, he would reportedly don su...","Um auf Titelseiten zu gelangen, trug er angebl...","[No-error, Accuracy]",Minor
4,Human-A.0,5,5_5,We'll tell you what's true. You can form your ...,Wir sagen Ihnen die Fakten. Sie können sich da...,[No-error],Neutral
...,...,...,...,...,...,...,...
14175,eTranslation.737,1414,10_1414,Listing other activities to be taken up by the...,Herr Chatterjee listete weitere Aktivitäten au...,"[Other, Fluency]",Minor
14176,eTranslation.737,1415,11_1415,Chief Minister Mamata Banerjee had recently an...,Ministerpräsidentin Mamata Banerjee hatte kürz...,"[No-error, Fluency]",Minor
14177,eTranslation.737,1416,12_1416,"On Thursday, a bronze statue of Vidyasagar was...",Am Donnerstag wurde eine Bronzestatue von Vidy...,[No-error],Neutral
14178,eTranslation.737,1417,13_1417,Those who are against the social reforms of Vi...,"„Diejenigen, die gegen die Sozialreformen von ...","[Other, Fluency]",Minor


In [11]:
# Save the data
df_reduced.to_csv("./data/wmt_reduced.csv", index=False)

In [12]:
# Load the data
df_reduced = pd.read_csv("./data/wmt_reduced.csv")