Creating one combined dataset per Romansh variety that holds the reference translations together with all systems' German-to-Romansh outputs (e.g., for human evaluation)

In [27]:
# Base dataset: reference translations
from datasets import load_dataset
from romansh_mt_eval.benchmarking.constants import VARIETIES

# Map each variety to the "test" split of its de_DE-<variety> configuration.
variety_datasets = {
    variety: load_dataset("ZurichNLP/wmt24pp-rm", f"de_DE-{variety}")["test"]
    for variety in VARIETIES
}

In [28]:
# Display the loaded dataset for the "rm-rumgr" variety to check its columns and row count.
variety_datasets["rm-rumgr"]

Dataset({
    features: ['lp', 'domain', 'document_id', 'segment_id', 'is_bad_source', 'source', 'target', 'comment'],
    num_rows: 998
})

In [29]:
from romansh_mt_eval.benchmarking.system_results import get_all_system_translations

# Load the translations of every system; later cells index the result as
# system_translations[system_name][variety].
system_translations = get_all_system_translations()

In [30]:
# Display the GPT-4o entry for the "rm-rumgr" variety to inspect the returned object.
# NOTE(review): the repr prints every translation string, which makes the cell output
# very long; consider displaying only a short slice of one field instead.
system_translations["GPT-4o"]["rm-rumgr"]

SystemTranslations(sys_name='GPT-4o', variety='rm-rumgr', translations_rm_to_de=["I'm sorry, but the text you provided appears to be a GUID (Globally Unique Identifier) and not a segment in Romansh. Could you please provide the correct text for translation?", 'Die Darstellungen von Erdmännchen und Wasser in einer neuen Ausstellung', '„People Swimming in the Swimming Pool“ aus dem Jahr 2022 ist ein Kunstwerk von Vicente Siso, das ab dem 13. Januar in der Galerie „Tierra del Sol“ ausgestellt wird (Foto zur Verfügung gestellt von Vicente Siso).', '„Vicente Siso: Memories of the Land and Water“ – so lautet der Titel der Ausstellung am neuen Standort der Galerie „Tierra del Sol“ in West Hollywood. Siso ist seit 2012 als Künstler im Studio Arts Program dabei und zeigt nun seine erste Einzelausstellung. Siso wurde 1962 in Madrid geboren und wuchs in Venezuela, Trinidad und Miami auf. Mit gut zwanzig Jahren zog er mit seiner Familie nach Südkalifornien.', 'Siso zeigt sein meisterhaftes Können 

In [31]:
# Attach one "sys_<system>" column per system to every variety dataset.
#
# Systems flagged with `skips_bad_sources` provide no output for rows whose
# `is_bad_source` is true, so those rows get an empty-string placeholder and the
# system's translations are aligned with the remaining rows in order.
for system in system_translations.keys():
    for variety in VARIETIES:
        dataset = variety_datasets[variety]
        sys_obj = system_translations[system][variety]
        translations = sys_obj.translations_de_to_rm
        skips_bad_sources = sys_obj.skips_bad_sources

        # True for rows where this system has no translation of its own.
        needs_placeholder = [
            bool(is_bad_source and skips_bad_sources)
            for is_bad_source in dataset["is_bad_source"]
        ]

        # Fail loudly on a count mismatch instead of silently misaligning
        # translations with source segments (or dropping surplus translations).
        expected_count = needs_placeholder.count(False)
        if len(translations) != expected_count:
            raise ValueError(
                f"{system} / {variety}: expected {expected_count} translations, "
                f"got {len(translations)}"
            )

        # Consume through an iterator rather than `list.pop(0)`: the list held by
        # `sys_obj` stays intact (so this cell can be re-run and later cells can
        # still read it), and the alignment is linear instead of quadratic.
        remaining = iter(translations)
        translations_to_add = [
            "" if placeholder else next(remaining)
            for placeholder in needs_placeholder
        ]

        column_name = f"sys_{system}"
        # Replace a column left over from a previous run of this cell so that
        # re-running does not fail on a duplicate column name.
        if column_name in dataset.column_names:
            dataset = dataset.remove_columns(column_name)
        variety_datasets[variety] = dataset.add_column(column_name, translations_to_add)

In [32]:
# Display the "rm-rumgr" dataset again to confirm that the sys_* columns were added.
variety_datasets["rm-rumgr"]

Dataset({
    features: ['lp', 'domain', 'document_id', 'segment_id', 'is_bad_source', 'source', 'target', 'comment', 'sys_madlad400-10b-mt_direct', 'sys_madlad400-10b-mt_pivot_en', 'sys_translaturia', 'sys_supertext', 'sys_Llama-3.3-70b', 'sys_Gemini-2.5-Flash', 'sys_GPT-4o'],
    num_rows: 998
})

In [33]:
import os

# Write each variety's combined dataset to its own file inside the output folder.
output_dir = "system_dataset"
os.makedirs(output_dir, exist_ok=True)

for variety in VARIETIES:
    export_path = os.path.join(output_dir, f"system_dataset_{variety}.jsonl")
    variety_datasets[variety].to_json(export_path)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]