In [6]:
import os
import collections
import json
import glob
import re
from src.wimbd_ import _load_dataset
from datasets import load_from_disk

# Root directory under which the generated datasets are written.
HF_HOME = "/share/edc/home/antonis/datasets/huggingface"
# os.makedirs() returns None, so build the path first and create the directory
# as a separate step; otherwise save_path would always be None.
save_path = os.path.join(HF_HOME, 'wmt09_gens')
os.makedirs(save_path, exist_ok=True)

# Directory that is searched recursively for per-run doc_results.json files.
base_pth = "/share/edc/home/antonis/LLM-Incidental-Supervision/incidental-supervision/models/experiment_6_logits_max_4/"
# glob returns matches in arbitrary order; sort so re-runs process files in the same order.
doc_res_files = sorted(glob.glob(os.path.join(base_pth, "**/doc_results.json"), recursive=True))


In [2]:
# Captures the language pair (e.g. 'en-es') that follows a 'wmtNN-' prefix in a path.
pattern = re.compile(r'wmt\d{2}-(\w{2}-\w{2})')
# Search each path once and keep only the paths that actually contain a pair.
_pair_matches = map(pattern.search, doc_res_files)
language_pairs = [m.group(1) for m in _pair_matches if m]
# Load the first results file for inspection; the context manager closes the handle.
with open(doc_res_files[0]) as _first_results_file:
    doc = json.load(_first_results_file)

# Compiled once at definition time instead of on every call.
_LANG_PAIR_RE = re.compile(r'wmt\d{2}-(\w{2}-\w{2})')


def find_lang(path):
    """Return the language pair (e.g. ``'en-es'``) embedded in ``path``.

    The pair is the ``xx-yy`` text captured directly after a ``wmtNN-`` prefix,
    where ``NN`` is two digits. The first such occurrence in ``path`` is used.

    Args:
        path: string to search, typically a results-file path.

    Returns:
        The captured ``xx-yy`` string.

    Raises:
        ValueError: if ``path`` contains no ``wmtNN-xx-yy`` component.
    """
    match = _LANG_PAIR_RE.search(path)
    if match is None:
        # Fail with the offending path instead of an opaque AttributeError on None.
        raise ValueError(f"No 'wmtNN-xx-yy' language pair found in path: {path!r}")
    return match.group(1)

# Load the WMT data through the project helper. The result is later indexed by
# language-pair strings such as 'en-es' (see insert_gens_into_wmt).
# NOTE(review): the exact return type of _load_dataset is not visible here — confirm.
wmt = _load_dataset('wmt')

Loading dataset wmt
Loading dataset for language pair ('cs', 'en')
Loading dataset for language pair ('de', 'en')
Loading dataset for language pair ('fr', 'en')
Loading dataset for language pair ('es', 'en')
Loading dataset for language pair ('it', 'en')
Loading dataset for language pair ('hu', 'en')
Loading dataset for language pair ('en', 'cs')
Loading dataset for language pair ('en', 'de')
Loading dataset for language pair ('en', 'fr')
Loading dataset for language pair ('en', 'es')
Loading dataset for language pair ('en', 'it')
Loading dataset for language pair ('en', 'hu')


In [30]:
from tqdm import tqdm
from datasets import DatasetDict
from datasets import Dataset

def _attach_generations(rows, docs, lang1, lang2):
    """Attach each doc's first generation to its matching translation row, in place.

    A row matches a doc only when BOTH ``row[lang1] == doc['src']`` and
    ``row[lang2] == doc['ref']``. When several rows share the same
    (source, reference) pair they are consumed in ascending row order, one row
    per doc.

    Args:
        rows: list of translation dicts; matched rows gain a ``'gen'`` key.
        docs: iterable of dicts with ``'src'``, ``'ref'`` and ``'result'`` keys.
        lang1: key of the source-language text in each row.
        lang2: key of the reference-language text in each row.

    Returns:
        ``(unmatched_rows, unplaced_docs)``: the sorted indices of rows that
        received no generation, and the number of docs that had no free
        matching row.
    """
    # Index still-free rows by their (source, reference) text so each doc is
    # placed with one lookup instead of a scan over every remaining row.
    free_rows = collections.defaultdict(collections.deque)
    for idx, row in enumerate(rows):
        free_rows[(row[lang1], row[lang2])].append(idx)

    unplaced_docs = 0
    for doc in docs:
        src = doc['src']
        ref = doc['ref']
        gen = doc['result'][0]

        bucket = free_rows.get((src, ref))
        if not bucket:
            # No free row carries this exact (source, reference) pair.
            unplaced_docs += 1
            continue
        rows[bucket.popleft()]['gen'] = gen

    unmatched_rows = sorted(idx for bucket in free_rows.values() for idx in bucket)
    return unmatched_rows, unplaced_docs


def insert_gens_into_wmt(wmt, doc_res_files):
    """Merge model generations into the WMT rows and save one dataset per model.

    For every results file, the model name and task are read from the 5th- and
    3rd-from-last ``/``-separated path components and the language pair from
    ``find_lang(path)``. Each entry of ``doc_res[task]`` contributes
    ``doc['result'][0]`` as the ``'gen'`` field of the translation row whose
    source and reference text both equal the entry's ``'src'`` and ``'ref'``.
    After all files are processed, each model's language pairs are wrapped in a
    ``DatasetDict`` and saved under ``HF_HOME`` as ``wmt09_gens_<model_name>``.

    Args:
        wmt: mapping from language-pair string (e.g. ``'en-es'``) to an object
            whose ``to_dict()`` yields a ``'translation'`` list of row dicts.
        doc_res_files: iterable of paths to JSON result files.

    Returns:
        ``defaultdict`` mapping model name -> language pair -> ``Dataset``.

    Raises:
        RuntimeError: if any translation row is left without a generation.
        ValueError / KeyError: propagated from ``find_lang`` or from missing
            keys in ``wmt`` or the results JSON.
    """
    wmt_model_langs = collections.defaultdict(dict)
    for path in tqdm(doc_res_files, desc="Processing files"):
        with open(path, 'r') as f:
            doc_res = json.load(f)

        path_parts = path.split('/')
        model_name = path_parts[-5]
        task = path_parts[-3]
        lang = find_lang(path)
        lang1, lang2 = lang.split('-')

        # to_dict() materialises fresh Python objects, so adding 'gen' below
        # does not modify the shared `wmt` input.
        doc_res_task_dict = wmt[lang].to_dict()
        rows = doc_res_task_dict['translation']

        docs = tqdm(doc_res[task], desc=f"Inserting generations for {task}", leave=False)
        unmatched_rows, unplaced_docs = _attach_generations(rows, docs, lang1, lang2)

        if unmatched_rows:
            # Every row must carry a 'gen' before the dataset is built.
            raise RuntimeError(
                f"Could not match {len(unmatched_rows)} of {len(rows)} rows for "
                f"{model_name} {task} ({unplaced_docs} generations had no matching row)"
            )
        if unplaced_docs:
            print(f"Warning: {unplaced_docs} generations for {model_name} {task} had no matching row and were skipped")

        wmt_model_langs[model_name][lang] = Dataset.from_dict(doc_res_task_dict)
        print(f"{model_name} {lang} keys: {doc_res_task_dict['translation'][0].keys()}")

    for model_name, model_data in wmt_model_langs.items():
        # Sanity-print one row's keys; fall back to any available pair when
        # 'en-es' was not processed for this model.
        sample_lang = 'en-es' if 'en-es' in model_data else next(iter(model_data))
        print(f"{model_name}: {model_data[sample_lang]['translation'][0].keys()}")
        save_pth = os.path.join(HF_HOME, f'wmt09_gens_{model_name}')
        dataset_dict = DatasetDict(model_data)
        dataset_dict.save_to_disk(save_pth)
        print(f"Saved to {save_pth}")

    return wmt_model_langs


# Run the merge over every results file. The per-model datasets are written to
# disk inside insert_gens_into_wmt; the returned mapping is
# model name -> language pair -> Dataset.
wmt_model = insert_gens_into_wmt(wmt, doc_res_files)

# Disabled alternative: wrap the base `wmt` translations in a single DatasetDict.
# dataset_dict = {lang: Dataset.from_dict({'translation': value['translation']}) for lang, value in wmt.items()}
# dataset_dict = DatasetDict(dataset_dict)

# Now you can save it to disk
# dataset_dict.save_to_disk(os.path.join(HF_HOME, 'wmt09_gens'))

Processing files:   1%|          | 1/120 [00:00<01:32,  1.28it/s]

OLMo-7B en-hu keys: dict_keys(['en', 'hu', 'gen'])




Processing files:   2%|▏         | 2/120 [00:01<01:09,  1.69it/s]

OLMo-7B de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:   2%|▎         | 3/120 [00:01<01:03,  1.85it/s]

OLMo-7B en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:   3%|▎         | 4/120 [00:02<00:59,  1.94it/s]

OLMo-7B cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:   4%|▍         | 5/120 [00:02<00:57,  2.00it/s]

OLMo-7B en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:   5%|▌         | 6/120 [00:03<00:55,  2.07it/s]

OLMo-7B en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:   6%|▌         | 7/120 [00:03<00:54,  2.06it/s]

OLMo-7B en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:   7%|▋         | 8/120 [00:04<00:54,  2.07it/s]

OLMo-7B hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:   8%|▊         | 9/120 [00:04<00:52,  2.11it/s]

OLMo-7B it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:   8%|▊         | 10/120 [00:05<00:51,  2.13it/s]

OLMo-7B en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:   9%|▉         | 11/120 [00:05<00:51,  2.10it/s]

OLMo-7B fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  10%|█         | 12/120 [00:05<00:51,  2.09it/s]

OLMo-7B es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  11%|█         | 13/120 [00:06<00:51,  2.08it/s]

pythia-12b en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  12%|█▏        | 14/120 [00:06<00:50,  2.10it/s]

pythia-12b de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  12%|█▎        | 15/120 [00:07<00:50,  2.09it/s]

pythia-12b en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  13%|█▎        | 16/120 [00:07<00:49,  2.11it/s]

pythia-12b cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  14%|█▍        | 17/120 [00:08<00:48,  2.12it/s]

pythia-12b en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  15%|█▌        | 18/120 [00:08<00:47,  2.14it/s]

pythia-12b en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  16%|█▌        | 19/120 [00:09<00:46,  2.15it/s]

pythia-12b en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  17%|█▋        | 20/120 [00:09<00:46,  2.14it/s]

pythia-12b hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  18%|█▊        | 21/120 [00:10<00:46,  2.15it/s]

pythia-12b it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  18%|█▊        | 22/120 [00:10<00:45,  2.15it/s]

pythia-12b en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  19%|█▉        | 23/120 [00:11<00:45,  2.14it/s]

pythia-12b fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  20%|██        | 24/120 [00:11<00:44,  2.14it/s]

pythia-12b es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  21%|██        | 25/120 [00:12<00:44,  2.12it/s]

pythia-31m en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  22%|██▏       | 26/120 [00:12<00:45,  2.07it/s]

pythia-31m de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  22%|██▎       | 27/120 [00:13<00:44,  2.08it/s]

pythia-31m en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  23%|██▎       | 28/120 [00:13<00:44,  2.08it/s]

pythia-31m cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  24%|██▍       | 29/120 [00:14<00:43,  2.10it/s]

pythia-31m en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  25%|██▌       | 30/120 [00:14<00:42,  2.14it/s]

pythia-31m en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  26%|██▌       | 31/120 [00:14<00:41,  2.15it/s]

pythia-31m en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  27%|██▋       | 32/120 [00:15<00:40,  2.15it/s]

pythia-31m hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  28%|██▊       | 33/120 [00:15<00:40,  2.16it/s]

pythia-31m it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  28%|██▊       | 34/120 [00:16<00:39,  2.17it/s]

pythia-31m en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  29%|██▉       | 35/120 [00:16<00:38,  2.18it/s]

pythia-31m fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  30%|███       | 36/120 [00:17<00:38,  2.20it/s]

pythia-31m es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  31%|███       | 37/120 [00:17<00:38,  2.18it/s]

pythia-1.4b en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  32%|███▏      | 38/120 [00:18<00:37,  2.16it/s]

pythia-1.4b de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  32%|███▎      | 39/120 [00:18<00:37,  2.15it/s]

pythia-1.4b en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  33%|███▎      | 40/120 [00:19<00:37,  2.13it/s]

pythia-1.4b cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  34%|███▍      | 41/120 [00:19<00:36,  2.15it/s]

pythia-1.4b en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  35%|███▌      | 42/120 [00:19<00:35,  2.17it/s]

pythia-1.4b en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  36%|███▌      | 43/120 [00:20<00:35,  2.18it/s]

pythia-1.4b en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  37%|███▋      | 44/120 [00:20<00:35,  2.17it/s]

pythia-1.4b hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  38%|███▊      | 45/120 [00:21<00:34,  2.18it/s]

pythia-1.4b it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  38%|███▊      | 46/120 [00:21<00:33,  2.19it/s]

pythia-1.4b en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  39%|███▉      | 47/120 [00:22<00:39,  1.87it/s]

pythia-1.4b fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  40%|████      | 48/120 [00:22<00:36,  1.96it/s]

pythia-1.4b es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  41%|████      | 49/120 [00:23<00:35,  2.01it/s]

pythia-410m en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  42%|████▏     | 50/120 [00:23<00:35,  1.98it/s]

pythia-410m de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  42%|████▎     | 51/120 [00:24<00:34,  2.02it/s]

pythia-410m en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  43%|████▎     | 52/120 [00:24<00:34,  1.99it/s]

pythia-410m cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  44%|████▍     | 53/120 [00:25<00:32,  2.03it/s]

pythia-410m en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  45%|████▌     | 54/120 [00:25<00:32,  2.05it/s]

pythia-410m en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  46%|████▌     | 55/120 [00:26<00:31,  2.08it/s]

pythia-410m en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  47%|████▋     | 56/120 [00:26<00:30,  2.09it/s]

pythia-410m hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  48%|████▊     | 57/120 [00:27<00:29,  2.12it/s]

pythia-410m it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  48%|████▊     | 58/120 [00:27<00:29,  2.14it/s]

pythia-410m en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  49%|████▉     | 59/120 [00:28<00:28,  2.14it/s]

pythia-410m fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  50%|█████     | 60/120 [00:28<00:27,  2.15it/s]

pythia-410m es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  51%|█████     | 61/120 [00:29<00:28,  2.05it/s]

pythia-70m en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  52%|█████▏    | 62/120 [00:29<00:27,  2.07it/s]

pythia-70m de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  52%|█████▎    | 63/120 [00:30<00:27,  2.09it/s]

pythia-70m en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  53%|█████▎    | 64/120 [00:30<00:26,  2.09it/s]

pythia-70m cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  54%|█████▍    | 65/120 [00:31<00:26,  2.10it/s]

pythia-70m en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  55%|█████▌    | 66/120 [00:31<00:25,  2.13it/s]

pythia-70m en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  56%|█████▌    | 67/120 [00:32<00:24,  2.14it/s]

pythia-70m en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  57%|█████▋    | 68/120 [00:32<00:24,  2.12it/s]

pythia-70m hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  57%|█████▊    | 69/120 [00:33<00:24,  2.07it/s]

pythia-70m it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  58%|█████▊    | 70/120 [00:33<00:24,  2.08it/s]

pythia-70m en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  59%|█████▉    | 71/120 [00:33<00:23,  2.08it/s]

pythia-70m fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  60%|██████    | 72/120 [00:34<00:23,  2.05it/s]

pythia-70m es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  61%|██████    | 73/120 [00:34<00:23,  2.04it/s]

pythia-14m en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  62%|██████▏   | 74/120 [00:35<00:22,  2.04it/s]

pythia-14m de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  62%|██████▎   | 75/120 [00:35<00:22,  2.01it/s]

pythia-14m en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  63%|██████▎   | 76/120 [00:36<00:21,  2.00it/s]

pythia-14m cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  64%|██████▍   | 77/120 [00:37<00:21,  1.98it/s]

pythia-14m en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  65%|██████▌   | 78/120 [00:37<00:20,  2.00it/s]

pythia-14m en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  66%|██████▌   | 79/120 [00:38<00:20,  1.99it/s]

pythia-14m en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  67%|██████▋   | 80/120 [00:38<00:19,  2.03it/s]

pythia-14m hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  68%|██████▊   | 81/120 [00:38<00:18,  2.08it/s]

pythia-14m it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  68%|██████▊   | 82/120 [00:39<00:17,  2.12it/s]

pythia-14m en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  69%|██████▉   | 83/120 [00:39<00:17,  2.14it/s]

pythia-14m fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  70%|███████   | 84/120 [00:40<00:16,  2.15it/s]

pythia-14m es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  71%|███████   | 85/120 [00:40<00:16,  2.15it/s]

pythia-2.8b en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  72%|███████▏  | 86/120 [00:41<00:15,  2.16it/s]

pythia-2.8b de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  72%|███████▎  | 87/120 [00:41<00:15,  2.07it/s]

pythia-2.8b en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  73%|███████▎  | 88/120 [00:42<00:15,  2.09it/s]

pythia-2.8b cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  74%|███████▍  | 89/120 [00:42<00:14,  2.11it/s]

pythia-2.8b en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  75%|███████▌  | 90/120 [00:43<00:14,  2.12it/s]

pythia-2.8b en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  76%|███████▌  | 91/120 [00:43<00:13,  2.14it/s]

pythia-2.8b en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  77%|███████▋  | 92/120 [00:44<00:15,  1.83it/s]

pythia-2.8b hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  78%|███████▊  | 93/120 [00:44<00:14,  1.92it/s]

pythia-2.8b it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  78%|███████▊  | 94/120 [00:45<00:13,  1.99it/s]

pythia-2.8b en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  79%|███████▉  | 95/120 [00:45<00:12,  2.03it/s]

pythia-2.8b fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  80%|████████  | 96/120 [00:46<00:11,  2.06it/s]

pythia-2.8b es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  81%|████████  | 97/120 [00:46<00:11,  2.08it/s]

pythia-6.9b en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  82%|████████▏ | 98/120 [00:47<00:10,  2.11it/s]

pythia-6.9b de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  82%|████████▎ | 99/120 [00:47<00:09,  2.11it/s]

pythia-6.9b en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  83%|████████▎ | 100/120 [00:48<00:09,  2.12it/s]

pythia-6.9b cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  84%|████████▍ | 101/120 [00:48<00:08,  2.15it/s]

pythia-6.9b en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  85%|████████▌ | 102/120 [00:48<00:08,  2.16it/s]

pythia-6.9b en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  86%|████████▌ | 103/120 [00:49<00:07,  2.16it/s]

pythia-6.9b en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  87%|████████▋ | 104/120 [00:49<00:07,  2.15it/s]

pythia-6.9b hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  88%|████████▊ | 105/120 [00:50<00:06,  2.16it/s]

pythia-6.9b it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  88%|████████▊ | 106/120 [00:50<00:06,  2.16it/s]

pythia-6.9b en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  89%|████████▉ | 107/120 [00:51<00:05,  2.17it/s]

pythia-6.9b fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  90%|█████████ | 108/120 [00:51<00:05,  2.17it/s]

pythia-6.9b es-en keys: dict_keys(['en', 'es', 'gen'])


Processing files:  91%|█████████ | 109/120 [00:52<00:05,  2.16it/s]

pythia-160m en-hu keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  92%|█████████▏| 110/120 [00:52<00:04,  2.16it/s]

pythia-160m de-en keys: dict_keys(['de', 'en', 'gen'])


Processing files:  92%|█████████▎| 111/120 [00:53<00:04,  2.16it/s]

pythia-160m en-cs keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  93%|█████████▎| 112/120 [00:53<00:03,  2.15it/s]

pythia-160m cs-en keys: dict_keys(['cs', 'en', 'gen'])


Processing files:  94%|█████████▍| 113/120 [00:54<00:03,  2.16it/s]

pythia-160m en-fr keys: dict_keys(['en', 'fr', 'gen'])


Processing files:  95%|█████████▌| 114/120 [00:54<00:02,  2.18it/s]

pythia-160m en-de keys: dict_keys(['de', 'en', 'gen'])


Processing files:  96%|█████████▌| 115/120 [00:54<00:02,  2.18it/s]

pythia-160m en-es keys: dict_keys(['en', 'es', 'gen'])


Processing files:  97%|█████████▋| 116/120 [00:55<00:01,  2.16it/s]

pythia-160m hu-en keys: dict_keys(['en', 'hu', 'gen'])


Processing files:  98%|█████████▊| 117/120 [00:55<00:01,  2.17it/s]

pythia-160m it-en keys: dict_keys(['en', 'it', 'gen'])


Processing files:  98%|█████████▊| 118/120 [00:56<00:00,  2.16it/s]

pythia-160m en-it keys: dict_keys(['en', 'it', 'gen'])


Processing files:  99%|█████████▉| 119/120 [00:56<00:00,  2.17it/s]

pythia-160m fr-en keys: dict_keys(['en', 'fr', 'gen'])


Processing files: 100%|██████████| 120/120 [00:57<00:00,  2.09it/s]


pythia-160m es-en keys: dict_keys(['en', 'es', 'gen'])
OLMo-7B: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_OLMo-7B
pythia-12b: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-12b
pythia-31m: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-31m
pythia-1.4b: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-1.4b
pythia-410m: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-410m
pythia-70m: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-70m
pythia-14m: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-14m
pythia-2.8b: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-2.8b
pythia-6.9b: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-6.9b
pythia-160m: dict_keys(['en', 'es', 'gen'])


Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3027 [00:00<?, ? examples/s]

Saved to /share/edc/home/antonis/datasets/huggingface/wmt09_gens_pythia-160m


In [31]:
# Build the path from HF_HOME so the read location stays in sync with where
# insert_gens_into_wmt saved the per-model datasets.
wmt_model_pth = os.path.join(HF_HOME, 'wmt09_gens_pythia-160m')
wmt_model = load_from_disk(wmt_model_pth)
# Spot-check that a saved row carries the added 'gen' field next to the two languages.
wmt_model['en-es']['translation'][1].keys()

dict_keys(['en', 'es', 'gen'])