# Step 1

In [5]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
#from datasets import load_metric  not being used
import torch
from unidecode import unidecode
from torch.utils.data import Dataset, DataLoader, random_split
import nltk
import numpy as np

In [6]:
!nvidia-smi  # not using cuda due to memory issues. (access to a 4GB ram)

Fri May 27 14:32:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.68.01    Driver Version: 512.59       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   51C    P8    N/A /  N/A |      0MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

To properly analyze mBART-50, I chose an exisiting parallel [corpus](https://www.statmt.org/europarl/v7/fr-en.tgz) from [Europarl](https://www.statmt.org/europarl/). This corpus is in French and English and has already been aligned with the Church and Gale algorithm. There are a total of *2007723* sentences of which only 225 are used as the final dataset.

For preprocessing, both unidecode and random_split from torch were used in initial runs. However, my current implementation of random_split with random seeds was still perturbing the alignment and as such I did not shuffle the sentence ordering. This is problematic in terms of evaluation, because we are passing the data in the same order. Alternatively, another split that could be considered for an extension of this project is what the [MT-Adapted Datasheets for Datasets](https://arxiv.org/pdf/2005.13156.pdf ) paper stated: *It is recommended to use data from the last quarter of 2000 as a test set, while the rest should be used as training data.* Nevertheless, this seems to be a heuristic and a general recommendation and as such, I still stuck to my "ordinal" split. To be more precise, 80% of my data went to the training set and the remaining 20% to a test set. The training set split was given such a big margin in case I'd like to add a validation set split. 

Additionally, unidecode was needed to standardize certain characters such as quotation marks and dashes '-'.

In [15]:
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'  # forcing cpu. I left other device statement for runs on the cloud
print(device)

True
cpu


In [7]:
# loading mBART's model, tokenizer, and passing language values
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "fr_XX"
tokenizer.tgt_lang = "en_XX"

# Calling corpora files. It is expected that they are in the root dir of this notebook.

src_file = "europarl-v7.fr-en.fr"
trg_file = "europarl-v7.fr-en.en"

In [8]:
def open_file(corpus_file, n=225):
    """
    :param corpus_file: file
    :param n: number of sentences to be fetched. The default is 225
    Opens corpus file. tokenization is not called after reading the file if the user needs to 
    perform any custom data splits
    returns: 225 sentences out of the corpus
    """
    with open(corpus_file, 'r', encoding="utf-8") as f:
        corpus = f.readlines()
    f.close()
    return corpus[:n]

def tokenize_set(corpus):  # 500
    """
    :param n: corpus
    returns: tokenized sentences in a list. Each tokenized sentence represents a dictionary with input_ids
    and attention masks
    """   
    return [tokenizer(unidecode(line), return_tensors="pt", padding=True) for line in corpus]

def translate(sentences, trg_lang_code=tokenizer.tgt_lang, model=model):
    """
    : param sentences: a list containing tokenized sentences
    : trg_lang_code: a string with the target language's code. Default is the tokenizer's tgt_lang attribute. I.e. en_XX
    returns: decoded tokens of a translated sentence. I.e. a translation
    """
    translated_sents = list()
    for sent in sentences:
        # mBART50 may accept a max size of 512 or 1024. Beam size can also be played around with.
        generated_tokens = model.generate(**sent, num_beams=1, max_length=512, 
                    forced_bos_token_id=tokenizer.lang_code_to_id[trg_lang_code])
        decoded_tokens = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        translated_sents.append(decoded_tokens)
    return translated_sents

def idtensor_to_tokens(tensor):
    """
    : param tensor: a tensor 
    it is flattened and cast to...
    returns: convert ids to tokens
    """
    flattened = tensor.squeeze().detach().numpy()
    return tokenizer.convert_ids_to_tokens(flattened)

def iterate_tensors(tokenized_sents):
    """
    : param tokenized_sents: expects tokenization with the encoded tokens. Dictionary with 2 keys, input_ids and attention mask
    returns: nested encoded tokens (input_ids)
    """
    return [idtensor_to_tokens(t["input_ids"]) for t in tokenized_sents]   


In [9]:
# load up corpus and zip it in a list
parallel_corpus = list(zip(open_file(src_file, n=225), open_file(trg_file, n=225)))
# split corpus
train_set_size = int(len(parallel_corpus) * 0.8)  # value can be changed to change data splits
test_set_size = len(parallel_corpus) - train_set_size
train_set = parallel_corpus[:train_set_size]
test_set = parallel_corpus[train_set_size:]
# random_split no longer being used but left in final version for future testing
#train_set, test_set = random_split(parallel_corpus, [train_set_size, test_set_size], generator=torch.Generator().manual_seed(42))

# unzipping the data splits into source and target sentences. Doing this with the current random_split would 
# disturb alignment
src_train_sents, trg_train_sents = zip(*train_set)
src_test_sents, trg_test_sents = zip(*test_set)

In [10]:
# Showing size of train split. 
print(len(src_train_sents))
print(len(trg_train_sents))
print(src_train_sents[:3])
print(trg_train_sents[:3])

180
180
('Reprise de la session\n', 'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances.\n', 'Comme vous avez pu le constater, le grand "bogue de l\'an 2000" ne s\'est pas produit. En revanche, les citoyens d\'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.\n')
('Resumption of the session\n', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.\n', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.\n")


# Step 2

By using the previously written tokenization function, I tokenized all of the source and target sentences. We convert the ids to tokens to visualize the sentences. Note that the first index corresponds to the language code. Since this step only required to perform translations with the test set, the example corresponds to the first sentence in the test set.

In [11]:
src = tokenize_set(src_test_sents)
trg = tokenize_set(trg_test_sents)
src_sents_from_ids = iterate_tensors(src)
eval_sents_from_ids = iterate_tensors(trg)
print(src[0]["input_ids"])
print()
print(src_sents_from_ids[0])
print()
print(trg[0]["input_ids"])
print()
print(eval_sents_from_ids[0]) 

tensor([[250008, 125876,  72581,  10274,     40,      8,  88500,  16865,      8,
           5460,  40276,  40514,      4,  10274,  43529,  72581,     21,  54159,
           1505,   1609,    104,     25,  19988,  40276,  40514,      7,      4,
            405,    569,  16095,  81581,    773,   6432,      8, 123196,      5,
              6,      2]])

['fr_XX', '▁Elles', '▁vont', '▁soit', '▁se', '▁de', 'barra', 'sser', '▁de', '▁leur', '▁carga', 'ison', ',', '▁soit', '▁elles', '▁vont', '▁la', '▁melan', 'ger', '▁avec', '▁d', "'", 'autres', '▁carga', 'ison', 's', ',', '▁ce', '▁qui', '▁eng', 'endre', '▁une', '▁serie', '▁de', '▁problemes', '.', '▁', '</s>']

tensor([[250008,  32255, 164917,  53095,  40101,  60458,    111,   2363,  33362,
            707,  17664,    442,    678,   3789,  33362,      4,   3129, 113660,
          44402,      5,      6,      2]])

['fr_XX', '▁These', '▁smaller', '▁companies', '▁either', '▁dispose', '▁of', '▁their', '▁cargo', '▁or', '▁mix', '▁it', '▁with', '▁other

After performing translations once, I stored them in a file to facilitate testing and debugging. Note that to this submission the *translations_output.txt* has been attached. Otherwise, the translate function may be run locally. As a final note, in terms of writing the file, the *writelines()* method may be a more efficient way.
```Python
translations = translate(src)

with open("translation_output.txt", 'w') as f:
    for trans in translations:        
        f.write(trans[0] + '\n')  # writelines()
f.close()
```

In [33]:
# If the translation file has been provided, we simply load them up.

with open("translation_output_45_sents.txt", 'r') as f:
    translations = f.readlines()
f.close()

In [34]:
hypotheses = [tokenizer(sent, return_tensors="pt") for sent in translations]  # do I need the tensors?
hypotheses = iterate_tensors(hypotheses)
references = eval_sents_from_ids
original_french_sents = src_sents_from_ids
# outputting 3 sample translations
for i in range(40, 43):  # since we used the test split, i.e. 20% of total data set (225), we only have 45 sentences.
    print(f"Original sentence {original_french_sents[i]}" + '\n')
    print(f"Manual translation from Eurparl: {references[i]}" + '\n' * 2 + f"mBART translation {hypotheses[i]}" + '\n' * 3)


Original sentence ['fr_XX', '▁Mais', '▁je', '▁tien', 's', '▁a', '▁dire', ',', '▁Mes', 'da', 'mes', '▁et', '▁Messi', 'eurs', '▁les', '▁Deput', 'es', ',', '▁que', '▁la', '▁securi', 'te', '▁est', '▁un', '▁', 'objectif', '▁priorit', 'aire', '▁de', '▁la', '▁Commission', '.', '▁', '</s>']

Manual translation from Eurparl: ['fr_XX', '▁But', '▁I', '▁would', '▁like', '▁to', '▁say', '▁that', '▁safety', '▁is', '▁a', '▁priorit', 'y', '▁objective', '▁for', '▁the', '▁Commission', '.', '▁', '</s>']

mBART translation ['fr_XX', '▁But', '▁I', '▁would', '▁like', '▁to', '▁say', ',', '▁la', 'dies', '▁and', '▁gent', 'le', 'men', ',', '▁that', '▁security', '▁is', '▁a', '▁priorit', 'y', '▁objective', '▁of', '▁the', '▁Commission', '.', '▁', '</s>']



Original sentence ['fr_XX', '▁Comme', '▁je', '▁le', '▁dira', 'i', '▁lors', '▁du', '▁debat', '▁sur', '▁l', "'", 'E', 'rika', ',', '▁nous', '▁n', "'", 'attend', 'ons', '▁pas', '▁qu', "'", 'une', '▁cata', 'stro', 'phe', '▁sur', 'vien', 'ne', '▁pour', '▁nous', '▁con

In [35]:
# We add a sanity check to make sure we have the same amount of references and hypotheses. We also print out the number
# of sentences that share the same length. 
assert len(references) == len(translations)
for ref, hyp in zip(references, hypotheses):
    if len(ref) == len(hyp):
        print(ref)
        print(hyp)
        print()    

['fr_XX', '▁I', '▁would', '▁like', '▁to', '▁mention', '▁one', '▁final', '▁point', '.', '▁', '</s>']
['fr_XX', '▁I', '▁would', '▁like', '▁to', '▁raise', '▁one', '▁last', '▁point', '.', '▁', '</s>']

['fr_XX', '▁I', '▁should', '▁like', '▁to', '▁make', '▁just', '▁a', '▁few', '▁comments', '.', '▁', '</s>']
['fr_XX', '▁I', '▁would', '▁like', '▁to', '▁make', '▁a', '▁few', '▁re', 'marks', '.', '▁', '</s>']

['fr_XX', '▁Consider', 'ing', '▁that', '▁it', '▁is', '▁only', '▁today', '▁that', '▁we', '▁are', '▁dealing', '▁with', '▁a', '▁Commission', '▁proposal', '▁first', '▁made', '▁on', '▁19', '▁March', '▁1998', ',', '▁even', '▁though', '▁Parliament', '▁responde', 'd', '▁relative', 'ly', '▁quickly', ',', '▁this', '▁time', '▁lag', '▁is', '▁a', '▁little', '▁too', '▁long', '.', '▁', '</s>']
['fr_XX', '▁When', '▁I', '▁consider', '▁that', '▁the', '▁first', '▁Commission', '▁proposal', '▁was', '▁table', 'd', '▁on', '▁19', '▁March', '▁1998', '▁and', '▁that', '▁we', '▁are', '▁now', '▁dealing', '▁with', '▁it

In [36]:
# these results are with only 45 sentences
corpus_references = [[ref] for ref in references]
smoothing = nltk.translate.bleu_score.SmoothingFunction().method5  # methods 2 and 5 provide the highest bleu
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)
print(bleu)

smoothing = nltk.translate.bleu_score.SmoothingFunction().method2  # methods 2 and 5 provide the highest bleu
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)
print(bleu)


0.5254438794181694
0.39037920561827


In [36]:
# if we increase the data size to 200 translated sentences then...

hypotheses = [tokenizer(sent, return_tensors="pt") for sent in translations]  # do I need the tensors?
hypotheses = iterate_tensors(hypotheses)
references = eval_sents_from_ids
original_french_sents = src_sents_from_ids

for i in range(150, 153):  # since we used the test split, i.e. 20% of total data set (1000), we only have 200 sentences.
    print(f"Original sentence {original_french_sents[i]}" + '\n')
    print(f"Manual translation from Eurparl: {references[i]}" + '\n' * 2 + f"mBART translation {hypotheses[i]}" + '\n' * 3)

Original sentence ['fr_XX', '▁Les', '▁rapport', 's', '▁economi', 'ques', '▁dans', '▁le', '▁monde', '▁reel', '▁de', 'mont', 'rent', '▁a', '▁suffi', 's', 'ance', '▁que', '▁l', "'", 'eli', 'mination', '▁de', '▁toute', '▁intervention', '▁publique', '▁dans', '▁le', '▁marche', '▁ne', '▁produit', '▁null', 'ement', '▁une', '▁concurrence', '▁parfait', 'e', '▁et', '▁une', '▁al', 'location', '▁optimale', '▁des', '▁ressources', '.', '▁', '</s>']

Manual translation from Eurparl: ['fr_XX', '▁Economic', '▁relationships', '▁in', '▁the', '▁real', '▁world', '▁adequat', 'ely', '▁demonstrat', 'e', '▁that', '▁elimina', 'ting', '▁all', '▁public', '▁intervention', '▁in', '▁the', '▁market', '▁does', '▁not', '▁in', '▁any', '▁way', '▁bring', '▁about', '▁perfect', '▁competition', '▁and', '▁the', '▁optim', 'um', '▁distribution', '▁of', '▁resources', '.', '▁', '</s>']

mBART translation ['fr_XX', '▁Economic', '▁reports', '▁in', '▁the', '▁real', '▁world', '▁show', '▁that', '▁it', '▁is', '▁', 'sufficient', '▁that',

In [39]:
assert len(references) == len(translations)  # sanity check passed.
proper_aligned_ref_hyp = 0
for ref, hyp in zip(references, hypotheses):
    if len(ref) == len(hyp):
        proper_aligned_ref_hyp += 1
print(proper_aligned_ref_hyp)

23


Adding more sentences to the test set did not improve the bleu score... Probably because only 23 out of 200 sentences have the same length as their references. This means only 11.5% of total test sentences.

In [38]:
corpus_references = [[ref] for ref in references]
smoothing = nltk.translate.bleu_score.SmoothingFunction().method5  # methods 2 and 5 provide the highest bleu
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)
print(bleu)

smoothing = nltk.translate.bleu_score.SmoothingFunction().method2  # methods 2 and 5 provide the highest bleu
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)
print(bleu)

0.4194307038995439
0.2945431298371375


# Step 3

Sentences were originally loaded as tuples for the sake of efficiency. However, when we fed them to tokenizer.as_target_tokenizer, issues come up when casting them to tensors. As a result of this, they have been cast to lists before being passed to the tokenizer and the model.

In [52]:
trg_train_sents = list(trg_train_sents)
src_train_sents = list(src_train_sents)

trg_test_sents = list(trg_test_sents)
src_test_sents = list(src_test_sents)

model_inputs = tokenizer(src_train_sents, return_tensors='pt', padding=True)

def get_target_sent_info(trg_sents):
    """
    :param trg sents: target sentences could be from train, test, or validation set
    returns trg_tokenizer_output with the attention mask that we need, the non-
    """
    with tokenizer.as_target_tokenizer():
        trg_tokenizer_output = tokenizer(trg_sents, return_tensors='pt', padding=True)    
    label_ids = trg_tokenizer_output['input_ids']
    pad_mask = trg_tokenizer_output['attention_mask'] == 0    
    # using padding mask to convert pad tokens to -100, careful when converting these ids to tokens.
    # negative numbers are not part of mBART's ids.
    label_ids = torch.where(pad_mask, torch.full_like(label_ids, -100), label_ids)
    # concatenaing the eos_taken in the first index (col) per tensor   
    label_ids = torch.concat((torch.full((label_ids.shape[0], 1), tokenizer.eos_token_id), label_ids), 1) 
    return label_ids

label_ids = get_target_sent_info(trg_train_sents)

The training loop as explained on [hugging face](https://huggingface.co/docs/transformers/model_doc/mbart) involves passing the model inputs and the labels, which we can visualize in the next cell.

In [53]:
#model_inputs = model_inputs.to(device)
#torch.cuda.empty_cache() uncomment if using gpu
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri May 27 14:57:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.68.01    Driver Version: 512.59       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   45C    P8    N/A /  N/A |      0MiB /  4096MiB |      1%      Default |
|                               |            

In [56]:
optimizer = torch.optim.Adam([{'params': model.parameters(), 'lr': 2e-5}])  # 0.001
minibatch_size = 8
length_train_x = input_ids.shape[0]
num_minibatches = int(np.ceil(length_train_x/minibatch_size))
model.to(device)
model.train(True)
print('epoch', 'error')
for epoch in range(1, 5+1):
    indexes = np.arange(length_train_x)
    np.random.shuffle(indexes)            
    for i in range(num_minibatches):
        minibatch_indexes = indexes[i*minibatch_size:(i+1)*minibatch_size]
        # in case of GPU, uncomment and make sure device == cuda!
        #minibatch_x = torch.tensor(input_ids[minibatch_indexes], dtype=torch.long, device=device)          
        #minibatch_y = torch.tensor(label_ids[minibatch_indexes], dtype=torch.long, device=device)        
        optimizer.zero_grad()     
        #loss = model(minibatch_x, labels=minibatch_y).loss   
        loss = model(model_inputs["input_ids"][minibatch_indexes], labels=label_ids[minibatch_indexes]).loss   
        loss.backward()
        optimizer.step() 
    if epoch%1 == 0:
        print(epoch, loss.detach().tolist())
model.train(False)

torch.save(model, "pretrained_model.pth")
bart_model = torch.load("pretrained_model.pth")

epoch error
1 1.2499650716781616
2 0.5504726767539978
3 0.3144689202308655
4 0.23181627690792084
5 0.09578200429677963


Loss running for 15 epochs and training on 180 sentences on google colab. Minibatch_size == 8 and learning_rate == 2e-5
```Python
epoch error
1 1.8225083351135254
2 1.0663105249404907
3 0.7641125917434692
4 0.5679468512535095
5 0.46091803908348083
6 0.3314163088798523
7 0.29134687781333923
8 0.2458421289920807
9 0.20358489453792572
10 0.18593527376651764
11 0.12719759345054626
12 0.11407511681318283
13 0.11325351893901825
14 0.11906187981367111
15 0.08755840361118317
```

In [None]:
finetuned_translations = translate(src, model=bart_model)

with open("fine_trans_45_sents.txt", 'w') as f:
    for trans in finetuned_translations:        
        f.write(trans[0] + '\n')  # writelines()
f.close()

In [None]:
fine_tuned_hypotheses = [tokenizer(sent, return_tensors="pt") for sent in finetuned_translations]
fine_tuned_hypotheses = iterate_tensors(fine_tuned_hypotheses)
for i in range(4):
    print(f"Manual translation from Eurparl: {references[i]}" + '\n' * 2 + f"mBART translation {hypotheses[i]}" + "\n" * 3)  

In [None]:
corpus_references = [[ref] for ref in references]
smoothing = nltk.translate.bleu_score.SmoothingFunction().method5  # methods 2 and 5 provide the highest bleu
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)
print(bleu)

smoothing = nltk.translate.bleu_score.SmoothingFunction().method2  # methods 2 and 5 provide the highest bleu
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)
print(bleu)

In this case, we used the 200 total data set to translate 45 sentences. The training loop has an added minibatch loop to facilitate training, a learning rate of 2e-5 as the learning rate should remain small.

# Step 4

In [None]:
MBartForConditionalGeneration.from_pretrained(base_model)

# with torch.no_grad():        
#     loss = model(**trg_tokenizer_output, labels=label_ids).loss
#     logits = model(indexed_val_x, mask_val_x)  # should we use dataloader?
#     batch_probs = torch.softmax(logits, dim=1)
#     for (indexes, probs) in zip(indexed_val_x, batch_probs):
#         target = np.array(indexed_val_y, np.int64)
#         output = probs.detach().numpy().argmax(axis=1)
#         accuracy = (target == output).sum() / len(target)        
#         #print(probs.detach().numpy().argmax(axis=1))
#         print(tokenizer.convert_ids_to_tokens(indexes))
#         print()
#         print([tag_set[index] for index in probs.numpy().argmax(1).tolist()])
#         print(accuracy)
#         print()

In [80]:
corpus_references = [[ref] for ref in references]
smoothing = nltk.translate.bleu_score.SmoothingFunction().method2 
bleu = nltk.translate.bleu_score.corpus_bleu(corpus_references, hypotheses, smoothing_function=smoothing)  # hypothesis should be the google translate. reference replaced by example
print(bleu)


FROM COLAB

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# References:
\* not previously linked
- https://machinelearningmastery.com/prepare-french-english-dataset-machine-translation/
- https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb
- https://tmramalho.github.io/science/2020/06/10/fine-tune-neural-translation-models-with-mBART/
- https://huggingface.co/blog/how-to-train
- https://zhuanlan.zhihu.com/p/379071787
- https://huggingface.co/docs/transformers/training
- https://towardsdatascience.com/mbart50-multilingual-fine-tuning-of-extensible-multilingual-pretraining-70a7305d4838