## **Model Fine-tuning** (Notebook sourced from translation notebook [here](https://huggingface.co/docs/transformers/notebooks))

Enable logging with Weights and Biases:

In [1]:
# import gc
# del model
# gc.collect()
# torch.cuda.empty_cache()

In [2]:
# Master switch for experiment tracking, read by later cells: the W&B login,
# the trainer's `report_to` value and the final `wandb.finish()` all check it.
wb = True  # Enable WeightsAndBiases tracking

In [3]:
import os
# When the notebook starts in /content (the Colab working directory), mount
# Google Drive and move into the project checkout stored there.
work_dir = os.getcwd()
if work_dir == '/content':
  from google.colab import drive  # imported lazily: only needed on this branch
  drive.mount('/content/drive')
  # Relative to /content, i.e. /content/drive/MyDrive/github_repos/XLdefgen.
  os.chdir('drive/MyDrive/github_repos/XLdefgen')

If running this on Colab, uncomment the following cell to install requisite packages.

In [4]:
# !pip install datasets transformers sacrebleu sentencepiece wandb
# !apt install git-lfs

In [5]:
# Weights & Biases setup; skipped entirely when tracking is disabled.
if wb:
  import wandb
  # NOTE(review): the printed install path ends up in the saved output and
  # reveals the local directory layout; consider clearing it before sharing.
  print(wandb.__path__)
  wandb.login()
  # IPython magic that sets an environment variable for this kernel.
  # Presumably read by the trainer's W&B integration to pick the project -- TODO confirm.
  %env WANDB_PROJECT=XLdefgen

['/home/wildeb1/anaconda3/envs/XLdefgen/lib/python3.8/site-packages/wandb']


[34m[1mwandb[0m: Currently logged in as: [33mbrandonwilde[0m (use `wandb login --relogin` to force relogin)


env: WANDB_PROJECT=XLdefgen


If storing model on HF Model Hub, uncomment the following:

In [6]:
# from huggingface_hub import notebook_login
# notebook_login()

A script version of this notebook to fine-tune the model in a distributed fashion using multiple GPUs or TPUs is available [here](https://github.com/huggingface/transformers/tree/master/examples/seq2seq).

Specify model checkpoint to load (from HF Model Hub)


In [7]:
# Hub identifier of the pretrained checkpoint. Later cells pass it to
# `from_pretrained` for both tokenizer and model, and test it for the
# substrings "t5" / "mbart" to choose model-specific settings.
model_checkpoint = "google/mt5-small"

## Loading the dataset

In [8]:
import datasets
from datasets import load_dataset, load_metric, Dataset
import csv
import torch
import numpy as np
import pandas as pd

# data_path = "codwoe_data.csv"

# class csvDataset(Dataset):

#     def __init__(self,file_name):
#         self.data_df = pd.read_csv(file_name)
#         self.data_dict = data_df.to_dict(orient='index')

#     def __len__(self):
#         return len(self.y)
  
#     def __getitem__(self,idx):
#         import numbers
#         if isinstance(idx, numbers.Integral):  # item is an integer
#             idx = [idx]
#         elif isinstance(idx, slice):  # item is a slice
#             idx = list(range(idx.start or 0, idx.stop or len(self), idx.step or 1))
#         else:  # invalid index type
#             raise TypeError('{cls} indices must be integers or slices, not {idx}'.format(
#                 cls=type(self).__name__,
#                 idx=type(idx).__name__,
#             ))

#         return [self.data_dict[i] for i in idx]

# codwoe_data = csvDataset(data_path)

# Alternative: reload a previously saved copy from disk instead of the Hub.
# raw_datasets = datasets.load_from_disk("de-en_wmt16_tokd")
# WMT16 German-English sentence pairs, loaded through the `datasets` library.
raw_datasets = load_dataset("wmt16", "de-en")

# SacreBLEU scorer; `compute_metrics` calls it during evaluation.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the `evaluate` package -- confirm for the installed version.
metric = load_metric("sacrebleu")

Reusing dataset wmt16 (/home/wildeb1/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a)


In [9]:
# Preview the first two validation pairs. Rows are sliced before the column is
# selected so that only two examples are materialised, instead of reading the
# whole "translation" column and slicing the resulting list.
raw_datasets['validation'][:2]['translation']
# codwoe_data[:2]

[{'de': 'Die Premierminister Indiens und Japans trafen sich in Tokio.',
  'en': 'India and Japan prime ministers meet in Tokyo'},
 {'de': 'Indiens neuer Premierminister Narendra Modi trifft bei seinem ersten wichtigen Auslandsbesuch seit seinem Wahlsieg im Mai seinen japanischen Amtskollegen Shinzo Abe in Toko, um wirtschaftliche und sicherheitspolitische Beziehungen zu besprechen.',
  'en': "India's new prime minister, Narendra Modi, is meeting his Japanese counterpart, Shinzo Abe, in Tokyo to discuss economic and security ties, on his first major foreign visit since winning May's election."}]

To get a sense of what the data looks like, the following function shows some examples picked randomly from the dataset.

In [10]:
# import datasets
# import random
# import pandas as pd
# from IPython.display import display, HTML

# def show_random_elements(dataset, num_examples=5):
#     assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
#     picks = []
#     for _ in range(num_examples):
#         pick = random.randint(0, len(dataset)-1)
#         while pick in picks:
#             pick = random.randint(0, len(dataset)-1)
#         picks.append(pick)
    
#     df = pd.DataFrame(dataset[picks])
#     for column, typ in dataset.features.items():
#         if isinstance(typ, datasets.ClassLabel):
#             df[column] = df[column].transform(lambda i: typ.names[i])
#     display(HTML(df.to_html()))

In [11]:
# show_random_elements(raw_datasets["train"])

Demonstration of the metric in use:

In [12]:
# Sanity check of the metric call signature: `predictions` is a list of
# strings, `references` a list of reference-lists (one list per prediction).
# The score comes out as 0.0 even though the strings match exactly: the output
# shows zero 3-gram and 4-gram counts for these two-word sentences, which
# presumably drives the combined BLEU score to zero.
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'score': 0.0,
 'counts': [4, 2, 0, 0],
 'totals': [4, 2, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

## Preprocessing the data

In [13]:
from transformers import AutoTokenizer
    
# Load the tokenizer published with the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Model-specific tokenizer adaptations

In [14]:
# T5-family checkpoints get a natural-language task prefix prepended to every
# source sentence; all other architectures receive the raw sentence.
if "t5" in model_checkpoint:
    prefix = "translate German to English: "
#     prefix = ""
    print("Inputs will include prefix!")
else:
    prefix = ""
    print("Inputs will not include prefix!")

# mBART selects languages through tokenizer attributes rather than a prefix.
# Its language codes are written with underscores ("de_DE", "en_XX"), and the
# direction has to match the German -> English task used in this notebook.
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "de_DE"
    tokenizer.tgt_lang = "en_XX"

Inputs will include prefix!


Create preprocessing function

In [15]:
# Token budgets for source and target: `preprocess_function` truncates longer
# sequences and pads shorter ones to exactly these lengths.
max_input_length = 64
max_target_length = 64
# Keys into each example's "translation" dict: source and target language.
source_lang = "de"
target_lang = "en"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: A batch as passed by `Dataset.map(..., batched=True)`; its
            "translation" entry is a list of dicts keyed by language code.

    Returns:
        The tokenizer output for the prefixed source sentences, with an added
        "labels" entry holding the target token ids in which every padding
        position has been replaced by -100.
    """
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            padding="max_length",
            truncation=True).input_ids

    # Replace padding with -100 (the value `compute_metrics` later treats as
    # "not a real label"). The pad id is taken from the tokenizer instead of
    # being hard-coded as 0, so the masking stays correct for checkpoints
    # whose padding token has a different id.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in labels
    ]
    return model_inputs

Specify whether reduced dataset should be passed to model

In [16]:
# When True, only a shuffled subset of each split is tokenized and trained on.
trim_datasets = True
train_size = 10000  # training examples kept when trimming
eval_size = 100  # validation examples kept when trimming

Preprocess data

In [17]:
# Build `tokenized_datasets`, either from a fixed-seed random subset of the
# train/validation splits or from every split of the raw dataset.
if trim_datasets:
    # Fixed seed so the same subset is selected on every run.
    small_train_dataset = raw_datasets["train"].shuffle(seed=42).select(range(train_size))
    small_eval_dataset = raw_datasets["validation"].shuffle(seed=42).select(range(eval_size))
    raw_datasets_trim = datasets.DatasetDict({'train': small_train_dataset, 'validation': small_eval_dataset})
    tokenized_datasets = raw_datasets_trim.map(preprocess_function, batched=True)
    print("Datasets trimmed and tokenized.")
else:
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    print("Raw datasets tokenized.")

# NOTE(review): after this deletion, re-running this cell (or the earlier
# preview cell) raises NameError until the dataset is loaded again.
del raw_datasets #to clear memory
# NOTE(review): the model is only moved to the GPU in a later cell, so this
# call probably has nothing to release here -- confirm.
torch.cuda.empty_cache()

Loading cached shuffled indices for dataset at /home/wildeb1/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a/cache-4574fe47268cd3fd.arrow
Loading cached shuffled indices for dataset at /home/wildeb1/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a/cache-bf1487bfb5cd2cad.arrow


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Datasets trimmed and tokenized.


The results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to `map` has changed (and that the cached data therefore must not be reused). 🤗 Datasets warns you when it uses cached files, but you can pass `load_from_cache_file=False` in the call to `map` to ignore the cached files and force the preprocessing to be applied again.

## Fine-tuning the model

Now that our data is ready, we can download the pretrained model and fine-tune it. Since our task is of the sequence-to-sequence kind, we use the `AutoModelForSeq2SeqLM` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us.

In [18]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Load the pretrained seq2seq checkpoint and move its weights to that device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

cuda:0


Specify batch size and training arguments

In [19]:
# Hyperparameters and bookkeeping values for the training run.
batch_size = 2
learning_rate = 2e-4
optim = 'adamw_hf'  # only referenced by the commented-out `optim=` / run-name lines below
model_name = model_checkpoint.split("/")[-1]  # checkpoint name without the organisation part; only used in a commented-out path
if wb:
  report = "wandb"
else:
  report = "none"
train_k = int(train_size/1000)  # training-set size in thousands; only used in the commented-out run name
args = Seq2SeqTrainingArguments(
    # Earlier output-directory naming schemes, kept for reference:
    # f"drive/MyDrive/{model_name}-finetuned-{source_lang}-to-{target_lang}",
    # f"XLdefgen-{source_lang}-to-{target_lang}",
#     f"XLd-trans-{source_lang}2{target_lang}-tr{train_k}k-b{batch_size}-lr{learning_rate}-{optim}", #output directory
    "XLd-trans-fixed_padding",  #output directory (checkpoints are written below it)
    evaluation_strategy = "steps",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
#     optim=optim,
    adafactor=False,
    weight_decay=0.01,
    save_total_limit=3, #max num of checkpoints to keep
    num_train_epochs=15,
    predict_with_generate=True,  #score generated sequences, which compute_metrics needs for BLEU
    fp16=False,         #mixed precision (acceleration) - doesn't work well with t5 models
    push_to_hub=False,  #push to HF Model Hub
    report_to=report,   #for data logging
#     run_name='Run_continued',     #for data logging
    ignore_data_skip=False,   #if true and loading from checkpoint, this will start at beginning of dataset rather than where left off
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,  #defaults to true unless 'loss' is metric for best model
    prediction_loss_only=False, #keep predictions so compute_metrics can score them; True would return only the loss
)

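Add a data collator to batch the examples, padding inputs and labels to a common length within each batch (here both are already padded to a fixed length during preprocessing)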

In [20]:
# Collator that assembles tokenized examples into batches for the trainer.
# Passing `model` presumably lets it derive decoder inputs from the labels -- TODO confirm.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Post-processing and compute metrics

In [21]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Each label is additionally wrapped in a one-element list, giving one
    reference list per prediction.

    Returns:
        A `(predictions, references)` tuple of lists.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: A `(predictions, labels)` pair of token-id arrays; when
            `predictions` is a tuple, only its first element is used.

    Returns:
        Dict with "bleu" (the SacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a placeholder that cannot be decoded. It is swapped for the pad
    # id in the labels and also in the predictions, which may carry the same
    # placeholder when batches of different lengths are concatenated; left in
    # place it would break decoding and be counted as a real token in gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# def compute_metrics(eval_pred):
#   '''Example for logging multiple metrics'''
#     metric1 = load_metric("precision")
#     metric2 = load_metric("recall")
    
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     precision = metric1.compute(predictions=predictions, references=labels)["precision"]
#     recall = metric2.compute(predictions=predictions, references=labels)["recall"]
#     return {"precision": precision, "recall": recall}

Instantiate Trainer

In [22]:
from typing import Optional, List, Dict
from torch.utils.data import Dataset
import time
import math
from transformers.debug_utils import DebugOption
from transformers.trainer_utils import speed_metrics

class PPLTrainer(Seq2SeqTrainer):
    """
    Seq2SeqTrainer variant whose evaluation metrics also include perplexity.

    `evaluate` runs the evaluation loop, adds speed metrics and a
    `<metric_key_prefix>_perplexity` entry computed as exp(loss), then logs
    the metrics and notifies the callbacks.
    NOTE(review): the body appears to be adapted from the library's own
    `evaluate`; confirm it still matches the installed transformers version.
    """
    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
        max_length: Optional[int] = None,
        num_beams: Optional[int] = None,
    ) -> Dict[str, float]:
        """Run evaluation and return the metrics, including perplexity.

        Args:
            eval_dataset: Dataset to evaluate; forwarded to
                `get_eval_dataloader`.
            ignore_keys: Forwarded unchanged to the evaluation loop.
            metric_key_prefix: Prefix of every metric name; also used to find
                the loss and to name the perplexity entry.
            max_length: Generation length limit; falls back to
                `args.generation_max_length` when None.
            num_beams: Beam count for generation; falls back to
                `args.generation_num_beams` when None.

        Returns:
            The metrics dict produced by the evaluation loop, extended with
            speed metrics and, when a loss is available,
            `<metric_key_prefix>_perplexity`.
        """
        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        self._max_length = max_length if max_length is not None else self.args.generation_max_length
        self._num_beams = num_beams if num_beams is not None else self.args.generation_num_beams

        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        output = eval_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        # Derive perplexity from the loss. The key follows `metric_key_prefix`
        # (a fixed "eval_loss" lookup fails for any other prefix) and is
        # skipped when the loop produced no loss at all.
        loss_key = f"{metric_key_prefix}_loss"
        if loss_key in output.metrics:
            try:
                perplexity = math.exp(output.metrics[loss_key])
            except OverflowError:
                # exp() overflows a float for very large losses; report infinity
                # rather than aborting the evaluation.
                perplexity = float("inf")
            output.metrics[f"{metric_key_prefix}_perplexity"] = round(perplexity, 4)

        self.log(output.metrics)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            # NOTE(review): `xm` and `met` are never imported in this notebook,
            # so this branch raises NameError if the TPU debug option is on.
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)

        self._memory_tracker.stop_and_update_metrics(output.metrics)

        return output.metrics
    
    
# Wire model, training arguments, tokenized splits, collator and metric
# function into the perplexity-logging trainer defined above.
trainer = PPLTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # adds BLEU and generated length to each evaluation
    )

Evaluate the pretrained model once as a baseline, then train/fine-tune the model

In [23]:
# Baseline: score the checkpoint on the validation set before any fine-tuning.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 22.351808547973633,
 'eval_bleu': 0.1885,
 'eval_gen_len': 2.38,
 'eval_runtime': 4.586,
 'eval_samples_per_second': 21.805,
 'eval_steps_per_second': 10.903,
 'eval_perplexity': 5096442315.0861}

In [24]:
# import gc
# gc.collect()
# torch.cuda.empty_cache() #to free up space
# if wb:
#   wandb.init(resume=True) #this is performed by the trainer
# Run fine-tuning; `resume_from_checkpoint=False` asks for a fresh run
# instead of continuing from a saved checkpoint.
trainer.train(resume_from_checkpoint=False)

The following columns in the training set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 10000
  Num Epochs = 15
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 75000


Step,Training Loss,Validation Loss,Bleu,Gen Len,Perplexity
500,6.3038,3.156665,2.7112,17.39,23.4921
1000,4.137,2.942436,3.2676,17.39,18.962
1500,3.9604,2.908991,4.1019,16.93,18.3383
2000,3.7345,2.828471,4.5707,16.98,16.9196
2500,3.6604,2.772385,5.0701,17.02,15.9967
3000,3.5885,2.70092,5.9849,17.28,14.8934
3500,3.5477,2.695486,4.8136,16.85,14.8127
4000,3.4855,2.663942,4.6077,17.11,14.3528
4500,3.4479,2.618433,5.7068,16.97,13.7142
5000,3.4409,2.592508,5.6734,17.42,13.3632


The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-500
Configuration saved in XLd-trans-fixed_padding/checkpoint-500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-500/spiece.model
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-1000
Conf

tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-5500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-5500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-5500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-4000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-6000
Configuration saved in XLd-trans-fixed_padding/checkpoint-6000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-6000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-6000/special_tokens_map.json
Copy voca

Copy vocab file to XLd-trans-fixed_padding/checkpoint-10500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-9000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-11000
Configuration saved in XLd-trans-fixed_padding/checkpoint-11000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-11000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-11000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-11000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-11000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-9500] due to args.save_total_limit
The following colum

The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-16000
Configuration saved in XLd-trans-fixed_padding/checkpoint-16000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-16000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-16000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-16000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-14500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****

***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-21000
Configuration saved in XLd-trans-fixed_padding/checkpoint-21000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-21000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-21000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-21000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-21000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-19500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-21500
Configuration saved in XLd-trans-fixed_padding/chec

Configuration saved in XLd-trans-fixed_padding/checkpoint-26000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-26000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-26000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-26000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-26000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-24500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-26500
Configuration saved in XLd-trans-fixed_padding/checkpoint-26500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-26500/pytorch_model.bin
tokenizer config file saved in

Model weights saved in XLd-trans-fixed_padding/checkpoint-31000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-31000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-31000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-31000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-30000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-31500
Configuration saved in XLd-trans-fixed_padding/checkpoint-31500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-31500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-31500/tokenizer_config.json
Special toke

tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-36000/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-36000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-36000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-35000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-36500
Configuration saved in XLd-trans-fixed_padding/checkpoint-36500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-36500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-36500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-36500/special_tokens_map.json


Special tokens file saved in XLd-trans-fixed_padding/checkpoint-41000/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-41000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-40000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-41500
Configuration saved in XLd-trans-fixed_padding/checkpoint-41500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-41500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-41500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-41500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-41500/spiece.model
Deleting older checkp

Copy vocab file to XLd-trans-fixed_padding/checkpoint-46000/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-45000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-46500
Configuration saved in XLd-trans-fixed_padding/checkpoint-46500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-46500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-46500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-46500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-46500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-45500] due to args.save_total_limit
The following col

Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-50000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-51500
Configuration saved in XLd-trans-fixed_padding/checkpoint-51500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-51500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-51500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-51500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-51500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-50500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5Fo

The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-56500
Configuration saved in XLd-trans-fixed_padding/checkpoint-56500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-56500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-56500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-56500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-56500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-55500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****

***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-61500
Configuration saved in XLd-trans-fixed_padding/checkpoint-61500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-61500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-61500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-61500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-61500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-60500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-62000
Configuration saved in XLd-trans-fixed_padding/chec

Configuration saved in XLd-trans-fixed_padding/checkpoint-66500/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-66500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-66500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-66500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-66500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-65500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-67000
Configuration saved in XLd-trans-fixed_padding/checkpoint-67000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-67000/pytorch_model.bin
tokenizer config file saved in

Model weights saved in XLd-trans-fixed_padding/checkpoint-71500/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-71500/tokenizer_config.json
Special tokens file saved in XLd-trans-fixed_padding/checkpoint-71500/special_tokens_map.json
Copy vocab file to XLd-trans-fixed_padding/checkpoint-71500/spiece.model
Deleting older checkpoint [XLd-trans-fixed_padding/checkpoint-70500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLd-trans-fixed_padding/checkpoint-72000
Configuration saved in XLd-trans-fixed_padding/checkpoint-72000/config.json
Model weights saved in XLd-trans-fixed_padding/checkpoint-72000/pytorch_model.bin
tokenizer config file saved in XLd-trans-fixed_padding/checkpoint-72000/tokenizer_config.json
Special toke

TrainOutput(global_step=75000, training_loss=2.2463020670572917, metrics={'train_runtime': 31825.6835, 'train_samples_per_second': 4.713, 'train_steps_per_second': 2.357, 'total_flos': 1.9828113408e+16, 'train_loss': 2.2463020670572917, 'epoch': 15.0})

In [25]:
# Evaluate once more after training. With `load_best_model_at_end=True` this
# presumably scores the best checkpoint rather than the last step -- TODO confirm.
trainer.evaluate()
if wb:
  wandb.finish()  # close the W&B run; its history/summary tables appear in the output

The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/bleu,▁▃▄▆▅▅▆▅▇▆▇█▇▇▇▇▆▇▇▆▇▆▇▇▆▆▇▇▆▆▇██▇█▇▇██▇
eval/gen_len,▇▂▁▆▄▄▆▇▆▇█▆▆▇▆▇▇▆▆▆▇▆▇▅▇▆▅▇▇▅▆▅▆▇▅▆▆▆▇▇
eval/loss,█▆▄▃▂▂▂▂▁▁▁▂▁▁▁▁▁▂▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁
eval/perplexity,█▅▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁
eval/runtime,▂▂▄▂▂▁▁▂▂▂▃▂▁▃▂▄▃▄▃▂▂▃▁▂█▂▁▂▄▄▂▄▃▃▃▃▄▂▂▃
eval/samples_per_second,▇▇▅▇▇█▇▇▇▇▅▇█▆▇▄▅▅▆▇▇▆█▇▁▇█▇▅▅▇▄▆▅▆▅▅▇▇▆
eval/steps_per_second,▇▇▅▇▇█▇▇▇▇▅▇█▆▇▄▅▅▆▇▇▆█▇▁▇█▇▅▅▇▄▆▅▆▅▅▇▇▆
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
eval/bleu,7.6405
eval/gen_len,17.4
eval/loss,2.42665
eval/perplexity,11.3209
eval/runtime,15.797
eval/samples_per_second,6.33
eval/steps_per_second,3.165
train/epoch,15.0
train/global_step,75000.0
train/learning_rate,0.0


## Model testing

Test model predictive capacity with an example

In [26]:
# Tokenization demos: `encode(...)` and `tokenizer(...).input_ids` print the
# same ids for the same text. Each assignment overwrites the previous one, so
# only the last sentence is actually sent to the model.
input_ids = tokenizer.encode(prefix + 'I enjoy walking with my cute dog', return_tensors='pt')
print(input_ids)

input_ids = tokenizer(prefix + 'I enjoy walking with my cute dog', return_tensors='pt').input_ids
print(input_ids)

input_ids = tokenizer(prefix + 'Ich gehe gern spazierien mit meinem süßen Hündchen', return_tensors='pt').input_ids
print(input_ids)

input_ids = tokenizer(prefix + "Die Professorin kann die Sache nicht betragen.", return_tensors='pt').input_ids
print(input_ids)
input_ids = input_ids.to(device)  # generation needs the ids on the model's device

# `min_length` controls generation, so it is passed to `generate`; it has no
# effect as an argument to `decode`.
greedy_output = model.generate(input_ids, min_length=5)
print("\nGreedy Output:")
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

# Beam search with 5 beams, returning the 3 best hypotheses.
outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
print("\n" + 100 * '-' + "\n\nBeam Output:")
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

tensor([[37194, 20567,   288,  5413,   267,   336,  9070,   259, 42822,   514,
          1037, 64712, 10990,     1]])
tensor([[37194, 20567,   288,  5413,   267,   336,  9070,   259, 42822,   514,
          1037, 64712, 10990,     1]])
tensor([[ 37194,  20567,    288,   5413,    267,   4824,  65941,    259,  69474,
         176055,  18156,    278,    749,    326,   2786,    259,    263,  71632,
            272,    447, 114328,   4573,      1]])
tensor([[37194, 20567,   288,  5413,   267,  1089, 32397,   348,  2504,   398,
         29671,   265,  1230,   390, 40481,   260,     1]])

Greedy Output:
The professor cannot carry out the matter.

----------------------------------------------------------------------------------------------------

Beam Output:
['The professor cannot carry out the matter.', 'The professor cannot carry out the case.', 'The professor cannot take the case.']


In [27]:
# wandb.finish()

Push Model to HF Model Hub

In [28]:
# trainer.push_to_hub()