## **Model Fine-tuning** (Notebook sourced from translation notebook [here](https://huggingface.co/docs/transformers/notebooks))

Enable logging with Weights and Biases:

In [1]:
# import gc
# del model
# gc.collect()
# torch.cuda.empty_cache()

In [2]:
# Master switch for Weights & Biases logging: it gates the wandb login cell and
# selects the value passed to the Trainer's `report_to` argument ("wandb" or "none").
wb = True

In [3]:
import os

work_dir = os.getcwd()

# A working directory of '/content' is taken to mean a Google Colab session:
# mount Google Drive and switch into the project folder stored there.
if '/content' == work_dir:
    from google.colab import drive

    drive.mount('/content/drive')
    # Relative path, resolved against '/content' (the directory checked above).
    os.chdir('drive/MyDrive/github_repos/XLdefgen')

If running this on Colab, uncomment the following cell to install requisite packages.

In [4]:
# !pip install datasets transformers sacrebleu sentencepiece wandb
# !apt install git-lfs

In [5]:
# Only import and log in to Weights & Biases when logging is enabled via `wb`.
if wb:
  import wandb
  wandb.login()
  # IPython magic (not plain Python): sets the WANDB_PROJECT environment variable for this kernel.
  # NOTE(review): presumably read by the wandb integration to choose the project name - confirm.
  %env WANDB_PROJECT=XLdefgen

[34m[1mwandb[0m: Currently logged in as: [33mbrandonwilde[0m (use `wandb login --relogin` to force relogin)


env: WANDB_PROJECT=XLdefgen


If storing model on HF Model Hub, uncomment the following:

In [6]:
# from huggingface_hub import notebook_login
# notebook_login()

A script version of this notebook to fine-tune the model in a distributed fashion using multiple GPUs or TPUs is available [here](https://github.com/huggingface/transformers/tree/master/examples/seq2seq).

Specify model checkpoint to load (from HF Model Hub)


In [4]:
# Identifier of the pretrained checkpoint. It is passed to `from_pretrained` for both the
# tokenizer and the model, and is also inspected for "t5" / "mbart" when configuring the tokenizer.
model_checkpoint = "google/mt5-small"

'C:\\Users\\brand\\Documents\\Projects\\XLdefgen'

## Loading the dataset

In [147]:
### import datasets
from datasets import load_dataset, load_metric, Dataset
import csv
import torch
import numpy as np
import pandas as pd

# CSV file handed to csvDataset; a relative path, so it is resolved against the current working directory.
data_path = "codwoe_data.csv"

class csvDataset(Dataset):
    """Dataset backed by the rows of a CSV file.

    The file is read once with ``pandas.read_csv``. Every row is kept as a
    ``{column name: value}`` dict. Indexing with an integer or a slice always
    returns a *list* of row dicts (a one-element list for an integer index).
    """

    def __init__(self,file_name):
        """Load the CSV into memory.

        Args:
            file_name: Anything ``pandas.read_csv`` accepts (path or file-like object).
        """
        self.data_df = pd.read_csv(file_name)
        # Fixed: this previously read the undefined global ``data_df``.
        # Keys are the DataFrame index labels (0..n-1 with read_csv's default index).
        self.data_dict = self.data_df.to_dict(orient='index')

    def __len__(self):
        """Return the number of rows read from the CSV."""
        # Fixed: this previously returned ``len(self.y)``; ``self.y`` is never set.
        return len(self.data_dict)

    def __getitem__(self,idx):
        """Return the selected rows as a list of dicts.

        Args:
            idx: An integer (negative values count from the end) or a slice.

        Returns:
            list[dict]: One dict per selected row, in selection order.

        Raises:
            IndexError: If an integer index is out of range.
            TypeError: If ``idx`` is neither an integer nor a slice.
        """
        import numbers
        size = len(self)
        if isinstance(idx, numbers.Integral):  # single row
            position = int(idx)
            if position < 0:
                position += size  # sequence-style negative indexing
            if not 0 <= position < size:
                raise IndexError('{cls} index out of range'.format(cls=type(self).__name__))
            idx = [position]
        elif isinstance(idx, slice):
            # slice.indices() resolves None, negative and out-of-range bounds and negative steps.
            idx = list(range(*idx.indices(size)))
        else:  # invalid index type
            raise TypeError('{cls} indices must be integers or slices, not {idx}'.format(
                cls=type(self).__name__,
                idx=type(idx).__name__,
            ))

        return [self.data_dict[i] for i in idx]

codwoe_data = csvDataset(data_path)

# Later cells (preview, trimming, tokenization) read `raw_datasets`, so it must be defined
# here for the notebook to run top-to-bottom on a fresh kernel. This downloads WMT16 de-en on
# first use and reads it from the local Hugging Face cache afterwards.
# Alternative, if a copy was saved to disk earlier (needs `import datasets`):
# raw_datasets = datasets.load_from_disk("de-en_wmt16_tokd")
raw_datasets = load_dataset("wmt16", "de-en")

metric = load_metric("sacrebleu")

In [153]:
# Preview the first two rows of the CSV-backed dataset.
# A preceding `raw_datasets[...]` expression was dropped: its value was never displayed (only
# the last expression of a cell is shown) and it made this cell depend on `raw_datasets`.
codwoe_data[:2]

[{'word': 'insurrectionalism',
  'pos': 'noun',
  'gloss': 'The belief that insurrection is the best way to achieve a communist or socialist revolution .',
  'example': "Those already wary of the party 's electoral maneuvers in 1919 further resented the Socialists ' calls , after the May 1920 strike failure , to abandon revolutionary insurrectionalism and place all hopes in `` le bulletin rouge '' ."},
 {'word': 'ofay',
  'pos': 'noun',
  'gloss': 'A white person .',
  'example': 'You get outa my alley , Lucas -- and take that ofay with you , hear ?'}]

To get a sense of what the data looks like, the following function shows some examples picked randomly from the dataset.

In [148]:
# import datasets
# import random
# import pandas as pd
# from IPython.display import display, HTML

# def show_random_elements(dataset, num_examples=5):
#     assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
#     picks = []
#     for _ in range(num_examples):
#         pick = random.randint(0, len(dataset)-1)
#         while pick in picks:
#             pick = random.randint(0, len(dataset)-1)
#         picks.append(pick)
    
#     df = pd.DataFrame(dataset[picks])
#     for column, typ in dataset.features.items():
#         if isinstance(typ, datasets.ClassLabel):
#             df[column] = df[column].transform(lambda i: typ.names[i])
#     display(HTML(df.to_html()))

In [151]:
# show_random_elements(raw_datasets["train"])

Demonstration of the metric in use:

In [11]:
# Input format of the metric: predictions are plain strings, references are a list of
# reference strings per prediction.
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
# NOTE: the recorded output reports score 0.0 although the predictions match exactly. Its
# 'totals' show no 3-grams or 4-grams for these two-word sentences, which presumably zeroes
# the BLEU score - use longer sentences for a meaningful sanity check.
metric.compute(predictions=fake_preds, references=fake_labels)

{'score': 0.0,
 'counts': [4, 2, 0, 0],
 'totals': [4, 2, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

## Preprocessing the data

In [12]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to `model_checkpoint`, so inputs are encoded with the vocabulary the model expects.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Model-specific tokenizer adaptations

In [13]:
if "t5" in model_checkpoint:
    # T5-family checkpoints (including mT5) get a task prefix prepended to every input.
    prefix = "translate German to English: "
    print("Inputs will include prefix!")
else:
    prefix = ""
    print("Inputs will not include prefix!")

if "mbart" in model_checkpoint:
    # mBART language codes are written with underscores ("de_DE", not "de-DE"). The direction
    # must also match the German -> English task used by the prefix and by the
    # source_lang / target_lang settings of the preprocessing cell.
    tokenizer.src_lang = "de_DE"
    tokenizer.tgt_lang = "en_XX"

Inputs will include prefix!


Create preprocessing function

In [14]:
# Token limits applied (with truncation) to source and target sequences.
max_input_length = 128
max_target_length = 128
# Language keys looked up inside each "translation" record.
source_lang = "de"
target_lang = "en"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping whose "translation" entry is a sequence of
            records indexed by language key.

    Returns:
        The tokenizer output for the prefixed source texts, with the target
        token ids stored under "labels".
    """
    source_texts = []
    target_texts = []
    for pair in examples["translation"]:
        source_texts.append(prefix + pair[source_lang])
        target_texts.append(pair[target_lang])

    model_inputs = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

Specify whether reduced dataset should be passed to model

In [15]:
# When True, only a shuffled subset of each split is tokenized and handed to the Trainer.
trim_datasets = True
train_size = 10000  # training examples kept when trimming; also used to build the output directory name
eval_size = 100  # validation examples kept when trimming

Preprocess data

In [16]:
# `DatasetDict` is imported explicitly: the notebook only does `from datasets import ...`,
# so the bare module name `datasets` is not defined on a fresh kernel.
from datasets import DatasetDict

if trim_datasets:
  # Fixed seed, so the same subset is selected on every run.
  small_train_dataset = raw_datasets["train"].shuffle(seed=42).select(range(train_size))
  small_eval_dataset = raw_datasets["validation"].shuffle(seed=42).select(range(eval_size))
  raw_datasets_trim = DatasetDict({'train': small_train_dataset, 'validation': small_eval_dataset})
  tokenized_datasets = raw_datasets_trim.map(preprocess_function, batched=True)
  print("Datasets trimmed and tokenized.")
else:
  tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
  print("Raw datasets tokenized.")

# Drops the name to clear memory. Re-running this cell afterwards requires reloading `raw_datasets`.
del raw_datasets #to clear memory
# torch.cuda.empty_cache()

Loading cached shuffled indices for dataset at /home/wildeb1/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a/cache-4574fe47268cd3fd.arrow
Loading cached shuffled indices for dataset at /home/wildeb1/.cache/huggingface/datasets/wmt16/de-en/1.0.0/0d9fb3e814712c785176ad8cdb9f465fbe6479000ee6546725db30ad8a8b5f8a/cache-bf1487bfb5cd2cad.arrow


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Datasets trimmed and tokenized.


The results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to `map` has changed (in which case the cached data must not be reused). 🤗 Datasets warns you when it uses cached files, but you can pass `load_from_cache_file=False` in the call to `map` to ignore the cached files and force the preprocessing to be applied again.

## Fine-tuning the model

Now that our data is ready, we can download the pretrained model and fine-tune it. Since our task is of the sequence-to-sequence kind, we use the `AutoModelForSeq2SeqLM` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us.

In [17]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Load the pretrained seq2seq weights, then move the model onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)


# class MyTrainer(Trainer): #Subclass trainer to access 
#   def init(self, model,
#            args = None,
#            data_collator = None,
#            train_dataset = None,
#            eval_dataset = None,
#            tokenizer = None,
#            model_init = None,
#            compute_metrics = None,
#            callbacks = None,
#            optimizers = (None,None)
#            ):

# super().__init__(model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init,
#               compute_metrics, callbacks, optimizers) 

# def evaluate(
# self,
# train_dataset = None,
# eval_dataset: Optional[Dataset] = None,
# ignore_keys: Optional[List[str]] = None,
# metric_key_prefix: str = “eval”,
# ) → Dict[str, float]:


cuda:0


Specify batch size and training arguments

In [18]:
batch_size = 2
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Logging backend for the Trainer: Weights & Biases when `wb` is set, otherwise none.
report = "wandb" if wb else "none"

# Training-set size in thousands; only used in the output directory name.
train_k = int(train_size / 1000)

args = Seq2SeqTrainingArguments(
    # Earlier output-directory choices, kept for reference:
    #   f"drive/MyDrive/{model_name}-finetuned-{source_lang}-to-{target_lang}"
    #   f"XLdefgen-{source_lang}-to-{target_lang}"
    output_dir=f"XLdefgen-trans-{source_lang}-to-{target_lang}-train{train_k}k-bat{batch_size}",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,  # mixed precision (acceleration) - doesn't work well with t5 models
    # --- evaluation ---
    evaluation_strategy="steps",
    predict_with_generate=True,
    prediction_loss_only=False,  # True would save space by not storing predictions for metrics
    # --- checkpoints ---
    save_total_limit=2,  # max number of checkpoints to keep
    load_best_model_at_end=False,
    ignore_data_skip=False,  # if True and resuming, restart at the beginning of the dataset instead of where training left off
    # --- reporting / sharing ---
    report_to=report,  # data logging target
    push_to_hub=False,  # push to HF Model Hub
)

Add a data collator that pads inputs and labels to the length of the longest sequence in each batch

In [19]:
# Collator that pads inputs and labels batch by batch.
# NOTE(review): passing `model` presumably lets the collator prepare decoder inputs from the labels - confirm against the DataCollatorForSeq2Seq documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Post-processing and compute metrics

In [20]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings, one per prediction.

    Returns:
        tuple: ``(preds, labels)`` where ``preds`` is a list of stripped strings
        and ``labels`` is a list of single-element lists, each holding one
        stripped reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per prediction, wrapped in its own list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may be a tuple, in which case its first element is used.

    Returns:
        dict: ``{"bleu": ..., "gen_len": ...}``, both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore value, not a vocabulary id, so it cannot be decoded. Replace it
    # in the predictions as well as the labels: depending on the library version, predictions
    # gathered across batches may be padded with -100. Left in place it would break decoding
    # and be counted as a generated token below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}

    return result

# def compute_metrics(eval_pred):
#   '''Example for logging multiple metrics'''
#     metric1 = load_metric("precision")
#     metric2 = load_metric("recall")
    
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     precision = metric1.compute(predictions=predictions, references=labels)["precision"]
#     recall = metric2.compute(predictions=predictions, references=labels)["recall"]
#     return {"precision": precision, "recall": recall}

Instantiate Trainer

In [21]:
from typing import Optional, List, Dict
from torch.utils.data import Dataset
import time
import math
class MyTrainer(Seq2SeqTrainer):
  """Seq2SeqTrainer whose ``evaluate`` also returns perplexity, i.e. exp(evaluation loss).

  The perplexity is added to the dict returned by ``evaluate`` only. Author's note: this
  works for direct ``trainer.evaluate()`` calls and not for evaluations run inside
  ``trainer.train()``.
  """

  def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
        **gen_kwargs,
  ) -> Dict[str, float]:
    """Run the parent evaluation and add ``<prefix>_perplexity`` to its metrics.

    Args:
        eval_dataset: Optional dataset overriding the trainer's evaluation set.
        ignore_keys: Model output keys to ignore when gathering predictions.
        metric_key_prefix: Prefix of the metric names (default ``"eval"``).
        **gen_kwargs: Extra generation arguments (for example ``max_length``,
            ``num_beams``) forwarded unchanged to the parent ``evaluate``.

    Returns:
        The parent's metrics dict, plus the perplexity rounded to 4 decimals
        when a loss was reported.
    """
    output = super().evaluate(
        eval_dataset=eval_dataset,
        ignore_keys=ignore_keys,
        metric_key_prefix=metric_key_prefix,
        **gen_kwargs,
    )
    # Build the keys from the prefix instead of hard-coding "eval_", so a custom
    # `metric_key_prefix` no longer raises KeyError.
    loss_key = f"{metric_key_prefix}_loss"
    if loss_key in output:
      try:
        perplexity = math.exp(output[loss_key])
      except OverflowError:
        # exp() overflows a float for very large losses; report infinity instead of crashing.
        perplexity = float("inf")
      output[f"{metric_key_prefix}_perplexity"] = round(perplexity, 4)
    return output
  
# Wire the model, arguments, tokenized splits, collator and metric function into the trainer.
trainer = MyTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [22]:
# trainer = Seq2SeqTrainer(
#     model,
#     args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

# from typing import Optional, List, Dict
# from torch.utils.data import Dataset
# import time

# class MyTrainer(Seq2SeqTrainer):
  
#   def evaluate(
#         self,
#         eval_dataset: Optional[Dataset] = None,
#         ignore_keys: Optional[List[str]] = None,
#         metric_key_prefix: str = "eval",
#   ) -> Dict[str, float]:
#         """
#         Run evaluation and returns metrics.
#         The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
#         (pass it to the init `compute_metrics` argument).
#         You can also subclass and override this method to inject custom behavior.
#         Args:
#             eval_dataset (`Dataset`, *optional*):
#                 Pass a dataset if you wish to override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not
#                 accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
#                 method.
#             ignore_keys (`Lst[str]`, *optional*):
#                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
#                 gathering predictions.
#             metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
#                 An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
#                 "eval_bleu" if the prefix is "eval" (default)
#         Returns:
#             A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
#             dictionary also contains the epoch number which comes from the training state.
#         """
#         # memory metrics - must set up as early as possible
#         self._memory_tracker.start()

#         eval_dataloader = self.get_eval_dataloader(eval_dataset)
#         start_time = time.time()

#         eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
#         output = eval_loop(
#             eval_dataloader,
#             description="Evaluation",
#             # No point gathering the predictions if there are no metrics, otherwise we defer to
#             # self.args.prediction_loss_only
#             prediction_loss_only=True if self.compute_metrics is None else None,
#             ignore_keys=ignore_keys,
#             metric_key_prefix=metric_key_prefix,
#         )

#         print("output:",output)
#         print("output.metrics:", output.metrics)

#         total_batch_size = self.args.eval_batch_size * self.args.world_size
#         output.metrics.update(
#             speed_metrics(
#                 metric_key_prefix,
#                 start_time,
#                 num_samples=output.num_samples,
#                 num_steps=math.ceil(output.num_samples / total_batch_size),
#             )
#         )

#         self.log(output.metrics)

#         if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
#             # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
#             xm.master_print(met.metrics_report())

#         self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)

#         self._memory_tracker.stop_and_update_metrics(output.metrics)

#         return output.metrics

# trainer = MyTrainer(
#     model,
#     args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )


Train/fine-tune the model

In [23]:
# Evaluate before calling trainer.train(), giving a baseline to compare with the evaluation run after training.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 22.424962997436523,
 'eval_bleu': 0.1892,
 'eval_gen_len': 2.4,
 'eval_runtime': 4.61,
 'eval_samples_per_second': 21.692,
 'eval_steps_per_second': 10.846,
 'eval_perplexity': 5483245446.8795}

In [24]:
# import gc
# gc.collect()
# torch.cuda.empty_cache() #to free up space
# if wb:
#   wandb.init(resume=True)

import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the output directory when one exists; otherwise start
# a fresh run. A hard-coded `resume_from_checkpoint=True` raises when the output directory
# holds no checkpoint, which breaks the first run.
last_checkpoint = get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)
# if wb:
#   wandb.finish()

Loading model from XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5000).
The following columns in the training set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 10000
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10000
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 5000
  Will skip the first 1 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




Step,Training Loss,Validation Loss,Bleu,Gen Len
5500,4.3844,2.91707,2.1738,16.52
6000,4.2915,2.898247,2.0922,16.88
6500,4.2572,2.880438,2.5964,16.68
7000,4.2365,2.861647,2.2649,16.83
7500,4.1722,2.850832,2.0797,16.92
8000,4.2085,2.841267,1.8282,16.84
8500,4.1817,2.833421,1.8853,16.74
9000,4.2112,2.829205,2.3476,16.91
9500,4.1075,2.826155,2.3871,16.87
10000,4.1369,2.824935,2.6236,16.88


The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5500
Configuration saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5500/config.json
Model weights saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5500/tokenizer_config.json
Special tokens file saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5500/special_tokens_map.json
Copy vocab file to XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-5500/spiece.model
Deleting older checkpoint [XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-4500] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `MT5ForCondit

***** Running Evaluation *****
  Num examples = 100
  Batch size = 2
Saving model checkpoint to XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-10000
Configuration saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-10000/config.json
Model weights saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-10000/tokenizer_config.json
Special tokens file saved in XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-10000/special_tokens_map.json
Copy vocab file to XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-10000/spiece.model
Deleting older checkpoint [XLdefgen-trans-de-to-en-train10k-bat2/checkpoint-9000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10000, training_loss=2.1093772705078124, metrics={'train_runtime': 2082.3419, 'train_samples_per_second': 9.605, 'train_steps_per_second': 4.802, 'total_flos': 2101922535813120.0, 'train_loss': 2.1093772705078124, 'epoch': 2.0})

In [25]:
# Evaluate again after training; the returned dict includes the perplexity entry added by MyTrainer.evaluate.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


{'eval_loss': 2.824934720993042,
 'eval_bleu': 2.6236,
 'eval_gen_len': 16.88,
 'eval_runtime': 16.6286,
 'eval_samples_per_second': 6.014,
 'eval_steps_per_second': 3.007,
 'epoch': 2.0,
 'eval_perplexity': 16.8598}

## Model testing

Test model predictive capacity with an example

In [26]:
# Show how a few sample sentences are tokenized (the task prefix is prepended to each).
# The first sentence is encoded both ways to show that `tokenizer.encode(...)` and
# `tokenizer(...).input_ids` give the same ids.
sample_sentence = 'I enjoy walking with my cute dog'
print(tokenizer.encode(prefix + sample_sentence, return_tensors='pt'))
for sentence in (sample_sentence, 'Ich gehe gern mit meinem süßen Hund Gassi'):
    print(tokenizer(prefix + sentence, return_tensors='pt').input_ids)

# Sentence actually fed to the model; its ids must live on the same device as the model.
input_ids = tokenizer(prefix + "Ich wohne in der USA", return_tensors='pt').input_ids
print(input_ids)
input_ids = input_ids.to(device)

# `min_length` is a generation argument. It used to be passed to `tokenizer.decode`, where it
# had no effect, so it now goes to `generate`.
greedy_output = model.generate(input_ids, min_length=5)
print("\nGreedy Output:")
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

# Beam search with 5 beams, returning the 3 best hypotheses.
outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
print("\n" + 100 * '-' + "\n\nBeam Output:")
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

tensor([[37194, 20567,   288,  5413,   267,   336,  9070,   259, 42822,   514,
          1037, 64712, 10990,     1]])
tensor([[37194, 20567,   288,  5413,   267,   336,  9070,   259, 42822,   514,
          1037, 64712, 10990,     1]])
tensor([[37194, 20567,   288,  5413,   267,  4824, 65941,   259, 69474,   749,
           326,  2786,   259,   263, 71632,   272, 59498, 13651,   522,     1]])
tensor([[ 37194,  20567,    288,   5413,    267,   4824, 122540,    265,    281,
            442,   4208,      1]])

Greedy Output:
The USA is in USA.

----------------------------------------------------------------------------------------------------

Beam Output:
['It is in the USA.', 'It is in USA.', 'It is in the USA, in the USA.']


Push Model to HF Model Hub

In [27]:
# trainer.push_to_hub()