SOURCE: https://colab.research.google.com/drive/1syXmhEQ5s7C59zU8RtHVru0wAvMXTSQ8#scrollTo=HS8mNXq6bdxq

# ByT5 fine-tuning

This notebook is derived from Suraj Patil's T5 fine-tuning notebook
https://github.com/patil-suraj/exploring-T5

The main differences are:
- switching to ByT5
- updating PyTorch Lightning and Transformers (+ changes to avoid naming conflict, tokenization issues with newer versions)
- including SentencePiece and Datasets
- loading the text from HuggingFace's Datasets

In [1]:
%%capture
! pip install transformers pytorch_lightning sentencepiece datasets

## T5 fine-tuning

This notebook is to showcase how to fine-tune [T5 model](https://arxiv.org/abs/1910.10683) with Huggigface's [Transformers](https://github.com/huggingface/transformers/) to solve different NLP tasks using text-2-text approach proposed in the T5 paper. For demo I chose 3 non text-2-text problems just to reiterate the fact from the paper that how widely applicable this text-2-text framework is and how it can be used for different tasks without changing the model at all.


In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from torch.optim import AdamW
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

ModuleNotFoundError: No module named 'pytorch_lightning'

In [None]:
import os
import shutil
import json
import time
import re
import random
import pandas as pd
import numpy as np
from tqdm import tqdm

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [95]:
from transformers import AutoTokenizer


class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    #self.save_hyperparameters()
    self.myparams = hparams

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = AutoTokenizer.from_pretrained(hparams.tokenizer_name_or_path)
    self.val_outputs = []


  def is_logger(self):
    return True  # self.trainer.proc_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def on_train_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    # {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
    return None

  def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        # Collect validation outputs manually
        self.val_outputs.append(loss)
        return {"val_loss": loss}

  def on_validation_epoch_end(self):
      # Calculate the average validation loss after all steps
      avg_loss = torch.stack(self.val_outputs).mean()
      tensorboard_logs = {"val_loss": avg_loss}
      self.val_outputs.clear()
      return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.myparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.myparams.learning_rate, eps=self.myparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  # handled automatically by PyTorch Lightning
  # def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
  #   if self.trainer.use_tpu:
  #     xm.optimizer_step(optimizer)
  #   else:
  #     optimizer.step()
  #   optimizer.zero_grad()
  #   self.lr_scheduler.step()

  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.myparams)
    dataloader = DataLoader(train_dataset, batch_size=self.myparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.myparams.train_batch_size * max(1, self.myparams.n_gpu)))
        // self.myparams.gradient_accumulation_steps
        * float(self.myparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.myparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="test", args=self.myparams)
    return DataLoader(val_dataset, batch_size=self.myparams.eval_batch_size, num_workers=4)

In [69]:
args_dict = dict(
    output_dir="", # path to save the checkpoints
    model_name_or_path='google/byt5-small',
    tokenizer_name_or_path='google/byt5-small',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False,
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

###  Data

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [88]:
import re
import pandas as pd
from torch.utils.data import Dataset
from random import random

class CustomTextDataset(Dataset):
    def __init__(self, tokenizer, dataframe):
        # Shuffle the dataset (like .shuffle() in the original)
        self.df = dataframe.sample(frac=1).reset_index(drop=True)
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()
        target_mask = self.targets[index]["attention_mask"].squeeze()

        return {
            "source_ids": source_ids,
            "source_mask": src_mask,
            "target_ids": target_ids,
            "target_mask": target_mask
        }

    def _build(self):
        text_max_length = self.df['text'].apply(len).max()
        label_max_length = self.df['label'].apply(len).max()

        for _, row in self.df.iterrows():
            input_text = str(row["text"]).strip()
            target_text = str(row["label"]).strip()

            # Tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_text], max_length=text_max_length, padding='max_length', truncation=True, return_tensors="pt"
            )
            # Tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target_text], max_length=label_max_length, padding='max_length', truncation=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)


In [None]:
UKRAINIAN_LETTERS = 'абвгґдеєжзиіїйклмнопрстуфхцчшщьюя'
UKRAINAIN_VOWELS = 'аеєиіїоуюя'
ENGLISH_LETTERS = 'abcdefghijklmnopqrstuvwxyz'

voa_df = pd.read_csv('./voa_stressed_cleaned_data.csv')
unique_letters = set(''.join(voa_df['text'].to_list()))

unique_letters = unique_letters - set(UKRAINIAN_LETTERS) \
                                - set(UKRAINIAN_LETTERS.upper()) \
                                - set(allowed_punctuation) \
                                - set(other_punctuation)

voa_df = voa_df[~voa_df['text'].apply(lambda x: any(c in unique_letters for c in x))]
voa_df = voa_df[['text', 'labels']]
voa_df = voa_df.rename(columns={
    'labels': 'label'
})
voa_df.shape

In [55]:
dataset = CustomTextDataset(tokenizer, voa_df)

64029it [01:34, 679.85it/s]


### Train

In [59]:
!mkdir -p t5

In [76]:
args_dict.update({'output_dir': 't5', 'num_train_epochs': 1, 'vocab_file': 'tokenizer_config.json'})
args = argparse.Namespace(**args_dict)

# add this in?:
# from pytorch_lightning.callbacks.early_stopping import EarlyStopping
# callbacks=[EarlyStopping(monitor='val_loss')]

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    # gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # early_stop_callback=False,
    precision= 16,
    # amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
)

In [86]:
def get_dataset(tokenizer, type_path, args):
  return CustomTextDataset(tokenizer, voa_df)

In [96]:
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 299 M  | eval
------------------------------------------------------------
299 M     Trainable params
0         Non-trainable params
299 M     Total params
1,198.551 Total estimated model params size (MB)
0         Modules in train mode
349       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

In [None]:
!mkdir t5_base

In [None]:
## save the model this way so next time you can load it using T5ForConditionalGeneration.from_pretrained
model.model.save_pretrained('t5_base')
model.model.eval()