## Install required libraries (only in Google Colab!)

- Otherwise libraries should be installed within the virtual environment before running this notebook.
- In Jupyter-Notebook install requirements_nlg.txt into conda environment

In [1]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install pytorch_lightning
!pip install rouge_score
!pip install GPUtil
!pip install wandb

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 2.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 17.8MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

## Imports

In [1]:
import argparse
import glob
import os
from pathlib import Path
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

#import nltk
#nltk.download('punkt')
#from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader, Subset
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from datasets import load_dataset, load_metric
import datasets
import wandb
import GPUtil


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

# For more effective memory usage
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

## Configure Google Drive mount

- Skip this if runnng locally

In [3]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/t5-fine-tuning

Mounted at /content/drive
/content/drive/MyDrive/t5-fine-tuning


## Resolve current directory

In [2]:
try:
    FILE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    print('__file__ does not exist for notebook, use current directory instead')
    FILE_DIR = Path().resolve()
    #FILE_DIR = "local/aallen/runs/"
    
print(f'current directory is: {FILE_DIR}')

__file__ does not exist for notebook, use current directory instead
current directory is: /local/aallen/runs


## Create a torch dataset for CNN / Daily Mail data

In [3]:
class cnn_dailymail(Dataset):
    def __init__(self, tokenizer, type_path, input_length, output_length, test_run=False):
        raw_dataset =  load_dataset('cnn_dailymail', '2.0.0', cache_dir=os.path.join(FILE_DIR, 'dataset'))
        self.dataset = raw_dataset[type_path]

        self.input_length = input_length
        self.tokenizer = tokenizer
        self.output_length = output_length
        self.article_min_len = 150
        self.summary_min_len = 15

        self.clean_examples = self.filter_data()
        self.dataset = Subset(self.dataset, self.clean_examples)

        if test_run == True:
          self.dataset = Subset(self.dataset, np.arange(1000))
    
    def filter_data(self):
        clean_examples = []
        for i, e in enumerate(self.dataset):
          if len(e['article'].split(" ")) >= self.article_min_len \
           and len(e['article'].split(" ")) <= self.input_length \
           and len(e['highlights'].split(" ")) >= self.summary_min_len \
           and len(e['highlights'].split(" ")) <= self.output_length:
            clean_examples.append(i)
          else:
            continue
        return clean_examples

      
    def __len__(self):
        #return self.dataset.shape[0]
        return len(self.dataset)
    
    def clean_text(self, text):
        text = text.replace('\n','')
        text = text.replace('``', '')
        text = text.replace('"', '')
        
        return text
    
    
    def convert_to_features(self, example_batch):
        # Tokenize contexts and questions (as pairs of inputs)
        
        input_ = self.clean_text(example_batch['article'])
        target_ = self.clean_text(example_batch['highlights'])
        
        source = self.tokenizer.batch_encode_plus([input_], max_length=self.input_length, 
                                                     padding='max_length', truncation=True, return_tensors="pt")
        
        targets = self.tokenizer.batch_encode_plus([target_], max_length=self.output_length, 
                                                     padding='max_length', truncation=True, return_tensors="pt")
    
       
        return source, targets
  
    def __getitem__(self, index):
        source, targets = self.convert_to_features(self.dataset[index])
        
        source_ids = source["input_ids"].squeeze()
        target_ids = targets["input_ids"].squeeze()

        src_mask    = source["attention_mask"].squeeze()
        target_mask = targets["attention_mask"].squeeze()

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

## Initialize train/val/test datasets

In [4]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

train_dataset = cnn_dailymail(tokenizer, 'train', 1024, 250)
val_dataset = cnn_dailymail(tokenizer, 'validation', 1024, 250)
test_dataset = cnn_dailymail(tokenizer, 'test', 1024, 250) #, test_run=True

Reusing dataset cnn_dailymail (/local/aallen/runs/dataset/cnn_dailymail/2.0.0/2.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset cnn_dailymail (/local/aallen/runs/dataset/cnn_dailymail/2.0.0/2.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset cnn_dailymail (/local/aallen/runs/dataset/cnn_dailymail/2.0.0/2.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


## Cofigure Model

## Initialize random seed

In [5]:
pl.seed_everything(hash("setting random seeds") % 2**32 - 1)

Global seed set to 2503961154


2503961154

#### Setup Logger

In [6]:
logger_name = 'wandb'
#logger_name = 'tensorboard'

logging=True

if logger_name == 'wandb':
  logger=WandbLogger(save_dir=os.path.join(FILE_DIR, 'wandb_logs'), project="t5-fine-tuning")
elif logger_name == 'tensorboard':
  logger=TensorBoardLogger(os.path.join(FILE_DIR, 'tb_logs'), name='t5_finetuner')
else:
  logging=False

In [7]:
logger_name

'wandb'

#### Run only if using wandb as a logger

In [7]:
#!wandb login
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33maxel_a[0m (use `wandb login --relogin` to force relogin)


True

#### Hyperparameters

In [8]:
args_dict = dict(
    output_dir=os.path.join(FILE_DIR, 'model_outputs'), # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=1024,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=1,
    gradient_accumulation_steps=1,
    n_gpu=4,
    max_grad_norm=1.0,
    freeze_embeds=False,
    freeze_encoder=True,
    logging=logging
)

args = argparse.Namespace(**args_dict)

## Configure Pytorch Lightning t5 Model

In [9]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.save_hyperparameters(hparams)
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
        self.rouge_metric = load_metric('rouge', keep_in_memory=True) 
        
        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            self.freeze_params(self.model.get_encoder())
        
    
    def freeze_params(self, model):
        for par in model.parameters():
            par.requires_grad = False
            
            
    def freeze_embeds(self):
        """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
        try:
            self.freeze_params(self.model.model.shared)
            for d in [self.model.model.encoder, self.model.model.decoder]:
                freeze_params(d.embed_positions)
                freeze_params(d.embed_tokens)
        except AttributeError:
            self.freeze_params(self.model.shared)
            for d in [self.model.encoder, self.model.decoder]:
                self.freeze_params(d.embed_tokens)
    
    def lmap(self, f, x):
        """list(map(f, x))"""
        return list(map(f, x))
    
    
    def is_logger(self):
        return self.trainer.proc_rank <= 0
    
    def parse_score(self, result):
        return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
        
    def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
    )

    def _step(self, batch):
        labels = batch["target_ids"]
        labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]

        return loss
    
    
    def ids_to_clean_text(self, generated_ids):
        gen_text = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return self.lmap(str.strip, gen_text)
    
    
    def _generative_step(self, batch) :
        
        t0 = time.time()
        
        generated_ids = self.model.generate(
            batch["source_ids"],
            attention_mask=batch["source_mask"],
            use_cache=True,
            decoder_attention_mask=batch['target_mask'],
            max_length=150, 
            num_beams=2,
            repetition_penalty=2.5, 
            length_penalty=1.0, 
            early_stopping=True
        )
        preds = self.ids_to_clean_text(generated_ids)
        target = self.ids_to_clean_text(batch["target_ids"])
            
        gen_time = (time.time() - t0) / batch["source_ids"].shape[0]  
    
        loss = self._step(batch)
        base_metrics = {'val_loss': loss}
        summ_len = np.mean(self.lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target)

        self.rouge_metric.add_batch(predictions=preds, references=target)
        
        return base_metrics
    

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        if self.hparams.logging:
          self.log('train_loss', loss)
        return loss
  
    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
       
        if self.hparams.logging:
          self.log('avg_train_loss', avg_train_loss, on_epoch=True, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        return self._generative_step(batch)
    
  
    def validation_epoch_end(self, outputs):
        
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        
        if self.hparams.logging:
            if self.hparams.n_gpu > 1:
                self.log('val_loss', avg_loss, on_epoch=True, prog_bar=True, sync_dist=True)
            else:
                self.log('val_loss', avg_loss, on_epoch=True, prog_bar=True)
        
        rouge_results = self.rouge_metric.compute() 
        rouge_dict = self.parse_score(rouge_results)
    
        if self.hparams.logging:
            if self.hparams.n_gpu > 1:
                self.log_dict(rouge_dict, sync_dist=True)
            else:
                self.log_dict(rouge_dict)
        
        ## Clear out the lists for next epoch
        self.target_gen= []
        self.prediction_gen=[]
        return {"avg_val_loss": avg_loss, 
                "rouge1" : rouge_results['rouge1'],
                "rougeL" : rouge_results['rougeL']}

    def test_step(self, batch, batch_idx):
        return self._generative_step(batch)
    
  
    def test_epoch_end(self, outputs):
        
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
     
        if self.hparams.logging:
            if self.hparams.n_gpu > 1:
                self.log('test_loss', avg_loss, on_epoch=True, prog_bar=True, sync_dist=True)
            else:
                self.log('test_loss', avg_loss, on_epoch=True, prog_bar=True)
        
        rouge_results = self.rouge_metric.compute() 
        rouge_dict = self.parse_score(rouge_results)
    
        if self.hparams.logging:
            if self.hparams.n_gpu > 1:
                self.log_dict(rouge_dict, sync_dist=True)
            else:
                self.log_dict(rouge_dict)
        
        ## Clear out the lists for next epoch
        self.target_gen= []
        self.prediction_gen=[]
        return {"avg_val_loss": avg_loss, 
                "rouge1" : rouge_results['rouge1'],
                "rougeL" : rouge_results['rougeL']}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]
    
    def optimizer_step(self,
                     epoch=None,
                     batch_idx=None,
                     optimizer=None,
                     optimizer_idx=None,
                     optimizer_closure=None,
                     on_tpu=None,
                     using_native_amp=None,
                     using_lbfgs=None):

      optimizer.step(closure=optimizer_closure)
      optimizer.zero_grad()
      self.lr_scheduler.step()
  
    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict
    
    

    def train_dataloader(self):   
        #train_dataset =  cnn_dailymail(tokenizer=self.tokenizer, type_path="train", input_length=512, output_length=250, test_run=True)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=self.hparams.n_gpu)
        if self.hparams.n_gpu > 1:
             t_total = ((len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                        // self.hparams.gradient_accumulation_steps
                        * float(self.hparams.num_train_epochs))
        else:
            t_total = ((len(dataloader.dataset) // (self.hparams.train_batch_size * 1)) 
                       // self.hparams.gradient_accumulation_steps
                       * float(self.hparams.num_train_epochs))
        
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total)
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        #val_dataset =  cnn_dailymail(tokenizer=self.tokenizer, type_path="validation", input_length=512, output_length=250, test_run=True)       
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=self.hparams.n_gpu)
    
    
    def test_dataloader(self):
        #test_dataset =  cnn_dailymail(tokenizer=self.tokenizer, type_path="test", input_length=512, output_length=250, test_run=True)       
        return DataLoader(test_dataset, batch_size=self.hparams.eval_batch_size, num_workers=self.hparams.n_gpu)

## Define trainer hyperparameters

In [10]:
# Do we want to save the checkpoint, toggle False if not
save_checkpoint=True

if save_checkpoint:
  custom_checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, filename='freeze_encoder_epoch=2', monitor="val_loss", mode="min", save_top_k=3
  )
  checkpoint_callback=True
else:
  checkpoint_callback=False

## If resuming from checkpoint, add an arg resume_from_checkpoint
train_params = dict(
    default_root_dir=args.output_dir,
    accumulate_grad_batches=args.gradient_accumulation_steps,
    max_epochs=args.num_train_epochs,
    gradient_clip_val=args.max_grad_norm,
    gpus=args.n_gpu,
    checkpoint_callback=checkpoint_callback,
    logger=logger,
    log_gpu_memory='all',
    callbacks=[custom_checkpoint_callback],
    accelerator='dp',
    auto_scale_batch_size='binsearch'
)



## Initialize Model and Trainer

In [11]:
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params) # fast_dev_run=True --> for a quick test run

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [12]:
ckpt_path = os.path.join(args.output_dir, 'freeze_encoder_epoch=1.ckpt')
checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

## Train Model

In [13]:
# you can put '%pdb on' on top of this cell for debugging
trainer.fit(model)
GPUtil.showUtilization()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
113 M     Trainable params
109 M     Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  value = torch.tensor(value, device=device, dtype=torch.float)
Global seed set to 2503961154


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Test Model

In [14]:
# argument: ckpt_path='path/to/checkpoint' --> for loading a custom checkpoint, automatically loads best checkpoint if available
# example: ckpt_path='./model_outputs/epoch=0-step=249.ckpt'
trainer.test(ckpt_path=os.path.join(args.output_dir, 'freeze_encoder_epoch=2.ckpt'))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

  value = torch.tensor(value, device=device, dtype=torch.float)



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'rouge1': 41.33530044555664,
 'rouge2': 19.56999969482422,
 'rougeL': 29.417699813842773,
 'rougeLsum': 29.408300399780273,
 'test_loss': 1.403151035308838}
--------------------------------------------------------------------------------


[{'test_loss': 1.403151035308838,
  'rouge1': 41.33530044555664,
  'rouge2': 19.56999969482422,
  'rougeL': 29.417699813842773,
  'rougeLsum': 29.408300399780273}]

## Run only if using tensorboard as logger

In [None]:
#%load_ext tensorboard
%tensorboard --logdir local/aallen/runs/tb_logs

## Run only if using wandb as logger

In [15]:
# stops logging experiments
wandb.finish()



VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…





0,1
train_loss,2.1323
gpu_id: 0/memory.used (MB),3771.0
gpu_id: 1/memory.used (MB),2441.0
gpu_id: 2/memory.used (MB),2441.0
gpu_id: 3/memory.used (MB),2441.0
epoch,0.0
trainer/global_step,29714.0
_runtime,87299.0
_timestamp,1623850596.0
_step,596.0


0,1
train_loss,▄▃▃▅▃▃▄█▄▅▄▃▄▁▃▄▄▂▃▅▅▄▄▃▄▄▄▄▅▄▃▅▃▄▄▃▂▄▄▃
gpu_id: 0/memory.used (MB),▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇██
gpu_id: 1/memory.used (MB),▁▁▁▁▁▁▁▁████████████████████████████████
gpu_id: 2/memory.used (MB),▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅████████████
gpu_id: 3/memory.used (MB),▁▁▃▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆██████████
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
