Adapted from Suraj Patil's notebook on fine-tuning T5, see here: https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb

In [1]:
from transformers.models.t5.modeling_t5 import T5Model

ImportError: cannot import name '_T_co' from 'typing' (C:\Users\hster\AppData\Local\Programs\Python\Python37\lib\typing.py)

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    BartForConditionalGeneration,
    BartTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
class T5FineTuner(pl.LightningModule):
  """LightningModule wrapping a T5 conditional-generation model with an adapter named "imdb".

  `hparams` is a dict: it is indexed by key in `__init__` and merged into
  `self.hparams`, whose entries the other methods read as attributes
  (learning_rate, weight_decay, train_batch_size, ...).
  """

  def __init__(self, hparams):
    super().__init__()
    self.hparams.update(hparams)

    self.model = T5ForConditionalGeneration.from_pretrained(hparams["model_name_or_path"])
    self.adapter_name = "imdb"
    self.model.train()
    # Register the adapter and select it for training.
    self.model.add_adapter(self.adapter_name)
    self.model.train_adapter(self.adapter_name)

    self.tokenizer = T5Tokenizer.from_pretrained(hparams["tokenizer_name_or_path"])

  def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):
    """Forward all arguments unchanged to the wrapped model and return its output."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Run one forward pass on `batch` and return the first element of the model output (used as the loss).

    `batch` must provide "source_ids", "source_mask", "target_ids" and
    "target_mask".
    """
    # Work on a copy: masking pad positions in place would overwrite the
    # caller's batch["target_ids"] with -100.
    labels = batch["target_ids"].clone()
    labels[labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]
    return loss

  def training_step(self, batch, batch_idx):
    """Return the batch loss plus a "log" dict carrying it as "train_loss"."""
    loss = self._step(batch)
    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Return the batch loss under the key "val_loss"."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the "val_loss" entries of `outputs` and return them in the log/progress-bar dicts."""
    avg_loss = torch.stack([x["val_loss"].mean() for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Build the AdamW optimizer; the matching scheduler is created in `train_dataloader`.

    Parameters whose name contains "bias" or "LayerNorm.weight" are put in a
    group with weight decay 0.0; all others use `hparams.weight_decay`.
    """
    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    # Kept on the instance because train_dataloader() needs it to build the scheduler.
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure=None, lambda_closure=None, second_order_closure=None, using_native_amp=None, using_lbfgs=None, on_tpu=False):
    """Step the optimizer, clear its gradients, then advance `self.lr_scheduler`."""
    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the formatted average loss and the scheduler's last learning rate."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the training DataLoader and, as a side effect, the linear warmup scheduler.

    Reads `self.opt`, so `configure_optimizers` must have run before this
    method is called.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    print(f"retrieved {len(train_dataset)} examples for train")
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=1)
    # Batches per epoch across GPUs, divided by accumulation, times epochs.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    # NOTE(review): the message says "examples" but t_total is the step count computed above.
    print(f"Retrieved {t_total} examples for dataloader")
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation DataLoader from the "val" split."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=1)

In [None]:
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch with `seed`; also seed every CUDA device when CUDA is available."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Callback that logs `trainer.callback_metrics` after validation and logs/writes them after testing.

  The keys "log" and "progress_bar" are always skipped.
  """

  def on_validation_end(self, trainer, pl_module):
    """Log every metric, one `key = value` line per metric, in sorted key order."""
    logger.info("***** Validation results *****")
    results = trainer.callback_metrics
    for name in sorted(results):
      if name in ("log", "progress_bar"):
        continue
      logger.info("{} = {}\n".format(name, str(results[name])))

  def on_test_end(self, trainer, pl_module):
    """Log every metric and write the same lines to `<output_dir>/test_results.txt`."""
    logger.info("***** Test results *****")
    results = trainer.callback_metrics

    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for name in sorted(results):
        if name in ("log", "progress_bar"):
          continue
        entry = "{} = {}\n".format(name, str(results[name]))
        logger.info(entry)
        writer.write(entry)

In [None]:
# Default hyper-parameters; some entries are overridden further down before training.
args_dict = {
    "data_dir": 'aclImdb',  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_seq_length": 256,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 100,
    "train_batch_size": 16,
    "eval_batch_size": 8,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 1,
    "n_gpu": 1,
    "fp_16": False,  # if you want to enable 16-bit training then install apex and set this to true
    "opt_level": 'O1',  # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "seed": 42,
}

In [None]:
# Download the IMDB review archive.
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# Remove any previous extraction so the split cells below start from a fresh copy.
!rm -rf aclImdb
# Unpack the archive; the cells below read from ./aclImdb.
!tar -xvf aclImdb_v1.tar.gz
# Clear the verbose tar listing from the terminal output.
!clear

In [None]:
# glob returns paths in arbitrary, filesystem-dependent order. Sort them so the
# seeded random.shuffle that picks the validation files is reproducible.
train_pos_files = sorted(glob.glob('aclImdb/train/pos/*.txt'))
train_neg_files = sorted(glob.glob('aclImdb/train/neg/*.txt'))

In [None]:
# Display how many positive and negative training files were found.
len(train_pos_files), len(train_neg_files)

In [None]:
!mkdir aclImdb/val aclImdb/val/pos aclImdb/val/neg

In [None]:
# Shuffle both file lists in place, then take the first 1000 of each as the validation split.
for _file_list in (train_pos_files, train_neg_files):
  random.shuffle(_file_list)

val_pos_files, val_neg_files = train_pos_files[:1000], train_neg_files[:1000]

In [None]:
import shutil

# Move the selected validation files out of the training folders.
for _destination, _selected in (('aclImdb/val/pos', val_pos_files), ('aclImdb/val/neg', val_neg_files)):
  for f in _selected:
    shutil.move(f, _destination)

In [None]:
# Notebook-level tokenizer used by the inspection and evaluation cells below.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [None]:
class ImdbDataset(Dataset):
  """Sentiment dataset read from `<data_dir>/<type_path>/pos/*.txt` and `.../neg/*.txt`.

  Every file is read, cleaned and tokenized eagerly in the constructor.
  Inputs are padded/truncated to `max_len` tokens; the target is the word
  "positive" or "negative" tokenized to a fixed length of 2.
  """

  # Punctuation that is removed outright from each review.
  _REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
  # Double HTML line breaks, hyphens and slashes: each match becomes one space.
  _REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

  def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
    self.pos_file_path = os.path.join(data_dir, type_path, 'pos')
    self.neg_file_path = os.path.join(data_dir, type_path, 'neg')

    self.pos_files = glob.glob("%s/*.txt" % self.pos_file_path)
    self.neg_files = glob.glob("%s/*.txt" % self.neg_file_path)

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    """Return the number of tokenized examples."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return ids and attention masks of example `index` without the leading batch dimension."""
    # Each example was encoded as a batch of one, so drop only dimension 0.
    source_ids = self.inputs[index]["input_ids"].squeeze(0)
    target_ids = self.targets[index]["input_ids"].squeeze(0)

    src_mask = self.inputs[index]["attention_mask"].squeeze(0)
    target_mask = self.targets[index]["attention_mask"].squeeze(0)

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

  def _build(self):
    """Tokenize all positive files first, then all negative files."""
    self._buil_examples_from_files(self.pos_files, 'positive')
    self._buil_examples_from_files(self.neg_files, 'negative')

  def _buil_examples_from_files(self, files, sentiment):
    """Clean and tokenize every file in `files`, using `sentiment` as the target text."""
    for path in files:
      # Explicit encoding so reading does not depend on the platform default.
      with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

      line = self._REPLACE_NO_SPACE.sub("", text.strip())
      # Substitute a space (not ""), otherwise the words on either side of a
      # line break, hyphen or slash are fused into one token.
      line = self._REPLACE_WITH_SPACE.sub(" ", line)

      # tokenize inputs
      tokenized_inputs = self.tokenizer.batch_encode_plus(
        [line], max_length=self.max_len, padding="max_length", return_tensors="pt",
        truncation=True
      )

      # tokenize targets
      tokenized_targets = self.tokenizer.batch_encode_plus(
        [sentiment], max_length=2, padding="max_length", return_tensors="pt",
        truncation=True
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [None]:
# Sample review text used to try the tokenizer by hand; `b` holds the token ids returned by encode().
string = "This is the most energetic and entertaining ten minutes of film >Ive seen in a long time As a film student at NYU where this >short has been screened several times I salute Jim Cox for his >astute sense of style and pace for our generation Im sure >Ill see his name later on the big screen Hopefully this short >will find a market on TV or somewhere so this inspiring work >can get the wide distribution it des"
b = tokenizer.encode(string)

In [None]:
# Create the directory that is set as `output_dir` in the next cell (no error if it already exists).
!mkdir -p t5_imdb_sentiment

In [None]:
# Override the run-specific settings, then expose the hyper-parameters as attributes.
args_dict.update(data_dir='aclImdb', output_dir='t5_imdb_sentiment', num_train_epochs=2)
args = argparse.Namespace(**args_dict)

# Keep only the single checkpoint with the lowest monitored "val_loss".
# NOTE(review): no directory is passed here, so `output_dir` is not used by this callback; confirm where checkpoints land.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss", mode="min", save_top_k=1)

# Keyword arguments passed to pl.Trainer in a later cell.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
}

In [None]:
def get_dataset(tokenizer, type_path, args):
  """Build the ImdbDataset for split `type_path`, taking the data directory and max length from `args`."""
  dataset_kwargs = {
      "tokenizer": tokenizer,
      "data_dir": args.data_dir,
      "type_path": type_path,
      "max_len": args.max_seq_length,
  }
  return ImdbDataset(**dataset_kwargs)

In [None]:
# Build the module from the plain dict (T5FineTuner indexes hparams by key in __init__).
model = T5FineTuner(args_dict)

In [None]:
def get_trainable_params(model):
    """Print the name of every parameter of `model` whose `requires_grad` flag is set."""
    trainable_names = (
        param_name
        for param_name, param in model.named_parameters()
        if param.requires_grad
    )
    for param_name in trainable_names:
        print(param_name)
            
# Inspect which parameters are trainable in the second sub-layer of the first encoder block.
get_trainable_params(model.model.encoder.block[0].layer[1])

In [None]:
# Create the trainer from the settings assembled above and run training.
trainer = pl.Trainer(**train_params)
trainer.fit(model)

In [None]:
# Rebuild the training split with the notebook-level tokenizer and display its size.
train_dataset = get_dataset(tokenizer=tokenizer, type_path="train", args=model.hparams)
len(train_dataset)

In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [None]:
# Load the test split and pull a single shuffled batch of 32 for manual inspection.
# NOTE(review): max_len is 512 here, while args_dict sets max_seq_length=256 and the
# full evaluation below uses 256; confirm the difference is intended.
dataset = ImdbDataset(tokenizer, 'aclImdb', 'test',  max_len=512)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
it = iter(loader)
batch = next(it)
# Display the shape of the input-id tensor of the batch.
batch["source_ids"].shape

In [None]:
current_model = model.model.cpu()
# T5FineTuner.__init__ puts the model in train mode; switch to eval mode so
# dropout is disabled while generating predictions.
current_model.eval()

outs = current_model.generate(
    input_ids=batch['source_ids'].cpu(),
    attention_mask=batch['source_mask'].cpu(), 
    max_length=2
)

# Decode predictions, inputs and targets back to text for the printout below.
dec = [tokenizer.decode(ids) for ids in outs]

texts = [tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids) for ids in batch['target_ids']]

In [None]:
def strip_padding_tokens(input_string):
    """Remove every "<pad>" and then every "</s>" marker from `input_string` and trim surrounding whitespace."""
    cleaned = input_string
    for special_token in ("<pad>", "</s>"):
        cleaned = cleaned.replace(special_token, "")
    return cleaned.strip()

In [None]:
import textwrap

# Iterate over what was actually decoded rather than a hard-coded 32, so a
# batch with fewer examples cannot raise IndexError.
for text, target, prediction in zip(texts, targets, dec):
    lines = textwrap.wrap("Review:\n%s\n" % text, width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % target)
    print("Predicted sentiment: %s" % prediction)
    print("=====================================================================\n")

In [None]:
# Size of the current test dataset plus two counters.
# NOTE(review): `correct` / `incorrect` are not updated by any of the following cells; confirm they are still needed.
num_texts = len(dataset)
correct = 0
incorrect = 0

In [None]:
all_outputs = []
all_targets = []
current_model = model.model.cuda()
# T5FineTuner.__init__ puts the model in train mode; switch to eval mode so
# dropout is disabled during evaluation.
current_model.eval()

print(current_model.device)

dataset = ImdbDataset(tokenizer, 'aclImdb', 'test',  max_len=256)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# tqdm takes the total from len(loader), which includes a partial last batch;
# int(len(dataset)/32) undercounts by one when the size is not a multiple of 32.
for batch in tqdm(loader):
    outs = current_model.generate(
        input_ids=batch['source_ids'].cuda(),
        attention_mask=batch['source_mask'].cuda(), 
        max_length=2,
        num_beams=1
    )

    decs = [tokenizer.decode(ids) for ids in outs]
    targets = [tokenizer.decode(ids) for ids in batch['target_ids']]

    # Drop "<pad>" / "</s>" markers so predictions and labels compare as plain words.
    stripped_decs = [strip_padding_tokens(input_string) for input_string in decs]
    stripped_targets = [strip_padding_tokens(target_string) for target_string in targets]

    all_outputs.extend(stripped_decs)
    all_targets.extend(stripped_targets)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# accuracy_score takes (y_true, y_pred): pass the labels first, then the predictions.
accuracy_score(all_targets, all_outputs)