# T5 Fine-Tuning for Sentiment Analysis

This notebook fine-tunes T5 for sentiment analysis. The same approach is easy to adapt to other tasks, such as recommendation or summarization. See the paper for more details: [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf)

## References

* [Huggingface T5 documentation](https://huggingface.co/docs/transformers/v4.35.1/en/model_doc/t5#t5)
* [T5 fine-tuning](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb#scrollTo=SDVQ04fGRb1v)
* [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append(f'./src')

import os
import argparse
import glob
import random
import util as ut 
import model as ml
import shutil

import dataset as ds
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
    
from transformers import T5Tokenizer

import nltk
from bunch import Bunch

import torch
from torch.utils.data import DataLoader

## Setup

In [3]:
# Fix the random seed through the project helper so that runs are repeatable.
# NOTE(review): which generators get seeded is decided inside util.set_seed,
# which is not shown here — confirm it covers `random` (used for the split).
ut.set_seed(42)

In [4]:
# Fetch the NLTK "punkt" package; the call reports when it is already up to date.
# NOTE(review): presumably needed by the project modules under ./src — confirm.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/adrian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[3;92mTrue[0m

In [5]:
import warnings

# Silence warnings for the rest of the notebook: first every category
# (Warning is the default category of filterwarnings), then FutureWarning and
# UserWarning explicitly, registered in that order.
for _category in (Warning, FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_category)

In [6]:
# Allow CUDA matrix multiplications to use TF32.
# NOTE(review): this is expected to trade some float32 precision for speed on
# GPUs that support TF32 — confirm that is acceptable for this fine-tuning run.
torch.backends.cuda.matmul.allow_tf32 = True

## Helper Functions

In [7]:
def sets_counts(path):
    """
    Count the ``*.txt`` files of every split/label folder under
    ``{path}/dataset`` and print a one-line summary.

    The summary lists ``(positive, negative)`` pairs for the whole dataset and
    for the train, eval and test splits.

    Returns the positive-only counts as ``(train, eval, test)``.
    """
    def count(split, label):
        return len(glob.glob(f'{path}/dataset/{split}/{label}/*.txt'))

    splits = ('train', 'eval', 'test')
    pos = [count(split, 'pos') for split in splits]
    neg = [count(split, 'neg') for split in splits]

    # Pair the counts up per split: (pos, neg) for train, eval and test.
    train, evaluation, test = zip(pos, neg)
    overall = (sum(pos), sum(neg))

    print(f'All: {overall}, Train: {train}, Eval: {evaluation}, Test: {test}')

    return tuple(pos)



def download_dataset(
    path          = '.',
    train_percent = 0.7, 
    eval_percent  = 0.2, 
    test_percent  = 0.1
):
    if os.path.exists(f'{path}/dataset'):
        return sets_counts(path)

    !rm -rf {path}/aclImdb
    !rm -rf {path}/aclImdb_v1.tar.gz

    !cd {path}; wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

    !cd {path}; tar -xf aclImdb_v1.tar.gz
    
    !cd {path}; rm aclImdb_v1.tar.gz
        
    !cd {path}; mv aclImdb dataset
    
    !cd {path}/dataset; mv train all
    !cd {path}/dataset; mv test/pos/* all/pos/
    !cd {path}/dataset; mv test/neg/* all/neg/
    !cd {path}/dataset; rm -rf test

    pos_files = glob.glob(f'{path}/dataset/all/pos/*.txt')
    neg_files = glob.glob(f'{path}/dataset/all/neg/*.txt')
  
    random.shuffle(pos_files)
    random.shuffle(neg_files)

    ut.mkdir(f'{path}/dataset/train/pos')
    ut.mkdir(f'{path}/dataset/train/neg')
    
    ut.mkdir(f'{path}/dataset/eval/pos')
    ut.mkdir(f'{path}/dataset/eval/neg')
    
    ut.mkdir(f'{path}/dataset/test/pos')
    ut.mkdir(f'{path}/dataset/test/neg')
 
    train_size = int(len(pos_files) * train_percent)
    eval_size  = int(len(pos_files) * eval_percent)
    test_size  = int(len(pos_files) * test_percent)

    for f in pos_files[:train_size]: shutil.copy(f,  f'{path}/dataset/train/pos')
    for f in neg_files[:train_size]: shutil.copy(f,  f'{path}/dataset/train/neg')

    for f in pos_files[train_size:train_size+eval_size]: shutil.copy(f,  f'{path}/dataset/eval/pos')
    for f in neg_files[train_size:train_size+eval_size]: shutil.copy(f,  f'{path}/dataset/eval/neg')

    for f in pos_files[train_size+eval_size:train_size+eval_size+test_size]: shutil.copy(f,  f'{path}/dataset/test/pos')
    for f in neg_files[train_size+eval_size:train_size+eval_size+test_size]: shutil.copy(f,  f'{path}/dataset/test/neg')
    
    !cd {path}/dataset; rm -rf all

    return sets_counts(path)

In [8]:
def get_dataset(tokenizer, path, hyper_params):
    """
    Create a ``ds.ImdbReviewsDataset`` for the files under ``path``.

    ``hyper_params.max_seq_length`` is forwarded as the dataset's ``max_len``.
    """
    max_len = hyper_params.max_seq_length
    return ds.ImdbReviewsDataset(tokenizer, path, max_len=max_len)

## Build Dataset

In [9]:
# Build (or reuse) the on-disk dataset, keeping 7% / 3% / 1% of the files for
# train / eval / test. The returned values are the numbers of *positive*
# review files in each split (see sets_counts).
# NOTE: when ./dataset already exists, download_dataset returns early and the
# percentages below are ignored.
(train_size, eval_size, test_size) = download_dataset(
    train_percent = 0.07, 
    eval_percent  = 0.03, 
    test_percent  = 0.01
)

## Train Model

In [10]:
# Paths and hyper-parameters shared by the model, the data loader builder and
# the trainer created in the following cells.
params = Bunch(
    train_path                  = 'dataset/train',
    eval_path                   = 'dataset/eval',
    test_path                   = 'dataset/test',
    tokenizer_truncation        =  True,
    checkpoints_path            = 'checkpoints', 
    model_name_or_path          = 't5-base',
    tokenizer_name_or_path      = 't5-base',
    max_seq_length              = 512,
    lr                          = 0.0003,
    accelerator                 = 'gpu',
    # train_size is the number of positive training files returned by
    # download_dataset.
    # NOTE(review): Lightning is expected to read an int here as a number of
    # training batches between validation runs — confirm it stays within the
    # batches per epoch if the split or the batch size changes.
    val_check_interval          = train_size,
    train_batch_size            = 2,
    eval_batch_size             = 2,
    test_batch_size             = 32,
    epochs                      = 2,
    gradient_accumulation_steps = 16,
    # If you want to enable 16-bit training
    # then install apex and set this to true
    fp_16                       = False,
    # If you enable 16-bit training then 
    # set this to a sensible value, 0.5 is a good default
    gradient_clip_val           = 1.0
)

In [11]:
# Project T5 wrapper, configured from params.
model = ml.T5(params)


# Data loader builder: receives the model's tokenizer, the shared params and
# get_dataset as the dataset factory.
dl_builder = ml.T5DataLoaderBuilder(
    model.tokenizer,
    params,
    get_dataset
)

# Training module that ties the model to its data loaders.
fine_tunner = ml.T5FineTuner(model, dl_builder)

# Lightning trainer configured from params; 16-bit precision is used only when
# params.fp_16 is set.
trainer = pl.Trainer(
    accumulate_grad_batches  = params.gradient_accumulation_steps,
    accelerator              = params.accelerator,
    max_epochs               = params.epochs,
    precision                = 16 if params.fp_16 else 32,
    gradient_clip_val        = params.gradient_clip_val,
    val_check_interval       = params.val_check_interval,
    callbacks                = [
        # Checkpoints go to params.checkpoints_path, ranked by the lowest
        # 'val_loss' (top 5 kept).
        # NOTE(review): 'val_loss' is presumably logged by T5FineTuner — confirm.
        ModelCheckpoint(
            dirpath    = params.checkpoints_path,
            monitor    = 'val_loss',
            mode       = 'min',
            save_top_k = 5
        )
    ]
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
# Run the fine-tuning. No data loaders are passed here, so they presumably come
# from fine_tunner (which was built with dl_builder) — TODO confirm.
trainer.fit(fine_tunner)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type | Params
-------------------------------
0 | model | T5   | 222 M 
-------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=2` reached.


## Evaluate Model

In [13]:
# Take the first batch produced by the test data loader.
batch = next(iter(dl_builder.test()))

# NOTE(review): decode() presumably turns token ids back into strings and
# predict() presumably returns one generated label per sample — confirm
# against the project model module.
texts       = model.decode(batch['source_ids'])
predictions = model.predict(batch)
targets     = model.decode(batch['target_ids'])

In [14]:
import textwrap

# Print every review of the batch next to its actual and predicted sentiment.
# The "Review:" header is printed separately: textwrap replaces newlines with
# spaces, so a "\n" embedded in the wrapped text never produces a line break.
for text, target, prediction in zip(texts, targets, predictions):
    print("Review:")
    print(textwrap.fill(text, width=80))
    print(f"\nActual sentiment: {target}")
    print(f"Predicted sentiment: {prediction}")
    print("================================================================================\n")