# Dependencies and constants

In [None]:
! pip install datasets transformers

from tqdm.notebook import tqdm
from IPython import display

import numpy as np
import pandas as pd
import math

from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn

from datasets import load_dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, DataCollatorForSeq2Seq

In [17]:
# Checkpoint name loaded for both the tokenizer and the model.
BASE_MODEL_NAME = 't5-small'

# Batch size for the data loaders, AdamW learning rate, and number of
# train/validate epochs.
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
EPOCHS = 5
# LoRA hyper-parameters, passed to LoraConfig as `r` and `lora_alpha`.
RANK = 8
ALPHA = 4.0

# Dataset

In [None]:
# Load the IMDB dataset and remove its 'unsupervised' split; the 'train' and
# 'test' splits are the ones used further down.
dataset = load_dataset('imdb')
dataset.pop('unsupervised')

In [5]:
def id2label(ids):
    """Translate integer class ids into sentiment names.

    Index 0 maps to 'negative' and index 1 to 'positive'. The ids are used
    as plain sequence indices, so an id outside the two-element range raises
    IndexError.

    Args:
        ids: Iterable of integer class ids.

    Returns:
        A list with one label name per id, in input order.
    """
    names = ['negative', 'positive']
    return list(map(names.__getitem__, ids))

def label2id(labels):
    """Translate sentiment names into integer class ids.

    'negative' maps to 0 and 'positive' to 1. Any other string maps to 2,
    which acts as an "unrecognised" class.

    Args:
        labels: Iterable of label strings.

    Returns:
        A list with one integer id per label, in input order.
    """
    known = dict(negative=0, positive=1)
    unknown_id = 2
    return [known.get(name, unknown_id) for name in labels]

In [None]:
# Tokenizer for the base checkpoint; used below for the review text, the
# label strings, and for decoding generated ids.
tokenizer = T5TokenizerFast.from_pretrained(BASE_MODEL_NAME)

In [None]:
def preprocess_input(text):
    """Normalise one review: lower-case it, then turn '<br />' tags into spaces.

    Lower-casing happens first, so upper-case variants of the tag are
    replaced as well.
    """
    return text.lower().replace('<br />', ' ')

def map_function(row):
    """Tokenize one batch of examples for seq2seq training.

    The review texts are cleaned with ``preprocess_input`` and tokenized with
    truncation at 256 tokens. The integer labels are turned into their
    names with ``id2label`` and tokenized as the target sequence.

    Args:
        row: Batch mapping with a 'text' list and a 'label' list.

    Returns:
        A dict holding the tokenizer's outputs for the inputs plus a
        'labels' entry with the token ids of the label names.
    """
    cleaned_texts = list(map(preprocess_input, row['text']))
    encoded_inputs = tokenizer(cleaned_texts, truncation=True, max_length=256)

    label_names = id2label(row['label'])
    encoded_targets = tokenizer(label_names)

    return {**encoded_inputs, 'labels': encoded_targets.input_ids}


# Tokenize every split in batches, then expose only the three model input
# columns, formatted as torch tensors.
dataset = dataset.map(map_function, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [13]:
# Collator that pads each batch to its longest sequence and returns
# PyTorch tensors.
col_fn = DataCollatorForSeq2Seq(
    tokenizer, return_tensors='pt', padding='longest',
)

# Training batches, reshuffled on every pass over the data.
train_loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=BATCH_SIZE,
    collate_fn=col_fn,
    shuffle=True
)

# Evaluation batches. Accuracy does not depend on example order, so the
# order is kept fixed: this makes batch composition reproducible between
# evaluation passes and avoids a pointless reshuffle.
test_loader = torch.utils.data.DataLoader(
    dataset['test'],
    batch_size=BATCH_SIZE,
    collate_fn=col_fn,
    shuffle=False,
)

# model and training loop

In [None]:
# Load the pretrained base checkpoint as a conditional-generation model.
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)

In [9]:
# AdamW over the parameters of the model object that `model` refers to at
# this point.
# NOTE(review): the optimizer keeps references to these exact parameter
# tensors; rebinding `model` to a new object later does not update it.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Metric callable handed to valid_loop, which calls it with y_true= / y_pred=.
compute_metrics = accuracy_score

In [None]:
!pip install peft

In [11]:
def train_loop(model, loader, optimizer):
    """Run one training pass over ``loader`` and report the mean batch loss.

    For every batch the gradients are cleared, the batch is moved to the
    model's device, the model's own ``loss`` output is back-propagated and
    the optimizer takes one step.

    Args:
        model: Model whose forward output exposes a ``loss`` attribute.
        loader: Iterable of batches that support ``.to(device)`` and
            ``**`` unpacking into the model call.
        optimizer: Optimizer to step after each backward pass.

    Returns:
        ``{'train_loss': <mean of the per-batch loss values>}``.
    """
    model.train()
    losses = []

    for batch in tqdm(loader, desc='Training:'):
        optimizer.zero_grad()

        batch = batch.to(model.device)
        loss = model(**batch).loss

        # Record the scalar before the graph is consumed by backward().
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    return {'train_loss': np.mean(losses)}

def _predict(model, row):
    return model.generate(
        input_ids=row.input_ids,
        attention_mask=row.attention_mask,
        max_length=5
    )

def tokenizer_ids_to_label(all_input_ids):
    """Decode a batch of token-id sequences into strings.

    Special tokens are skipped during decoding.

    Args:
        all_input_ids: Batch of token-id sequences.

    Returns:
        The decoded strings, as returned by ``tokenizer.batch_decode``.
    """
    decoded_texts = tokenizer.batch_decode(
        all_input_ids,
        skip_special_tokens=True,
    )
    return decoded_texts

def valid_loop(model, loader, compute_metrics):
    """Evaluate ``model`` on ``loader`` by generating and decoding labels.

    Each batch is moved to the model's device and predictions are generated
    with ``_predict``. Reference and predicted token ids are decoded to text
    with ``tokenizer_ids_to_label`` and mapped to class ids with
    ``label2id``; text that is not a known label becomes class 2 and
    therefore counts as a wrong prediction.

    Args:
        model: Model to evaluate; switched to eval mode.
        loader: Iterable of batches exposing ``labels`` and supporting
            ``.to(device)``.
        compute_metrics: Callable invoked as
            ``compute_metrics(y_true=..., y_pred=...)``.

    Returns:
        ``{'valid_acc': <value returned by compute_metrics>}``.
    """
    model.eval()

    all_true = []
    all_pred = []

    with torch.no_grad():
        for row in tqdm(loader, desc='Validating:'):
            row = row.to(model.device)
            pred = _predict(model, row)

            # Seq2seq collators pad `labels` with a negative ignore index
            # (-100 by default), which is not a valid token id and cannot be
            # decoded. Swap any such entries for the pad token, which
            # decoding skips as a special token.
            labels = row.labels
            labels = labels.masked_fill(labels < 0, tokenizer.pad_token_id)

            all_true += labels.cpu().tolist()
            all_pred += pred.cpu().tolist()

    all_true = label2id(tokenizer_ids_to_label(all_true))
    all_pred = label2id(tokenizer_ids_to_label(all_pred))

    return {'valid_acc': compute_metrics(y_true=all_true, y_pred=all_pred)}

In [12]:
# Re-creates the optimizer and metric (same statements as the earlier cell).
# NOTE(review): this optimizer holds the parameter tensors of the object
# `model` refers to when this cell runs; if `model` is rebound afterwards,
# the optimizer still points at the old parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
compute_metrics = accuracy_score

In [19]:
def Lora_loop(model, optimizer=None):
    """Train and evaluate ``model`` for ``EPOCHS`` epochs with a live results table.

    After every epoch the accumulated results (train loss and validation
    accuracy) are redrawn as a DataFrame indexed by epoch. When all epochs
    are done the table is cleared and the best validation accuracy is
    printed.

    Args:
        model: Model to train; it is updated in place.
        optimizer: Optional optimizer to step. When omitted, an AdamW
            optimizer is created over the parameters of ``model`` that have
            ``requires_grad=True``, using ``LEARNING_RATE``.

    Returns:
        The same ``model`` object that was passed in.
    """
    if optimizer is None:
        # Build the optimizer from the model that is actually being trained.
        # An optimizer created before `model` was rebound (for example to a
        # PEFT-wrapped model) holds a different set of parameter tensors, so
        # stepping it would leave this model's weights untouched.
        trainable_params = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.AdamW(trainable_params, lr=LEARNING_RATE)

    all_results = []
    for epoch in range(EPOCHS):
        epoch_results = {'epoch': epoch}

        epoch_results.update(
            train_loop(
                model=model,
                loader=train_loader,
                optimizer=optimizer,
            )
        )

        epoch_results.update(
            valid_loop(
                model=model,
                loader=test_loader,
                compute_metrics=compute_metrics,
            )
        )
        all_results.append(epoch_results)

        # Redraw the table in place instead of stacking one per epoch.
        display.clear_output()
        display.display(pd.DataFrame(all_results).set_index('epoch'))

    display.clear_output()

    best_score = pd.DataFrame(all_results)['valid_acc'].max() * 100
    print('Best model performance is: %%%.1f' % best_score)

    return model

# using PEFT for LoRA

In [20]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA rank (re-assigned here with the same value as the constant above).
RANK = 8

# Start from a fresh copy of the base checkpoint before adding adapters.
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)

# Adapter settings: adapters are attached to the modules named "q" and "v",
# with no dropout and no bias training.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=RANK,
    lora_alpha=ALPHA,
    lora_dropout=0,
    target_modules=["q", "v"],
    bias="none",
)

# Wrap the base model with the adapters and report the trainable share.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850403779272945


In [None]:
# Run the full train/validate schedule on the LoRA-wrapped model.
Lora_loop(model)