# Finetuning T5

The purpose of this notebook is to document the process of fine-tuning Google's T5 model to translate from Literary Tibetan to English. The notebook relies on a dataset stored as a pickled pandas DataFrame with a single column, `translation`. Each entry in that column should be a Python dictionary of the form `{'bo': 'Tibetan text', 'en': 'English text'}`.

In creating this notebook I drew on the following tutorial from HuggingFace: https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt

In [1]:
import os

from datasets import load_dataset

In [2]:
# Location of the pickled pandas DataFrame holding the parallel corpus. The
# original absolute path is kept as the default so existing runs are unchanged;
# set the MLOTSAWA_DATA_FILE environment variable to use a copy stored elsewhere.
# NOTE(review): the file is a pickle -- only load it from a trusted source,
# because unpickling can execute arbitrary code.
DATA_FILE = os.environ.get(
    'MLOTSAWA_DATA_FILE',
    '/home/j/Documents/Projects/MLotsawa/notebooks/t5/100k-sample-dataframe.p',
)

dataset = load_dataset('pandas', data_files=DATA_FILE)

In [3]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, identical on every run.
# NOTE: this rebinds `dataset` from the loaded data to the split version, so
# re-running this cell on its own would split the 80% training portion again;
# re-run the loading cell first.
dataset = dataset['train'].train_test_split(test_size=.2, seed=42)

In [14]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained weights; the same name is used further down
# when the model itself is loaded.
checkpoint = "google-t5/t5-small"

# NOTE(review): the stock T5 vocabulary was presumably not built with Tibetan
# script in mind -- check `tokenizer.tokenize(...)` on a sample sentence to see
# whether the source text survives tokenization or collapses to unknown tokens.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [5]:
# Keys used inside each example's 'translation' dict, and the task prefix that
# is prepended to every source sentence.
source_lang, target_lang = 'bo', 'en'
prefix = "translate Tibetan to English: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Reads `examples['translation']`, a sequence of dicts indexed by
    `source_lang` and `target_lang`. Each source string gets `prefix`
    prepended; the target strings are handed to the tokenizer as
    `text_target`. The tokenizer is called with `max_length=128` and
    `truncation=True`, and its result is returned unchanged.
    """
    pairs = examples['translation']

    prefixed_sources = [prefix + pair[source_lang] for pair in pairs]
    references = [pair[target_lang] for pair in pairs]

    return tokenizer(
        prefixed_sources,
        text_target=references,
        max_length=128,
        truncation=True,
    )


In [6]:
# Run the tokenization over every split, handing the function whole batches of
# examples rather than one example at a time.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

# Pads inputs and labels dynamically per batch. The collator's optional `model`
# argument expects a model *instance* (it is only used to derive decoder input
# ids from the labels); the checkpoint name string passed here previously has
# no such method and was silently ignored, so it is dropped. T5 builds its
# decoder inputs from `labels` on its own when they are not supplied.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

2024-07-06 16:20:14.808938: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-06 16:20:14.808988: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-06 16:20:14.809035: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-06 16:20:14.819160: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
import evaluate

# SacreBLEU scorer used by compute_metrics below.
metric = evaluate.load(path="sacrebleu")

  torch.utils._pytree._register_pytree_node(


In [9]:
import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a `(predictions, references)` pair in which predictions is a flat
    list of stripped strings and each reference is wrapped in its own
    single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: a `(predictions, labels)` pair of token-id arrays. If
            `predictions` is a tuple, only its first element is used.

    Returns:
        A dict with keys "bleu" (the SacreBLEU score) and "gen_len" (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # The labels were already cleaned this way; predictions can carry the same
    # sentinel once batches of different lengths are gathered, so they get the
    # same treatment. Arrays without -100 are left unchanged.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length is measured in non-pad tokens; with the sentinel replaced above,
    # padding no longer inflates this count.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence weights for the same checkpoint the
# tokenizer was created from.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [15]:
# Move the model onto the first CUDA device; as the last expression of the
# cell, the returned module is also displayed.
# NOTE(review): this raises on a machine without CUDA, and the Trainer
# presumably handles device placement itself -- confirm whether this cell is
# still needed.
model.to(device="cuda:0")

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [21]:
# Hyperparameters for the fine-tuning run, grouped by concern.
# NOTE(review): no evaluation schedule is set in these arguments, so
# `eval_dataset` and `compute_metrics` presumably only take effect if
# `trainer.evaluate()` is called separately -- confirm.
# NOTE(review): Hugging Face's T5 guidance suggests learning rates around
# 1e-4 to 3e-4 and warns about fp16 instability with T5; the logged loss moves
# slowly at 2e-5 -- worth revisiting, values are left unchanged here.
training_args = Seq2SeqTrainingArguments(
    # Checkpointing: written to the current directory every 25000 steps
    # (the run below is 75000 steps long), keeping at most three.
    output_dir=".",
    save_steps=25000,
    save_total_limit=3,
    push_to_hub=False,
    # Optimisation.
    num_train_epochs=30,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    # Produce evaluation predictions by generating text.
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

# Last expression of the cell, so the training summary is displayed.
trainer.train()

  0%|          | 0/75000 [00:00<?, ?it/s]

{'loss': 3.5089, 'learning_rate': 1.9866666666666667e-05, 'epoch': 0.2}
{'loss': 3.5687, 'learning_rate': 1.97336e-05, 'epoch': 0.4}
{'loss': 3.5619, 'learning_rate': 1.9600266666666666e-05, 'epoch': 0.6}
{'loss': 3.5236, 'learning_rate': 1.9466933333333335e-05, 'epoch': 0.8}
{'loss': 3.5155, 'learning_rate': 1.93336e-05, 'epoch': 1.0}
{'loss': 3.5051, 'learning_rate': 1.9200533333333337e-05, 'epoch': 1.2}
{'loss': 3.4719, 'learning_rate': 1.9067200000000003e-05, 'epoch': 1.4}
{'loss': 3.478, 'learning_rate': 1.893386666666667e-05, 'epoch': 1.6}
{'loss': 3.4501, 'learning_rate': 1.8800533333333334e-05, 'epoch': 1.8}
{'loss': 3.4425, 'learning_rate': 1.86672e-05, 'epoch': 2.0}
{'loss': 3.4254, 'learning_rate': 1.8534133333333336e-05, 'epoch': 2.2}
{'loss': 3.3976, 'learning_rate': 1.8400800000000002e-05, 'epoch': 2.4}
{'loss': 3.4021, 'learning_rate': 1.8267466666666667e-05, 'epoch': 2.6}
{'loss': 3.4068, 'learning_rate': 1.8134133333333333e-05, 'epoch': 2.8}
{'loss': 3.3961, 'learning_

TrainOutput(global_step=75000, training_loss=3.064906062825521, metrics={'train_runtime': 7935.4616, 'train_samples_per_second': 302.44, 'train_steps_per_second': 9.451, 'train_loss': 3.064906062825521, 'epoch': 30.0})