# FLAN-T5 Finetuning for QA on Yahoo Data

Code inspired from Taughdata tutorial

https://www.toughdata.net/blog/post/finetune-flan-t5-question-answer-quora-dataset

## Prerequisites

In [1]:
%%bash
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub



## Import the libraries

In [2]:
import nltk
import evaluate
import numpy as np
import os
import json
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Data Loading

In [3]:
categories_file_path = os.path.join(os.getcwd(),"../preprocessed_data/FLAN", "SYNONYMS_FLAN.txt")
print(os.path.exists(categories_file_path))
dataset = load_dataset("text", data_files=categories_file_path)

True


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 78
    })
})

In [8]:
# # Acquire the training data from Hugging Face
# yahoo_answers_qa = load_dataset("yahoo_answers_qa")

# yahoo_answers_qa = yahoo_answers_qa["train"].train_test_split(test_size=0.3)

In [9]:
# # Check the length of the data and its structure
# yahoo_answers_qa

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 61153
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'nbestanswers', 'main_category'],
        num_rows: 26209
    })
})

In [6]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# # We prefix our tasks with "answer the question"
# prefix = "Please answer this question: "

# # Define the preprocessing function

# def preprocess_function(examples):
#     """Add prefix to the sentences, tokenize the text, and set the labels"""
#     # The "inputs" are the tokenized answer:
#     inputs = [prefix + doc for doc in examples["question"]]
#     model_inputs = tokenizer(inputs, max_length=128, truncation=True)

#     # The "labels" are the tokenized outputs:
#     labels = tokenizer(text_target=examples["answer"], max_length=512, truncation=True)
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [13]:
# # Map the preprocessing function across our dataset
# tokenized_dataset = yahoo_answers_qa.map(preprocess_function, batched=True)

Map:   0%|          | 0/61153 [00:00<?, ? examples/s]



Map:   0%|          | 0/26209 [00:00<?, ? examples/s]

In [14]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [15]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Map (num_proc=4):   0%|          | 0/78 [00:00<?, ? examples/s]

In [16]:
tokenized_datasets["train"][1]

{'input_ids': [96,
  20347,
  33,
  82,
  662,
  1968,
  1234,
  10,
  13402,
  52,
  7,
  6,
  13348,
  6,
  3,
  26219,
  7,
  6,
  3,
  15603,
  7,
  121,
  10,
  96,
  855,
  1370,
  1686,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [20]:
tokenizer.decode(tokenized_datasets["train"][1]["input_ids"])

'"These are my four associated words: badgers, bugs, hounds, nags": "pester",</s>'

In [8]:
# Set up Rouge score for evaluation
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [9]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [10]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    push_to_hub=False
)

In [17]:
# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
# Trigger the model training
trainer.train()

  0%|          | 0/30 [00:00<?, ?it/s]

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [21]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [22]:
!zip -r results.zip results

  adding: results/ (stored 0%)
  adding: results/runs/ (stored 0%)
  adding: results/runs/Nov02_21-54-44_54c8c32c6436/ (stored 0%)
  adding: results/runs/Nov02_21-54-44_54c8c32c6436/events.out.tfevents.1698962106.54c8c32c6436.631.0 (deflated 63%)
  adding: results/checkpoint-21500/ (stored 0%)
  adding: results/checkpoint-21500/special_tokens_map.json (deflated 85%)
  adding: results/checkpoint-21500/model.safetensors (deflated 7%)
  adding: results/checkpoint-21500/trainer_state.json (deflated 79%)
  adding: results/checkpoint-21500/rng_state.pth (deflated 25%)
  adding: results/checkpoint-21500/training_args.bin (deflated 51%)
  adding: results/checkpoint-21500/config.json (deflated 62%)
  adding: results/checkpoint-21500/scheduler.pt (deflated 55%)
  adding: results/checkpoint-21500/optimizer.pt (deflated 12%)
  adding: results/checkpoint-21500/generation_config.json (deflated 29%)
  adding: results/checkpoint-21500/spiece.model (deflated 48%)
  adding: results/checkpoint-21500/adde

In [23]:
!ls results

checkpoint-21500  checkpoint-22000  checkpoint-22500  runs


## Run in inference mode

In [25]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

last_checkpoint = "./results/checkpoint-22500"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
my_question = "What do you think about the benefit of Artificial Intelligence?"
inputs = "Please answer to this question: " + my_question

inputs = tokenizer(inputs, return_tensors="pt")

In [35]:
outputs = finetuned_model.generate(**inputs)
answer = tokenizer.decode(outputs[0])

print("Answer:", fill(answer, width=80))

Answer: <pad>I think it's a great way to get people to think about things. It'
