# Fine-Tuning T5 for Question Answering (SQuAD)
## 1. Setup Environment and Installation
We install transformers, datasets, and rouge_score (often used to evaluate generative tasks).

In [17]:
# Install required libraries
!pip install transformers datasets accelerate evaluate rouge_score -q

import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    pipeline
)

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone




Using device: cpu


## 2. Load and Explore SQuAD Dataset
We load the SQuAD v1.1 dataset. We will inspect the context, question, and answers fields.

In [18]:
# Load SQuAD v1.1
raw_datasets = load_dataset("squad")

# Display a sample to understand the structure
print("SQuAD Data Sample:")
sample = raw_datasets["train"][0]
print(f"Context: {sample['context'][:100]}...")
print(f"Question: {sample['question']}")
print(f"Answer: {sample['answers']['text'][0]}")

SQuAD Data Sample:
Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden...
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer: Saint Bernadette Soubirous


## 3. Data Preprocessing for T5
T5 requires a specific input format: "question: [QUESTION] context: [CONTEXT]". We also need to tokenize the target answers.

In [19]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    # Format inputs for T5
    inputs = [f"question: {q}  context: {c}" for q, c in zip(examples["question"], examples["context"])]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize targets (the answers)
    # SQuAD answers are in a list; we take the first one for training
    targets = [a["text"][0] for a in examples["answers"]]
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map the preprocessing across the dataset
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

## 4. Load T5 Model
We use t5-small to ensure training is efficient while still demonstrating the fine-tuning process required for the assignment.

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
model.to(device)

# Data collator specifically for Seq2Seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## 5. Fine-Tuning the Model
We will fine-tune the model using the Trainer API. Due to the large size of SQuAD, we will use a subset for this task.

In [22]:
training_args = TrainingArguments(
    output_dir="finetuning-t5-question-answering",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(5000)), # Subset for UAS task
    eval_dataset=tokenized_datasets["validation"].shuffle(seed=42).select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 1


[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfitranhp[0m ([33mfitranhp-telkom-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## 6. Inference and Testing
After training, we use the text2text-generation pipeline to test the model on new context-question pairs.

In [None]:
# Create a pipeline for Question Answering using the fine-tuned T5
qa_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

def ask_t5(question, context):
    input_text = f"question: {question} context: {context}"
    result = qa_pipe(input_text, max_length=50)
    return result[0]['generated_text']

# Test with a sample
context_sample = "The University of Notre Dame is a Catholic research university located in South Bend, Indiana."
question_sample = "Where is the University of Notre Dame located?"

print(f"Context: {context_sample}")
print(f"Question: {question_sample}")
print(f"Predicted Answer: {ask_t5(question_sample, context_sample)}")

In [25]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    # Format inputs for T5
    inputs = [f"question: {q}  context: {c}" for q, c in zip(examples["question"], examples["context"])]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize targets (the answers)
    # SQuAD answers are in a list; we take the first one for training
    targets = [a["text"][0] for a in examples["answers"]]
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict

def stratified_sample_indices(dataset: Dataset, sample_size: int, seed: int = 42):
    # Convert to pandas DataFrame for easier grouping and sampling
    df = pd.DataFrame(dataset)
    df['original_index'] = df.index

    # Group by 'title' and calculate proportional sample sizes
    grouped = df.groupby('title')
    total_count = len(df)
    selected_indices = []

    for title, group in grouped:
        group_size = len(group)
        # Calculate proportional sample count
        prop_sample_count = int(np.round((group_size / total_count) * sample_size))

        # Ensure at least one sample if the group exists and does not exceed group size
        num_samples = max(1, prop_sample_count) if prop_sample_count > 0 else 0
        num_samples = min(num_samples, group_size)

        # Randomly select indices from this group
        sampled_group_indices = group['original_index'].sample(n=num_samples, random_state=seed).tolist()
        selected_indices.extend(sampled_group_indices)

    # Shuffle the combined indices and trim to the exact sample_size
    np.random.seed(seed)
    np.random.shuffle(selected_indices)
    final_indices = selected_indices[:sample_size]

    return final_indices

# Set sample sizes
train_sample_size = 500
validation_sample_size = 100

# Get stratified sample indices
train_indices = stratified_sample_indices(raw_datasets["train"], train_sample_size)
validation_indices = stratified_sample_indices(raw_datasets["validation"], validation_sample_size)

# Create stratified raw datasets
stratified_raw_train_dataset = raw_datasets["train"].select(train_indices)
stratified_raw_validation_dataset = raw_datasets["validation"].select(validation_indices)

# Apply preprocessing to stratified datasets
tokenized_train = stratified_raw_train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=stratified_raw_train_dataset.column_names
)
tokenized_validation = stratified_raw_validation_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=stratified_raw_validation_dataset.column_names
)

# Combine into a DatasetDict
tokenized_datasets = DatasetDict({
    "train": tokenized_train,
    "validation": tokenized_validation,
})

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [26]:
training_args = TrainingArguments(
    output_dir="finetuning-t5-question-answering",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,5.087446
2,No log,0.33506




TrainOutput(global_step=126, training_loss=6.612504262772817, metrics={'train_runtime': 3128.3064, 'train_samples_per_second': 0.32, 'train_steps_per_second': 0.04, 'total_flos': 135341801472000.0, 'train_loss': 6.612504262772817, 'epoch': 2.0})