In [1]:
# Import all required libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
import torch

In [2]:
# Load a small subset of the CNN/DailyMail dataset for testing
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

# Display one example
print(dataset[0])


{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [3]:
# Load a pretrained T5 tokenizer (for summarization)
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenize both article (input) and highlights (target)
def tokenize_fn(examples):
    model_inputs = tokenizer(
        examples["article"], max_length=512, truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["highlights"], max_length=128, truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to dataset
tokenized_ds = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["article", "highlights", "id"]
)

print(tokenized_ds[0].keys())


Map:   0%|          | 0/2871 [00:00<?, ? examples/s]



dict_keys(['input_ids', 'attention_mask', 'labels'])


In [4]:
# We group tokenized sequences into blocks of fixed size (e.g., 128 tokens)
block_size = 128

def group_texts(examples):
    # Concatenate all sequences within a batch
    concatenated_inputs = sum(examples["input_ids"], [])
    concatenated_masks  = sum(examples["attention_mask"], [])
    concatenated_labels = sum(examples["labels"], [])

    # Cut to a multiple of block_size
    total_len = (len(concatenated_inputs) // block_size) * block_size

    concatenated_inputs = concatenated_inputs[:total_len]
    concatenated_masks  = concatenated_masks[:total_len]
    concatenated_labels = concatenated_labels[:total_len]

    # Split each into blocks of equal size
    input_chunks = [concatenated_inputs[i:i+block_size] for i in range(0, total_len, block_size)]
    mask_chunks  = [concatenated_masks[i:i+block_size] for i in range(0, total_len, block_size)]
    label_chunks = [concatenated_labels[i:i+block_size] for i in range(0, total_len, block_size)]

    # Filter out empty label chunks to avoid IndexError
    valid_data = [
        (inp, mask, lab)
        for inp, mask, lab in zip(input_chunks, mask_chunks, label_chunks)
        if len(lab) > 0
    ]
    if not valid_data:
        return {"input_ids": [], "attention_mask": [], "labels": []}

    input_chunks, mask_chunks, label_chunks = zip(*valid_data)

    return {
        "input_ids": list(input_chunks),
        "attention_mask": list(mask_chunks),
        "labels": list(label_chunks)
    }

# Apply grouping
lm_ds = tokenized_ds.map(group_texts, batched=True, batch_size=1000)
print(f"LM training sequences: {len(lm_ds)}")


Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

LM training sequences: 1451


In [5]:
# Custom collate function for batching
def collate_fn(batch):
    input_ids = torch.tensor([b["input_ids"] for b in batch], dtype=torch.long)
    labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
    return {"input_ids": input_ids, "labels": labels}

# Initialize DataLoader
train_loader = DataLoader(lm_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Inspect one batch
for batch in train_loader:
    print(batch["input_ids"].shape, batch["labels"].shape)
    break


torch.Size([8, 128]) torch.Size([8, 128])


In [6]:
# Custom collate function for batching
def collate_fn(batch):
    input_ids = torch.tensor([b["input_ids"] for b in batch], dtype=torch.long)
    labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
    return {"input_ids": input_ids, "labels": labels}

# Initialize DataLoader
train_loader = DataLoader(lm_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)

# Inspect one batch
for batch in train_loader:
    print(batch["input_ids"].shape, batch["labels"].shape)
    break


torch.Size([8, 128]) torch.Size([8, 128])


In [7]:
# Load a small pretrained generative model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Take one batch
batch = next(iter(train_loader))
batch = {k: v.to(device) for k, v in batch.items()}

# Forward pass to compute loss
outputs = model(**batch)
print(f"Training loss: {outputs.loss.item():.4f}")


Training loss: 7.2819
