In [1]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset

# Pull the first 1% of the training split of CNN/DailyMail (config "3.0.0").
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:1%]",
)

In [2]:
# Show which fields each example carries before choosing the columns to keep.
dataset.column_names

['article', 'highlights', 'id']

In [3]:
# Convert to pandas, keeping only the columns we need.
# Dataset.to_pandas() exports the dataset directly, instead of having the
# DataFrame constructor iterate over it one example at a time.
df = dataset.to_pandas()[["article", "highlights"]]

In [4]:
# Save to disk for DVC tracking
output_path = Path("data/cnn_dm_subset.csv")
# to_csv does not create missing directories, so make sure data/ exists first
# (otherwise a fresh checkout fails here with OSError).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
# as_posix() keeps the message identical on every platform.
print(f"Dataset saved to {output_path.as_posix()}")

Dataset saved to data/cnn_dm_subset.csv


In [6]:
# 🔧 Step 2: Tokenize + Prepare for Training
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset

# Load the tokenizer for the pretrained "t5-small" checkpoint; tokenize() below reads it as a global.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize(batch):
    """Tokenize a batch of article/summary pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with an "article" list and a "highlights" list of
            strings, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prefixed articles (padded/truncated to
        512 tokens) with an added "labels" entry: the summary token ids
        (padded/truncated to 128 tokens) where every padding position is
        replaced by -100.
    """
    inputs = tokenizer(
        ["summarize: " + text for text in batch["article"]],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    targets = tokenizer(
        batch["highlights"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # The loss ignores label positions set to -100. Without this masking the
    # model is also trained to predict the padding that fills each summary
    # up to max_length.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs

# Wrap the DataFrame in a Hugging Face Dataset and tokenize it in batches.
dataset = Dataset.from_pandas(df)
tokenized = dataset.map(function=tokenize, batched=True)

# Return only the tokenized fields, as PyTorch tensors, when examples are accessed.
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Result: article/summary pairs encoded as input and label tensors for fine-tuning t5-small on summarization.

Map:   0%|          | 0/2871 [00:00<?, ? examples/s]