In [None]:
! pip install PyPDF2



In [None]:
import PyPDF2
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        str: The extracted page texts joined with no separator. Pages for
        which the extractor yields nothing contribute an empty string.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract inside the `with` so the reader still has an open stream.
        # `or ""` guards against a page whose extraction returns None, which
        # would otherwise break string concatenation.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)

In [None]:
# Checkpoint to load; a larger one such as 't5-base' or 't5-large' can be used instead.
model_name = "t5-small"

# The tokenizer and the model are both loaded from the same checkpoint name.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Placeholder training pairs: summaries[i] is the target for texts[i].
texts = [
    "Long PDF text example 1.",
    "Another long document text example.",
]
summaries = [
    "Summary of document 1.",
    "Summary of document 2.",
]

In [None]:
class SummarizationDataset(Dataset):
    """Paired (document, summary) examples, tokenized on access for seq2seq training.

    Args:
        texts: Indexable collection of source documents.
        summaries: Indexable collection of target summaries; ``summaries[i]``
            is the target for ``texts[i]``.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``. It must
            return a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors whose leading dimension is a batch axis of size 1.
        max_length: Length the source text is padded/truncated to.
        max_target_length: Length the summary is padded/truncated to.
            Defaults to ``max_length`` (the previous behaviour).
    """

    # Label id excluded from the cross-entropy loss by Hugging Face seq2seq
    # models (it is the default ``ignore_index`` of PyTorch's CrossEntropyLoss).
    IGNORE_INDEX = -100

    def __init__(self, texts, summaries, tokenizer, max_length=512, max_target_length=None):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_length if max_target_length is None else max_target_length

    def __len__(self):
        """Number of examples (one per source text)."""
        return len(self.texts)

    def _encode(self, text, length):
        """Tokenize ``text`` to exactly ``length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        Returns:
            dict: ``"input_ids"`` and ``"attention_mask"`` for the source text,
            and ``"labels"`` for the summary with padded positions set to
            ``IGNORE_INDEX``.
        """
        input_enc = self._encode(self.texts[idx], self.max_length)
        target_enc = self._encode(self.summaries[idx], self.max_target_length)

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a length-1 sequence into a scalar.
        labels = target_enc["input_ids"].squeeze(0)
        target_mask = target_enc["attention_mask"].squeeze(0)

        # Padding positions must not contribute to the loss; otherwise a short
        # summary padded to a long fixed length is scored mostly on pad tokens.
        labels = labels.masked_fill(target_mask == 0, self.IGNORE_INDEX)

        return {
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": labels,
        }


In [None]:
# Wrap the example pairs in the dataset, then serve them in batches of two.
dataset = SummarizationDataset(
    texts=texts,
    summaries=summaries,
    tokenizer=tokenizer,
)
dataloader = DataLoader(dataset=dataset, batch_size=2)

In [None]:
# AdamW over all of the model's parameters, learning rate 5e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)


In [None]:
# One pass over the data. The whole optimisation step lives inside the loop so
# that every batch is trained on, not only the last `batch` left over once the
# loop has finished.
model.train()
for batch in dataloader:
    # Clear gradients accumulated by the previous step.
    optimizer.zero_grad()

    # Forward pass; passing `labels` lets the loss be read from the output.
    outputs = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["labels"],
    )

    # Get loss (cross-entropy)
    loss = outputs.loss

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Loss: {loss.item()}")

Loss: 13.109818458557129
