In [9]:
!pip install pytorch_lightning
!pip install transformers
!pip install datasets
!pip install rouge_score
!pip install sentencepiece
!pip install tokenizers
!pip install tensorboard





In [10]:
import os
import warnings

# Suppress every warning for the remainder of the session.
warnings.simplefilter(action="ignore")

# Select the CUDA device before `torch` is imported further down.
# NOTE(review): only device index 1 is exposed; on a machine with a single GPU
# this presumably hides it — confirm against the "GPU available" line printed
# by the Trainer.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="1",
)

import numpy as np
import torch
import datasets 
import pytorch_lightning as pl
import matplotlib.pyplot as plt


from datasets import load_dataset, load_metric
from pytorch_lightning.loggers import TensorBoardLogger


import transformers
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from transformers import T5ForConditionalGeneration, T5Tokenizer


In [11]:
class MyDataModule(pl.LightningDataModule):
    """Tokenized 1% slices of CNN/DailyMail (config "3.0.0") for seq2seq training.

    Each example provides ``input_ids`` / ``attention_mask`` for the article and
    ``labels`` holding the token ids of the highlights, all padded to a fixed
    length.

    Args:
        batch_size: Batch size used by every dataloader (and as the chunk size
            of the tokenization ``map`` call).
        tokenizer_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        max_source_length: Articles are truncated/padded to this many tokens.
        max_target_length: Highlights are truncated/padded to this many tokens.
    """

    DATASET_NAME = "cnn_dailymail"
    DATASET_CONFIG = "3.0.0"
    TRAIN_SPLIT = "train[:1%]"
    VAL_SPLIT = "validation[:1%]"
    TEST_SPLIT = "test[:1%]"
    RAW_COLUMNS = ["article", "highlights", "id"]

    def __init__(self, batch_size, tokenizer_name="t5-base",
                 max_source_length=512, max_target_length=128):
        super().__init__()
        self.batch_size = batch_size
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def prepare_data(self):
        """Trigger the download/caching of every split; results are discarded."""
        for split in (self.TRAIN_SPLIT, self.VAL_SPLIT, self.TEST_SPLIT):
            load_dataset(self.DATASET_NAME, self.DATASET_CONFIG, split=split)

    def setup(self, stage=None):
        """Load and tokenize the train, validation and test slices.

        ``stage`` is accepted for Lightning compatibility but not inspected:
        all three splits are always prepared.
        """
        self.train_ds = self._load_and_tokenize(self.TRAIN_SPLIT)
        self.val_ds = self._load_and_tokenize(self.VAL_SPLIT)
        self.test_ds = self._load_and_tokenize(self.TEST_SPLIT)

    def _load_and_tokenize(self, split):
        """Load one split and replace its raw text columns with token tensors."""
        raw = load_dataset(self.DATASET_NAME, self.DATASET_CONFIG, split=split)
        tokenized = raw.map(
            self.preprocess_function,
            batched=True,
            batch_size=self.batch_size,
            remove_columns=self.RAW_COLUMNS,
        )
        return tokenized.with_format("torch")

    def preprocess_function(self, batch):
        """Tokenize a chunk of examples.

        Args:
            batch: Mapping with list-valued ``"article"`` and ``"highlights"``
                entries, as supplied by ``Dataset.map(batched=True)``.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``, one row
            per input example.
        """
        # Tokenize the WHOLE chunk. Indexing with [0] here would keep only the
        # first example of every chunk and silently drop the rest.
        inputs = self.tokenizer(
            batch["article"],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_source_length,
        )
        outputs = self.tokenizer(
            batch["highlights"],
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_target_length,
        )
        # Plain lists are returned; `.with_format("torch")` converts them to
        # tensors when rows are read.
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": outputs["input_ids"],
        }

    def predict_dataloader(self):
        """Dataloader over the tokenized test slice."""
        return torch.utils.data.DataLoader(self.test_ds, batch_size=self.batch_size)

    def train_dataloader(self):
        """Dataloader over the tokenized training slice, reshuffled each epoch."""
        return torch.utils.data.DataLoader(
            self.train_ds, batch_size=self.batch_size, shuffle=True
        )

    def val_dataloader(self):
        """Dataloader over the tokenized validation slice."""
        return torch.utils.data.DataLoader(self.val_ds, batch_size=self.batch_size)

In [18]:
class MyLightningModule(pl.LightningModule):
    """Seq2seq fine-tuning module that reports ROUGE-1 after each validation epoch.

    Args:
        model_name: Checkpoint passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        batch_size: Stored on the instance; not otherwise used by this class.
        tokenizer_name: Checkpoint passed to ``AutoTokenizer.from_pretrained``.
    """

    # Label value excluded from the loss by Hugging Face seq2seq models.
    IGNORE_INDEX = -100

    def __init__(self, model_name, learning_rate, weight_decay, batch_size,
                 tokenizer_name="t5-base"):
        super().__init__()
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

        self.metric = load_metric("rouge")

        # Per-epoch validation buffers. They hold token ids on the CPU instead
        # of raw logits: one (batch, seq, vocab) float tensor per step is what
        # exhausted memory before.
        self._val_pred_ids = []
        self._val_label_ids = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` comes straight from the model output (it is only meaningful
        when ``labels`` is given).
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def _mask_pad_labels(self, labels):
        """Return ``labels`` with padding positions set to ``IGNORE_INDEX``."""
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            return labels
        return labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

    def _unmask_pad_labels(self, labels):
        """Return ``labels`` with ``IGNORE_INDEX`` replaced by the pad id so they decode."""
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            return labels
        return labels.masked_fill(labels == self.IGNORE_INDEX, pad_id)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        # Padding must not contribute to the loss.
        labels = self._mask_pad_labels(batch["labels"])

        loss, _ = self(input_ids, attention_mask, labels)
        self.log('train_loss', loss, on_epoch=True, on_step=True)
        # Also written directly so the plain 'train_loss' tag is kept; guarded
        # because the module may run without a logger attached.
        if self.logger is not None:
            self.logger.log_metrics({'train_loss': loss.detach()}, step=self.global_step)
        # Logits are deliberately not returned: nothing consumes them, and
        # keeping them alive costs a full (batch, seq, vocab) tensor.
        return {'loss': loss}

    def on_validation_epoch_end(self):
        """Compute ROUGE-1 over this epoch's validation batches and log it."""
        if not self._val_pred_ids:
            return

        pred_token_ids = torch.cat(self._val_pred_ids, dim=0)
        label_ids = self._unmask_pad_labels(torch.cat(self._val_label_ids, dim=0))
        # Reset so the next epoch (and the post-sanity-check run) starts clean.
        self._val_pred_ids.clear()
        self._val_label_ids.clear()

        decode_pred = self.tokenizer.batch_decode(pred_token_ids, skip_special_tokens=True)
        decode_labels = self.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        scores = self.metric.compute(
            predictions=decode_pred,
            references=decode_labels,
            rouge_types=["rouge1"],
        )["rouge1"].mid
        self.log('rouge1_precision', scores.precision, prog_bar=True)
        self.log('rouge1_recall', scores.recall, prog_bar=True)
        self.log('rouge1_fmeasure', scores.fmeasure, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and buffer predictions for epoch-level ROUGE."""
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, logits = self(input_ids, attention_mask, self._mask_pad_labels(labels))
        self.log('val_loss', loss, on_epoch=True, on_step=False)

        # Reduce to token ids immediately; the full logits are dropped when
        # this method returns.
        self._val_pred_ids.append(logits.argmax(dim=-1).detach().cpu())
        self._val_label_ids.append(labels.detach().cpu())

        return {'loss': loss}

    def generate(self, input_text):
        """Summarize ``input_text`` with beam search and return the decoded string."""
        # Inputs must live on the same device as the model weights.
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(self.device)
        output_ids = self.model.generate(
            input_ids=input_ids,
            max_length=100,
            num_beams=5,
            early_stopping=True,
        )
        output_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return output_text

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        """Generate text for one prediction batch.

        Args:
            batch: A raw string, a sequence of raw strings, or a dict holding
                tokenized ``input_ids`` and ``attention_mask``.

        Returns:
            Dict with keys ``'input_tex'`` and ``'out_text'``. Both values are
            strings when ``batch`` is a single string, otherwise lists with one
            entry per example.
        """
        single = isinstance(batch, str)
        if isinstance(batch, dict):
            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)
            input_text = self.tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        else:
            input_text = batch if single else list(batch)
            encoded = self.tokenizer(
                [input_text] if single else input_text,
                add_special_tokens=True,
                padding=True,
                return_tensors="pt",
            )
            input_ids = encoded["input_ids"].to(self.device)
            attention_mask = encoded["attention_mask"].to(self.device)

        output_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask)
        decoded = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        output_text = decoded[0] if single else decoded

        # Text logging is best-effort: skipped without a logger or when the
        # logger backend has no `add_text`.
        experiment = getattr(self.logger, "experiment", None)
        if hasattr(experiment, "add_text"):
            experiment.add_text("Generated Text", "\n\n".join(decoded), self.current_epoch)
        return {'input_tex' : input_text,'out_text': output_text}

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
        return optimizer

In [25]:
%tensorboard --logdir logs/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 12704), started 1:25:57 ago. (Use '!kill 12704' to kill it.)

In [26]:
# Run configuration, gathered in one place so the model and the data module
# cannot drift apart.
SEED = 42
BATCH_SIZE = 16
MODEL_NAME = "t5-small"
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-4
MAX_EPOCHS = 5

# `deterministic=True` only makes sense with fixed seeds; seed every RNG
# Lightning knows about before anything stochastic is created.
pl.seed_everything(SEED, workers=True)

model = MyLightningModule(
    model_name=MODEL_NAME,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)
logger = TensorBoardLogger("logs/", name="my_model")
trainer = pl.Trainer(
    accelerator="auto",
    max_epochs=MAX_EPOCHS,
    deterministic=True,
    logger=logger,
    log_every_n_steps=1,
)
dm = MyDataModule(batch_size=BATCH_SIZE)
trainer.fit(model, datamodule=dm)

# Each element of the list is handed to `predict_step` as one batch.
predictions = trainer.predict(model, ["23 October 2015 Last updated at 17:44 BST It's the highest rating a tropical storm can get and is the first one of this magnitude to hit mainland Mexico since 1959. But how are the categories decided and what do they mean? Newsround reporter Jenny Lawrence explains."])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]


  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

RuntimeError: [enforce fail at C:\b\abs_bao0hdcrdh\croot\pytorch_1675190257512\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 263192576 bytes.