In [None]:
!pip install -Uqq datasets transformers  pytorch_lightning rouge_score evaluate

In [None]:
!pip install wandb



In [None]:
# Silence all Python warnings for the rest of the session.
import warnings
warnings.simplefilter("ignore")

# Enumerate GPUs in PCI bus order and expose only device 0 to this process.
# These variables are set before `torch` is imported below.
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import numpy as np
import torch
import datasets
import pytorch_lightning as pl
# NOTE(review): `load_metric` is not used by any visible cell, and newer
# `datasets` releases appear to no longer export it; with the unpinned `-U`
# install this import may fail. Confirm, and consider `evaluate.load` instead.
from datasets import load_dataset, load_metric


from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# NOTE(review): `torch` is already imported above; this second import is
# redundant but harmless.
import torch
import pandas as pd
from torch.utils.data import Dataset

# Allow PyTorch to use faster, lower-precision kernels for float32 matmuls.
torch.set_float32_matmul_precision("medium")

In [None]:
# Load the CNN/DailyMail summarization corpus (config "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Drop the "id" column; only "article" and "highlights" remain in each split.
dataset = dataset.remove_columns("id")
dataset

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights'],
        num_rows: 11490
    })
})

In [None]:
# Export every split to "<split>.csv" in the working directory; the training
# cell reads these files back through its data module.
#
# Dataset.to_csv writes the split out in batches, so the whole split is never
# materialised as Python objects the way pd.DataFrame(dataset[split]) does
# (the train split alone has 287k articles).
splits = ["train", "test", "validation"]
for split in splits:
    csv_file_path = f"{split}.csv"
    dataset[split].to_csv(csv_file_path, index=False)
    print(f"Saved {split} split to {csv_file_path}")

Saved train split to train.csv
Saved test split to test.csv
Saved validation split to validation.csv


In [None]:
# Authenticate the Weights & Biases CLI.
# NOTE(review): no visible cell uses wandb (training logs go to a
# TensorBoardLogger) — confirm this login is still needed.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mandysingal[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Load the pre-trained T5-small tokenizer and seq2seq model.
# NOTE(review): the training cell further down rebinds both `tokenizer` and
# `model` (the latter to a MyLightningModule), so the objects created here are
# not the ones that end up being trained.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from dataset import MyDataModule
from model import MyLightningModule
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import torch

# Allow PyTorch to use faster, lower-precision kernels for float32 matmuls.
torch.set_float32_matmul_precision("medium")

if __name__ == "__main__":
    # Seed Python, NumPy and PyTorch (including dataloader workers) so that
    # repeated runs of this cell are comparable.
    pl.seed_everything(42, workers=True)

    # Write a checkpoint after every epoch and keep all of them
    # (save_top_k=-1); the epoch and val_loss are embedded in the file name.
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath="checkpoints",
        filename="my_model-{epoch:02d}-{val_loss:.2f}",
        save_top_k=-1,
        every_n_epochs=1,
        verbose=True,
    )
    logger = TensorBoardLogger("tb_logs", name="t5_dailymail")

    # Single source of truth for the checkpoint name: used for both the
    # tokenizer and the Lightning module below.
    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # CSV files produced by the split-export cell earlier in the notebook.
    train_csv = "./train.csv"
    val_csv = "./validation.csv"
    test_csv = "./test.csv"

    # Create the data module
    dm = MyDataModule(train_csv, val_csv, test_csv, tokenizer, batch_size=32)
    dm.setup()

    model = MyLightningModule(
        model_name=model_name, learning_rate=1e-4, weight_decay=1e-5
    )

    # To warm-start from an earlier run, load that checkpoint's "state_dict"
    # into `model` here, before calling fit.

    trainer = pl.Trainer(
        accelerator="gpu",
        devices=[0],
        max_epochs=10,
        precision=16,  # 16-bit automatic mixed precision
        logger=logger,
        callbacks=[checkpoint_callback],
        log_every_n_steps=10,
    )
    trainer.fit(model, dm)
    # Run one more validation pass with the final weights.
    trainer.validate(model, dm)

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type            | Params
------------------------------------------
0 | model | OptimizedModule | 60.5 M
------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 8973: 'val_loss' reached 0.82234 (best 0.82234), saving model to '/content/checkpoints/my_model-epoch=00-val_loss=0.82.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 17946: 'val_loss' reached 0.70474 (best 0.70474), saving model to '/content/checkpoints/my_model-epoch=01-val_loss=0.70.ckpt' as top 2


Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 26919: 'val_loss' reached 0.63279 (best 0.63279), saving model to '/content/checkpoints/my_model-epoch=02-val_loss=0.63.ckpt' as top 3
Process ForkProcess-5:
Process ForkProcess-16:
Process ForkProcess-11:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkProcess-13:
Process ForkProcess-15:
Traceback (most recent call last):
Process ForkProcess-7:
Process ForkProcess-14:
Process ForkProcess-8:
Traceback (most recent call last):
Process ForkProcess-10:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkProcess-12:
  File "/usr/lib/python3.10/concurrent/fut

Validation: 0it [00:00, ?it/s]

In [None]:
# Log in to the Hugging Face Hub; the upload in the following cell needs
# write credentials.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# `trainer` is a pytorch_lightning.Trainer, which has no `push_to_hub` method
# (that API belongs to Hugging Face models/tokenizers and transformers.Trainer),
# so the original call raised AttributeError. Upload the fine-tuned network and
# its tokenizer directly instead.
repo_id = "Andyrasika/cnn-daily-model-summarization"

# The training log's model summary reports the Lightning module's `model`
# attribute as a torch.compile OptimizedModule; unwrap it when present.
# NOTE(review): assumes MyLightningModule keeps a Hugging Face PreTrainedModel
# in `.model` — confirm against model.py.
hf_model = getattr(model.model, "_orig_mod", model.model)
hf_model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)