## Converting to pytorch lightning fitting routine 

In [1]:
!export CUDA_LAUNCH_BLOCKING=1

In [1]:
# import tensorboard
import torch
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl


VOCAB_SIZE = 32128

class Routine(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.lr = 1e-3
        self.validation_step_outputs = []
        self.training_step_outputs = []
        self.test_step_outputs = []

    def forward(
        self,
        input_ids,
        attention_mask,
        labels,
        decoder_attention_mask, 
    ):
        y_hat = self.model(input_ids=input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, labels=labels)
        
        # print(f"forward(): {y_hat=}")
        return y_hat

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # print(f"keys = {batch.keys()}")
        # print(f"{batch=}")
        y = batch['labels']
        # print(f"{y=}")
        # print("y.shape", y.shape)
        # print(f"{y=}")
        y_hat = self(**batch)
        
        # print(f"{y_hat=}")
        # print("y_hat.shape", y_hat.logits.shape) 


        y_onehot = F.one_hot(y, num_classes=VOCAB_SIZE)
        y = y_onehot.float()


        losses = []
        # computing cross-entropy on per-token basis and averaging the loss. 
        for tok in range(y_hat.logits.shape[1]):
            # print("Per-token loss cross entropy")
            loss = F.cross_entropy(y_hat.logits[:,tok,:] , y[:,tok,:])
            print(loss)
            # loss = F.nll_loss(y_hat[:,tok,:] , y[:,tok,:])
            losses.append(loss)


        loss  = torch.mean(torch.tensor(losses))

        print(f"{loss=}")
        # print(f"{ F.log_softmax(y_hat.logits, dim=-1).shape=}")
        # loss = F.nll_loss( F.log_softmax(y_hat.logits, dim=-1), y_onehot)
        # print(f"{loss=}")
        # # y_hat = y_hat.squeeze()

        # dummy metrics
        metrics_dict = {"loss": loss.item(), "train_EM": 0.9, "train_F1": 0.9}
        print(metrics_dict)
        # loss = F.binary_cross_entropy_with_logits(y_hat, y)

        # y_hat = (F.sigmoid(y_hat) > 0.5).float()

        # metrics = self.metric(y_hat, y)()
        # metrics_dict = {
        #     "loss": loss,
        #     "train_ttr": metrics.ttr,
        #     "train_ftr": metrics.ftr,
        #     "train_acc": metrics.acc,
        # }
        self.training_step_outputs.append(metrics_dict)
        return metrics_dict



    def on_train_epoch_end(self):
        results = {
            "loss": torch.tensor(
                [x["loss"] for x in self.training_step_outputs]
            ).mean(),
            "F1": torch.tensor(
                [x["train_F1"] for x in self.training_step_outputs]
            ).mean(),
            "EM": torch.tensor(
                [x["train_EM"] for x in self.training_step_outputs]
            ).mean(),
        }
        # self.log(f"LR",self.lr, on_epoch=True, prog_bar=True, logger=True)
        for k, v in results.items():
            self.log(
                f"train_{k}",
                v,
                on_epoch=True,
                prog_bar=True,
                logger=True,
                sync_dist=True,
            )

    def validation_step(self, batch, batch_idx):
        # x = batch["x"]
        # y = batch["y"]
        y_hat = self(**batch)
        # (batch, num_classes)
        y_hat = y_hat.squeeze()
        # (batch,)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        # pred = F.sigmoid(y_hat)
        # y_hat = (pred > 0.5).float()

        # metrics = self.metric(y_hat, y)()
        # metrics_dict = {
        #     "val_loss": loss,
        #     "val_ttr": metrics.ttr,
        #     "val_ftr": metrics.ftr,
        #     "val_acc": metrics.acc,
        # }

        # dummy metrics
        metrics_dict = {"loss": 10, "val_EM": 0.9, "val_F1": 0.9}
        self.validation_step_outputs.append(metrics_dict)
        return metrics_dict

    def on_validation_epoch_end(self):
        results = {
            "loss": torch.tensor(
                [x["val_loss"] for x in self.validation_step_outputs]
            ).mean(),
            "EM": torch.tensor(
                [x["val_EM"] for x in self.validation_step_outputs]
            ).mean(),
            "F1": torch.tensor(
                [x["val_F1"] for x in self.validation_step_outputs]
            ).mean(),
        }
        for k, v in results.items():
            self.log(
                f"val_{k}", v, on_epoch=True, prog_bar=True, logger=True, sync_dist=True
            )
            # self.log(f"val_{k}", v, on_epoch=True, prog_bar=True) # , logger=True)

    def test_step(self, batch, batch_idx):
        # x = batch["x"]
        # y = batch["y"]
        y_hat = self(**batch)
        # (batch, num_classes)
        y_hat = y_hat.squeeze()
        # (batch,)
        pred = F.sigmoid(y_hat)

        # (batch_probabilities,)
        # y_hat = (pred > 0.5).float()
        # (batch_labels,)
        # metrics = self.metric(y_hat, y)()

        metrics_dict = {
            "test_EM": 0.9,
            "test_F1": 0.8,
        }
        self.test_step_outputs.append(metrics_dict)
        return metrics_dict

    def on_test_epoch_end(self):
        results = {
            "F1": torch.tensor([x["test_EM"] for x in self.test_step_outputs]).mean(),
            "EM": torch.tensor([x["test_F1"] for x in self.test_step_outputs]).mean(),
        }

        for k, v in results.items():
            self.log(
                f"test_{k}",
                v,
                on_epoch=True,
                prog_bar=True,
                logger=True,
                sync_dist=True,
            )

    def configure_optimizers(self):

        # special scheduler for transformers
        optimizer = optim.AdamW(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,  # self.cfg_fitting.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=0.05,
        )
        return {
            "optimizer": optimizer,
            # "lr_scheduler": scheduler_1,
            "monitor": "val_loss",
        }

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pathlib import Path
from dataclasses import dataclass
from pathlib import Path
from babl.data import TextDataset, TextDataModule
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    LearningRateMonitor,
)


class CallbackCollection:
    def __init__(self, model_name, data_path) -> None:

        ####################################################################################
        @dataclass
        class FittingArgs:
            es_patience: int = 5
            model_dir: str = str(Path("/home/nameduser/Code/babl/outputs") / model_name)

        ####################################################################################

        self.data_path = data_path

        self.args = FittingArgs()

    def __call__(self):
        lr_monitor = LearningRateMonitor(logging_interval="epoch")

        early_stopping = EarlyStopping(
            mode="min", monitor="val_loss", patience=self.args.es_patience
        )
        checkpoint_callback = ModelCheckpoint(
            monitor="val_loss",
            dirpath=self.args.model_dir,
            save_top_k=2,
            save_last=True,
            mode="min",
            filename="{epoch}-{val_loss:.2f}-{val_acc:.2f}-{val_ttr:.2f}-{val_ftr:.2f}",
        )

        callbacks = {
            "checkpoint": checkpoint_callback,
            "lr": lr_monitor,
            "es": early_stopping,
        }
        # callbacks = [checkpoint_callback, lr_monitor, early_stopping]
        return callbacks


class Fitter:
    def __init__(
        self,
        model,
        tokenizer,
        model_name,
        data_path="../inputs",
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.model_name = model_name
        self.data_path = data_path

    def setup(self):
        data_module = TextDataModule(data_path=self.data_path, tokenizer=self.tokenizer)

        train_loader = data_module.train_dataloader()
        val_loader = data_module.val_dataloader()
        test_loader = data_module.test_dataloader()

        return train_loader, val_loader, test_loader

    def callbacks(self):
        # cfg_fitting = self.cfg_fitting
        callback_collection = CallbackCollection(self.model_name, self.data_path)
        return callback_collection()

    def __call__(self):

        ####################################################################################
        @dataclass
        class FittingArgs:
            es_patience: int = 5
            model_dir: str = str(Path("/home/ola/Code/babl/outputs") / self.model_name)
            max_epoch: int = 10
            fast_dev_run: bool = True
        ####################################################################################

        args = FittingArgs()

        logger = TensorBoardLogger(
            save_dir=args.model_dir,
            name="lightning_logs",
        )
        Model = self.model
        # get loaders and datamodule to access input shape
        train_loader, val_loader, test_loader = self.setup()
        print("Created training, validating and test loaders .... ")
        # get input shape for onnx exporting
        # input_shape = data_module.input_shape
        # init model
        # kwargs = {}
        # model = Model(**kwargs)

        # setup training, validating and testing routines for the model
        routine = Routine(self.model)

        # Init a trainer to execute routine
        callback_dict = self.callbacks()
        callback_list = [v for (_, v) in callback_dict.items()]
        number_devices = os.getenv("CUDA_VISIBLE_DEVICES", "1,").split(",")
        try:
            number_devices.remove("")
        except ValueError:
            pass

        # ####################################################################################
        # @dataclass
        # class FittingArgs:
        #     max_epoch: int = 10
        #     fast_dev_run: bool = True

        # ####################################################################################

        # args = FittingArgs()
        trainer = Trainer(
            accelerator="cpu",
            devices=len(number_devices),
            # strategy=os.getenv("STRATEGY", "ddp_notebook"),
            sync_batchnorm=True,
            logger=logger,
            max_epochs=args.max_epoch,
            callbacks=callback_list,
            num_sanity_val_steps=2,
            # resume_from_checkpoint=self.cfg_fitting.resume_from_checkpoint,
            gradient_clip_val=1.0,
            fast_dev_run=args.fast_dev_run,
        )

        trainer.fit(
            routine, train_dataloaders=train_loader, val_dataloaders=val_loader
        )  # ,ckpt_path=PATH)

        if args.fast_dev_run:
            # issue with finding best weights path for in fast dev run using last model weights
            model_ckpt_path = callback_dict["checkpoint"].__dict__["last_model_path"]
        else:
            model_ckpt_path = callback_dict["checkpoint"].__dict__["best_model_path"]

        trainer.test(
            dataloaders=test_loader,
            ckpt_path=model_ckpt_path,
        )
        # Return the input_shapes and trainer of the model for exporting
        return trainer

In [3]:
from babl.models import MODELS_CHOICES, MODELS
from babl.config import T5 as T5Config
from pathlib import Path

model_name = "t5"
full_model_name = MODELS_CHOICES[model_name][0]
t_w_m = MODELS[model_name]

tokenizer = t_w_m["tok"]
model = t_w_m["model"]

t = tokenizer.from_pretrained(full_model_name)
m = model.from_pretrained(full_model_name, **T5Config().__dict__)

data_path_root = Path("/home/nameduser/Code/babl/inputs")

# data_path_val = data_path_root / "10k.jsonl"
# ds = TextDataset(data_path_val, tokenizer=t, plain_text=False)
# from babl.data import T2TDataCollator
# from torch.utils.data import DataLoader
# t_dl = DataLoader(ds, batch_size=64, shuffle=True, collate_fn=T2TDataCollator())
# test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
# data_module = TextDataModule(data_path, tokenizer)

Fitter(model=m, model_name=full_model_name, tokenizer=t, data_path=data_path_root)()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/nameduser/.local/share/virtualenvs/babl-qoEDH0A2/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.

  | Name  | Type                       | Params | Mode
---------------------------------------

Created training, validating and test loaders .... 


/home/nameduser/.local/share/virtualenvs/babl-qoEDH0A2/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:476: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] 

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Per-token loss cross entropy
tensor(10.3852, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(3.8576, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(3.3010, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.9408, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.4076, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.1963, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.1035, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.0272, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.1794, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(0.0638, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(1.0094, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(1.3266, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(1.9805, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(3.6689, grad_fn=<DivBackward1>)
Per-token loss cross entropy
tensor(5.9450, gra

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# for b in t_dl:
#     # print(b)
#     m(**b)

import torch.nn.functional as F 


y = torch.tensor([[ 822,   10,  125, 100, 100, 100],
                  [ 822,   10,  116, 100, 100, 100],
                  [ 822,   10,  125, 100, 100, 100],
                  [ 822,   10,  213, 100, 100, 100],
                  [ 822,   10,  213, 100, 100, 100],
                  [ 822,   10,  116, 100, 100, 100],
                  [ 822,   10,  125, 100, 100, 100],
                  [ 822,   10,  116, 100, 100, 100],
                  [ 822,   10,  125, 100, 100, 100],
                  [ 822,   10,  213, 100, 100, 100],
                  [ 822,   10,  213, 100, 100, 100],
                  [ 822,   10,  116, 100, 100, 100]], dtype=torch.long
)
y_hat = torch.tensor([[-20.2879,  -9.8936, -13.5965, -40.7275, -40.8642, -40.8486],
         [-34.0870,  -3.6627, -14.2458,  -46.1296, -46.3147, -46.2990],
         [-30.5974,  -3.4536, -15.5923,  -43.6581, -43.8461, -43.8219],
         [-18.1922,  -8.0767, -14.5352,  -45.5706, -45.7357, -45.7194],
         [-18.1516,  -8.0787, -14.4750,  -45.4796, -45.6429, -45.6272],
         [-18.1262,  -8.1061, -14.4559,  -45.4136, -45.5755, -45.5602],
         [-17.2200,  -9.7170, -14.2499,  -38.4455, -38.5326, -38.4609],
         [-34.3804,  -6.2359, -13.2374,  -42.5014, -42.6473, -42.5558],
         [-27.8060,  -7.1265, -15.4786,  -42.2502, -42.3610, -42.2977],
         [-17.2795,  -7.8251, -15.8752,  -44.6078, -44.7242, -44.6339],
         [-17.1784,  -7.7900, -15.8198,   -44.4029, -44.5184, -44.4275],
         [-17.1213,  -7.7632, -15.7711,   -44.2831, -44.3977, -44.3082]]) 

num_class= 1321
y_hat = torch.stack([y_hat]*num_class, dim=2)

y_hat.shape
y.shape
# y.shape
# y_hat.shape
y = F.one_hot(y, num_classes=1321)
y = y.float()

y.shape == y_hat.shape

y_hat = F.softmax(y_hat, dim=-1)



losses = []
for tok in range(y_hat.shape[1]):
    print(y[:,tok,:].shape)
    print(y_hat[:,tok,:].shape)
    loss = F.cross_entropy(y_hat[:,tok,:] , y[:,tok,:])
    print(loss)
    # loss = F.nll_loss(y_hat[:,tok,:] , y[:,tok,:])
    losses.append(loss)


torch.mean(torch.tensor(losses))
# yx = F.one_hot(y, num_classes=1321)[:,0,:].shape

# loss = F.nll_loss(y_hat , y)


In [None]:

data_path_root = Path("/home/nameduser/Code/babl/inputs")

data_path_val = data_path_root / "10k.jsonl"
ds = TextDataset(data_path_val, tokenizer=t, plain_text=True )

# from babl.data import T2TDataCollator
# from torch.utils.data import DataLoader
# t_dl = DataLoader(ds, batch_size=64, shuffle=True, collate_fn=T2TDataCollator())
# test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
# data_module = TextDataModule(data_path, tokenizer)

In [10]:
x=  torch.tensor([[2]]).item()
x

2

In [None]:
ds.ds["input_text"].__len__()

In [None]:
list({"x": [1, 2, 3, 4]}.values())[0].__len__()