Intro to PyTorch Lightning, the framework that helps organise PyTorch code

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# torchmetrics is listed explicitly because the notebook imports it directly.
%pip install torch torchvision matplotlib tqdm pytorch-lightning torchmetrics



In [None]:
import torch
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import tqdm
import math

import torch.nn.functional as F
from torch import nn
from torch import optim

%matplotlib inline
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Fix the random seed so that repeated runs are reproducible
seed = 1

# Disable cuDNN benchmarking and request deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed)

<torch._C.Generator at 0x7ed3741d66f0>

In [None]:
# Load the MNIST dataset
# Initial transform: only convert the images to tensors
transform = transforms.Compose([
    transforms.ToTensor(),
])

train_data = datasets.MNIST('data', train=True, download=True, transform=transform)
test_data = datasets.MNIST('data', train=False, download=True, transform=transform)

# Basic statistics of this dataset, computed on the training split only.
# The raw pixel data is divided by 255 to rescale the statistics.
train_mean = train_data.data.double().mean() / 255.
train_std = train_data.data.double().std() / 255.
print(f'Train Data: Mean={train_mean}, Std={train_std}')

# Normalize train and test data using calculated training mean and standard deviation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((train_mean, ), (train_std, ))
])

train_data.transform = transform
test_data.transform = transform

batch_size = 32
torch.manual_seed(seed)
# num_workers is an integer count of loader subprocesses; the previous value
# `True` only worked because it is equal to 1, so spell the count out.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                           shuffle=True, num_workers=1)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                          shuffle=False, num_workers=1)

Train Data: Mean=0.1306604762738429, Std=0.30810780717887876


In [None]:
# Display the summary of the training split
train_data

Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )

In [None]:
# Display the summary of the test split
test_data

Dataset MNIST
    Number of datapoints: 10000
    Root location: data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
           )

In [None]:
import pytorch_lightning as pl

**Lightning module**
  - subclass of torch.nn.Module
  - links optimizers to models
  - defines how the model behaves during training, validation and testing

In [None]:
class NeuralNet(pl.LightningModule):
  """Linear classifier: one fully connected layer from 784 input features to 10 class scores."""

  def __init__(self):
    super().__init__()
    self.linear = nn.Linear(in_features=784, out_features=10)

  def forward(self, x):
    """Apply the linear layer to `x` and return its output."""
    return self.linear(x)

  def _batch_loss(self, batch):
    """Flatten the inputs of `batch`, run the model and return the cross-entropy loss."""
    x, y = batch
    # Flatten per sample instead of reshaping to the global `batch_size`:
    # the last batch of a DataLoader can hold fewer samples, and
    # view(batch_size, -1) would then produce the wrong feature width.
    x = torch.flatten(x, start_dim=1)
    return F.cross_entropy(self(x), y)

  # defines what needs to be done in a single step of training
  # maps a batch to a loss value - which can be backpropagated
  def training_step(self, batch, batch_idx):
    return self._batch_loss(batch)

  def test_step(self, batch, batch_idx=None):
    """Return the cross-entropy loss of one test batch.

    `batch_idx` is accepted but unused, so the method can be called both
    with and without it.
    """
    return self._batch_loss(batch)

  def configure_optimizers(self) -> torch.optim.Optimizer:
    """Return an Adam optimizer (lr=3e-4) over all parameters of the module."""
    return torch.optim.Adam(self.parameters(), lr=3e-4)

**Trainer module**
  - relies on the Lightning module interface to train, validate and test
  - lets you choose the stopping criteria: max_epochs, min_epochs, max_time, max_steps


In [None]:
import logging
import textwrap
import traceback

try:
    model = NeuralNet()
    trainer = pl.Trainer(max_epochs=10)
    trainer.fit(model=model, train_dataloaders=train_loader)
except pl.utilities.exceptions.MisconfigurationException as err:
    # Report a misconfiguration as wrapped text instead of raising it
    wrapped_message = textwrap.wrap(str(err), 80)
    print("Error:", *wrapped_message, sep="\n\t")
finally:
    # Runs whether or not training succeeded: set the Lightning logger to INFO
    logging.getLogger("pytorch_lightning").setLevel(logging.INFO)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name   | Type   | Params | Mode 
------------------------------------------
0 | linear | Linear | 7.9 K  | train
------------------------------------------
7.9 K     Trainable params
0         Non-trainable params
7.9 K     Total params
0.031     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


Lightning Data Module
  - organises the data loading code
  - designed to work in distributed settings where operations that set state need to be handled.
  
  - Methods:
    - setup - handles state that needs to be set up on each GPU of a machine - e.g. splitting data and associating it with self
    - prepare_data - handles state that needs to be set up on each machine - e.g. downloading data from storage and writing it to the local disk

In [None]:
# Print the docstring of LightningDataModule as a quick reference
print(pl.LightningDataModule.__doc__)

A DataModule standardizes the training, val, test splits, data preparation and transforms. The main advantage is
    consistent data splits, data preparation and transforms across models.

    Example::

        import lightning.pytorch as L
        import torch.utils.data as data
        from pytorch_lightning.demos.boring_classes import RandomDataset

        class MyDataModule(L.LightningDataModule):
            def prepare_data(self):
                # download, IO, etc. Useful with shared filesystems
                # only called on 1 GPU/TPU in distributed
                ...

            def setup(self, stage):
                # make assignments here (val/train/test split)
                # called on every process in DDP
                dataset = RandomDataset(1, 100)
                self.train, self.val, self.test = data.random_split(
                    dataset, [80, 10, 10], generator=torch.Generator().manual_seed(42)
                )

            def train_dataloader(self):

CallBack
  - used to add useful features to training, validation and testing
  - can be used to implement features like model checkpointing
  - "callback" code is generally executed on some trigger
  - custom callback code can be defined by overriding the desired "hook" method

In [None]:
class CustomCallBack(pl.Callback):
    """Callback that prints a greeting from two of the trainer's hooks."""

    def on_train_epoch_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        """Print a greeting when a training epoch starts."""
        greeting = "👋 hello from the start of the training epoch!"
        print(greeting)

    def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        """Print a greeting when a validation epoch ends."""
        greeting = "👋 hello from the end of the validation epoch!"
        print(greeting)

In [None]:
# Train a fresh model, this time with the custom callback attached to the trainer
model = NeuralNet()
trainer = pl.Trainer(max_epochs=10, callbacks=[CustomCallBack()])
trainer.fit(model=model, train_dataloaders=train_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name   | Type   | Params | Mode 
------------------------------------------
0 | linear | Linear | 7.9 K  | train
------------------------------------------
7.9 K     Trainable params
0         Non-trainable params
7.9 K     Total params
0.031     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!
👋 hello from the start of the training epoch!


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


Metrics:
  - help monitor training and catch bugs
  - help determine how to fix bugs in training
  - can be very costly, though - metrics calculation during training adds extra work!

  - **torchmetrics** lib - provides a Metric class that incorporates best performance practices such as accumulation over batches and devices
  - torchmetrics.Metric inherits from nn.Module -- like nn.Module it relies on persistent state and can be distributed over devices and machines

In [None]:
import torchmetrics

In [None]:
class NeuralNet(pl.LightningModule):
  """Linear classifier (784 input features, 10 class scores) that logs train and test accuracy."""

  def __init__(self):
    super().__init__()
    self.linear = nn.Linear(in_features=784, out_features=10)
    # Separate metric objects for the training and the test phase
    self.train_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10)
    self.test_accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=10)

  def forward(self, x):
    """Apply the linear layer to `x` and return its output."""
    return self.linear(x)

  def _predict_batch(self, batch):
    """Flatten the inputs of `batch` and return (predictions, targets, cross-entropy loss)."""
    x, y = batch
    # Flatten per sample instead of reshaping to the global `batch_size`:
    # the last batch of a DataLoader can hold fewer samples, and
    # view(batch_size, -1) would then produce the wrong feature width.
    x = torch.flatten(x, start_dim=1)
    ypred = self(x)
    return ypred, y, F.cross_entropy(ypred, y)

  def training_step(self, batch, batch_idx):
    """Return the loss of one training batch and log its accuracy as "Train acc"."""
    ypred, y, loss = self._predict_batch(batch)
    self.log(name="Train acc",
             value=self.train_accuracy(torch.argmax(ypred, dim=1), y),
             prog_bar=True)
    return loss

  def test_step(self, batch, batch_idx=None):
    """Return the loss of one test batch and log its accuracy as "Test acc".

    `batch_idx` is accepted but unused, so the method can be called both
    with and without it.
    """
    ypred, y, test_loss = self._predict_batch(batch)
    self.log(name="Test acc",
             value=self.test_accuracy(torch.argmax(ypred, dim=1), y),
             prog_bar=True)
    return test_loss

  def configure_optimizers(self) -> torch.optim.Optimizer:
    """Return an Adam optimizer (lr=3e-4) over all parameters of the module."""
    return torch.optim.Adam(self.parameters(), lr=3e-4)

In [None]:
# Train a fresh instance of the model on the training loader
model = NeuralNet()
trainer = pl.Trainer(max_epochs=10)
trainer.fit(model=model, train_dataloaders=train_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | linear         | Linear             | 7.9 K  | train
1 | train_accuracy | MulticlassAccuracy | 0      | train
2 | test_accuracy  | MulticlassAccuracy | 0      | train
--------------------------------------------------------------
7.9 K     Trainable params
0         Non-trainable params
7.9 K     Total params
0.031     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


In [115]:
# Run the test loop of the trained model over the test loader
trainer.test(model=model, dataloaders=test_loader)

Testing: |          | 0/? [00:00<?, ?it/s]