<a href="https://colab.research.google.com/github/alfie1104/deeplearning-with-pytorch/blob/main/pytorch_lightning/pytorch_lightning_MNIST_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.3.0-py3-none-any.whl (812 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.2/812.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.0.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.0.0->pytorch-lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.0.0->pyt

In [2]:
!pip install torchmetrics



# PyTorch Lightning

1. model
2. optimizer
3. data
4. training loop "the magic"
5. validation loop "the validation magic"

In [3]:
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

In [4]:
import pytorch_lightning as pl
from torchmetrics.functional import accuracy

class ResNet(pl.LightningModule):
  """Small fully connected MNIST classifier with one residual (skip) connection.

  Despite the name this is not a convolutional ResNet: it is a three-layer MLP
  (784 -> 64 -> 64 -> 10) in which the output of the first hidden layer is
  added to the output of the second one before dropout and the output layer.

  The module also owns its data pipeline (``prepare_data``, ``setup`` and the
  two dataloader hooks), so ``trainer.fit(model)`` needs no extra arguments.
  """

  def __init__(self):
    super().__init__()
    self.l1 = nn.Linear(28*28, 64)
    self.l2 = nn.Linear(64,64)
    self.l3 = nn.Linear(64,10)
    self.do = nn.Dropout(0.1)
    self.loss = nn.CrossEntropyLoss()
    # Per-batch validation results: filled by validation_step, then reduced
    # and cleared by on_validation_epoch_end.
    self.val_step_outputs = []

  def forward(self, x):
    """Return class logits for a batch of flattened images.

    Args:
      x: tensor of shape (batch, 784).

    Returns:
      Tensor of shape (batch, 10) holding unnormalised class scores.
    """
    h1 = nn.functional.relu(self.l1(x))
    h2 = nn.functional.relu(self.l2(h1))
    # Skip connection: if the second layer is not useful, training can push
    # its pre-activations negative so the ReLU zeroes h2 and only h1 passes.
    do = self.do(h2 + h1)
    logits = self.l3(do)
    return logits

  # optimizer
  def configure_optimizers(self):
    """Return the optimizer Lightning should use: plain SGD with lr=1e-2."""
    optimizer = optim.SGD(self.parameters(), lr=1e-2)
    return optimizer

  def _shared_step(self, batch):
    """Run one batch through the model and return ``(loss, accuracy)``.

    Shared by the training and validation steps so that neither triggers the
    other's logging.
    """
    x, y = batch

    # x : batch size x channel x 28 x 28 -> batch size x 784.
    # No manual .cuda()/.to(device) calls: Lightning places the batch on the
    # right device before this method runs.
    x = x.view(x.size(0), -1)

    # 1) forward
    logits = self(x)

    # 2) compute the objective function
    J = self.loss(logits, y)

    acc = accuracy(logits, y, task="multiclass", num_classes=10)
    return J, acc

  # training loop
  def training_step(self, batch, batch_idx):
    """Compute the training loss for one batch and log the batch accuracy.

    Returns:
      Dict with the key ``'loss'`` (the tensor Lightning back-propagates)
      and, for backward compatibility, the legacy ``'progress_bar'`` dict.
    """
    J, acc = self._shared_step(batch)

    # Lightning 2.x ignores a returned 'progress_bar' dict; metrics only reach
    # the progress bar / logger when they go through self.log.
    self.log('train_acc', acc, prog_bar=True)

    pbar = {'train_acc':acc}
    return {'loss':J, 'progress_bar':pbar} # return {'loss':J} is same as return J in PyTorch-lightning

  def validation_step(self, batch, batch_idx):
    """Evaluate one validation batch and store its result for the epoch average.

    Returns:
      Dict with ``'loss'`` and ``'progress_bar': {'val_acc': ...}`` (same
      layout as before).
    """
    J, acc = self._shared_step(batch)
    results = {'loss':J, 'progress_bar':{'val_acc':acc}}

    # Keep a detached copy so on_validation_epoch_end has something to
    # average; previously nothing was ever appended and the mean was NaN.
    self.val_step_outputs.append(
      {'loss':J.detach(), 'progress_bar':{'val_acc':acc.detach()}}
    )
    return results

  def on_validation_epoch_end(self):
    """Average the collected per-batch validation metrics and log them.

    Logs ``val_loss`` and ``avg_val_acc`` (unweighted means over batches) and
    clears the buffer. The returned dict is kept for backward compatibility;
    Lightning itself does not use the return value of this hook.
    """
    if not self.val_step_outputs:
      # Nothing was collected (e.g. validation produced no batches).
      return None

    # [results, results, results, ...] = self.val_step_outputs
    avg_val_loss = torch.stack([x['loss'] for x in self.val_step_outputs]).mean()
    avg_val_acc = torch.stack([x['progress_bar']['val_acc'] for x in self.val_step_outputs]).mean()

    self.val_step_outputs.clear() # free memory

    self.log('val_loss', avg_val_loss, prog_bar=True)
    self.log('avg_val_acc', avg_val_acc, prog_bar=True)

    pbar = {'avg_val_acc':avg_val_acc}
    return {'val_loss':avg_val_loss, "progress_bar":pbar}

  def prepare_data(self):
    """Download the MNIST training set into ./data if it is not there yet."""
    datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor())

  def setup(self, stage=None):
    """Load the downloaded MNIST training set and split it 55000 / 5000."""
    # Train, Val split
    dataset = datasets.MNIST('data', train=True, download=False, transform=transforms.ToTensor())
    self.train_data, self.val_data = random_split(dataset, [55000, 5000])

  def train_dataloader(self):
    """Return the training loader (batch size 32, reshuffled every epoch)."""
    # shuffle=True: without it SGD sees the batches in the same order each epoch.
    train_loader = DataLoader(self.train_data, batch_size=32, shuffle=True)
    return train_loader

  def val_dataloader(self):
    """Return the validation loader (batch size 32, fixed order)."""
    val_loader = DataLoader(self.val_data, batch_size=32)
    return val_loader


# Instantiate the module; its layers keep their initial weights until
# trainer.fit(model) is run in the next cell.
model = ResNet()

In [5]:
# Train for a small, fixed number of epochs; without max_epochs (or max_steps)
# the Trainer would keep going for up to 1000 epochs.
# accelerator='auto' picks the GPU when one is available (as in the recorded
# run) and falls back to CPU otherwise, instead of raising on a CPU-only runtime.
trainer = pl.Trainer(max_epochs=5, devices=1, accelerator='auto')
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:02<00:00, 4527902.37it/s]


Extracting data/MNIST/raw/train-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 133843.46it/s]


Extracting data/MNIST/raw/train-labels-idx1-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:01<00:00, 1092207.10it/s]


Extracting data/MNIST/raw/t10k-images-idx3-ubyte.gz to data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 3497435.06it/s]


Extracting data/MNIST/raw/t10k-labels-idx1-ubyte.gz to data/MNIST/raw



INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name | Type             | Params | Mode 
--------------------------------------------------
0 | l1   | Linear           | 50.2 K | train
1 | l2   | Linear           | 4.2 K  | train
2 | l3   | Linear           | 650    | train
3 | do   | Dropout          | 0      | train
4 | loss | CrossEntropyLoss | 0      | train
--------------------------------------------------
55.1 K    Trainable params
0         Non-trainable params
55.1 K    Total params
0.220     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [6]:
# Each Trainer run gets its own lightning_logs/version_<N>/ folder; the saved
# checkpoint(s) for that run are in its checkpoints/ subfolder.
# To inspect one run's checkpoints, e.g.: !ls lightning_logs/version_0/checkpoints
!ls lightning_logs/

version_0
