<a href="https://colab.research.google.com/github/alfie1104/deeplearning-with-pytorch/blob/main/pytorch_lightning/5_callbacks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
%%capture
!pip install lightning

In [27]:
%%capture
!pip install torchmetrics

## config.py

In [28]:
# Training hyperparameters
INPUT_SIZE = 784
NUM_CLASSES = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 64
NUM_EPOCHS = 3

# Dataset
DATA_DIR = "dataset/"
NUM_WORKERS = 4

# Compute related
ACCELERATOR = "gpu"
DEVICES = [0] # single gpu. it is the same as DEVICE = 1
PRECISION = 16

## dataset.py

In [29]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import lightning as pl

class MnistDataModule(pl.LightningDataModule):
  def __init__(self, data_dir, batch_size, num_workers):
    super().__init__()
    self.data_dir = data_dir
    self.batch_size = batch_size
    self.num_workers = num_workers

  def prepare_data(self):
    # single gpu
    datasets.MNIST(self.data_dir, train=True, download=True)
    datasets.MNIST(self.data_dir, train=False, download=True)

  def setup(self, stage):
    # multiple gpu
    entire_dataset = datasets.MNIST(
        root=self.data_dir,
        train=True,
        transform=transforms.ToTensor(),
        download=False,
    )
    self.train_ds, self.val_ds = random_split(entire_dataset, [50000, 10000])
    self.test_ds = datasets.MNIST(
        root=self.data_dir,
        train=False,
        transform=transforms.ToTensor(),
        download=False,
    )

  def train_dataloader(self):
    return DataLoader(self.train_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

  def val_dataloader(self):
    return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

  def test_dataloader(self):
    return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

## model.py

In [30]:
import torch
import torch.nn.functional as F
from torch import nn, optim
import lightning as pl
import torchmetrics

class NN(pl.LightningModule):
  def __init__(self, input_size, learning_rate, num_classes):
    super().__init__()
    self.lr = learning_rate
    self.fc1 = nn.Linear(input_size, 50)
    self.fc2 = nn.Linear(50, num_classes)
    self.loss_fn = nn.CrossEntropyLoss()
    self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
    self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)

  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

  def training_step(self, batch, batch_idx):
    loss, scores, y = self._common_step(batch, batch_idx)
    accuracy = self.accuracy(scores,y)
    f1_score = self.f1_score(scores,y)
    self.log_dict({'train_loss':loss, 'train_accuracy':accuracy, 'train_f1_score':f1_score},
                  on_step=False, on_epoch=True, prog_bar=True, logger=True)
    return {'loss' : loss, 'scores':scores, 'y':y}

  # def training_epoch_end(self, outputs):
  #   pass

  def validation_step(self, batch, batch_idx):
    loss, scores, y = self._common_step(batch, batch_idx)
    self.log('val_loss', loss)
    return loss

  def test_step(self, batch, batch_idx):
    loss, scores, y = self._common_step(batch, batch_idx)
    self.log('test_loss', loss)
    return loss

  def _common_step(self, batch, batch_idx):
    x, y = batch
    x = x.reshape(x.size(0), -1) # flatten the matrix x to array
    scores = self.forward(x)
    loss = self.loss_fn(scores, y)
    # in PyTorch lightning we don't need to care of 'zero_grad','backward','step'
    return loss, scores, y

  def predict_step(self, batch, batch_idx):
    x, y = batch
    x = x.reshape(x.size(0), -1) # flatten the matrix x to array
    scores = self.forward(x)
    preds = torch.argmax(scores, dim=1)
    return preds

  def configure_optimizers(self):
    return optim.Adam(self.parameters(), lr=self.lr)

## callbacks.py

In [31]:
from lightning.pytorch.callbacks import EarlyStopping, Callback

class MyPrintingCallback(Callback):
  def __init__(self):
    super().__init__()

  def on_train_start(self, trainer, pl_module):
    print("Starting to train!")

  def on_train_end(self, trainer, pl_module):
    print("Training is done.")

## train.py

In [32]:
import torch
import lightning as pl

torch.set_float32_matmul_precision("medium") # to make lightning happy

# 파일로 분리했을 때 아래 주석 해제 필요
# from model import NN
# from dataset import MnistDataModule
# import config
# from callbacks import MyPrintingCallback, EarlyStopping

# if __name__ == "__main__":
#   # Set device cuda for GPU if it's available otherwise run on the CPU
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


#   # Initialize network
#   model = NN(input_size=config.INPUT_SIZE, learning_rate=config.LEARNING_RATE, num_classes=config.NUM_CLASSES)

#   # Data Module
#   dm = MnistDataModule(data_dir=config.DATA_DIR, batch_size=config.BATCH_SIZE, num_workers=config.NUM_WORKERS)

#   trainer = pl.Trainer(accelerator=config.ACCELERATOR, devices=config.DEVICES, min_epochs=1, max_epochs=config.NUM_EPOCHS, precision=config.PRECISION, callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")])

#   trainer.fit(model, dm)
#   trainer.validate(model, dm)
#   trainer.test(model, dm)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize network
model = NN(input_size=INPUT_SIZE, learning_rate=LEARNING_RATE, num_classes=NUM_CLASSES)

# Data Module
dm = MnistDataModule(data_dir=DATA_DIR, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

trainer = pl.Trainer(accelerator=ACCELERATOR, devices=DEVICES, min_epochs=1, max_epochs=NUM_EPOCHS, precision=PRECISION, callbacks=[MyPrintingCallback(), EarlyStopping(monitor="val_loss")])

trainer.fit(model, dm)
trainer.validate(model, dm)
trainer.test(model, dm)

/usr/local/lib/python3.10/dist-packages/lightning/fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name     | Type               | Params | Mode 
--------------------------------------------------------
0 | fc1      | Linear             | 39.2 K | train
1 | fc2      | Linear             | 510    | train
2 | loss_fn  | CrossEntropyLoss   | 0      | train
3 | accuracy | MulticlassAccuracy | 0      | train
4 | f1_score | 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Starting to train!


Training: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Training is done.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.1755320131778717}]