In [1]:
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

### Step 1: Define LightningModule

In [12]:
class LitAutoEncoder(pl.LightningModule):
    """Fully connected autoencoder over flattened 28x28 inputs.

    The encoder maps a 784-element vector to a 3-element embedding through a
    64-unit hidden layer with ReLU; the decoder mirrors that path back to 784
    elements.

    Args:
        batch_size: Stored on the instance as ``self.batch_size``. Nothing
            else in this class reads it.
    """

    def __init__(self, batch_size=10):
        super().__init__()
        self.batch_size = batch_size

        n_pixels = 28 * 28
        hidden_dim, latent_dim = 64, 3

        # Encoder layers are created before decoder layers, in input-to-output
        # order, so parameter initialisation consumes the RNG in a fixed order.
        encoder_layers = [
            nn.Linear(n_pixels, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_pixels),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the encoder embedding of ``x`` (inference path).

        ``x`` is passed to the encoder as-is; it is not flattened here.
        """
        return self.encoder(x)

    def training_step(self, batch, batch_idx):
        """Compute the reconstruction loss for one training batch.

        This defines the training loop body and does not go through
        ``forward``.

        Args:
            batch: A two-element ``(inputs, targets)`` pair; the second
                element is unpacked but not used.
            batch_idx: Index of the batch (unused).

        Returns:
            The mean-squared error between the decoder output and the
            flattened inputs.
        """
        inputs, _ = batch
        # Collapse everything after the batch dimension into one vector per sample.
        flat_inputs = inputs.view(inputs.size(0), -1)
        reconstruction = self.decoder(self.encoder(flat_inputs))
        loss = F.mse_loss(reconstruction, flat_inputs)
        # Logging to TensorBoard by default
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

### Step 2: Fit with Lightning Trainer

In [3]:
# Fetch MNIST into the current working directory (downloading it if needed)
# and convert every image to a tensor.
dataset = MNIST(
    root=os.getcwd(),
    transform=transforms.ToTensor(),
    download=True,
)
# No batch_size/shuffle arguments are given, so DataLoader defaults apply.
train_loader = DataLoader(dataset=dataset)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/train-images-idx3-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/train-labels-idx1-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/t10k-images-idx3-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw/t10k-labels-idx1-ubyte.gz to /home/ahsong/Shell_DS/91_lightning-hydra/MNIST/raw

Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [14]:
# init model
autoencoder = LitAutoEncoder()

# Scalars logged with self.log(...) are written under ./tb_logs/my_model.
logger = TensorBoardLogger('tb_logs', name='my_model')

# Lightning 1.x uses default_root_dir as given and does not expand "~", so the
# raw string would create a literal "~" directory under the working directory.
# Expand it explicitly to get the intended location in the home directory.
root_dir = os.path.expanduser("~/Shell_DS/91_lightning-hydra/lightning_logs")

# Use one GPU when CUDA is available and fall back to CPU otherwise, so the
# cell also runs on machines without a GPU.
n_gpus = min(1, torch.cuda.device_count())

# most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
# NOTE(review): auto_scale_batch_size appears to take effect only when
# trainer.tune(...) is called (commented out below), and train_loader is built
# outside the model -- confirm whether this flag is still wanted.
trainer = pl.Trainer(max_epochs=4, gpus=n_gpus, logger=logger, auto_scale_batch_size=True,
                     default_root_dir=root_dir)

# --- Alternatives kept for reference (not executed) ---
# trainer = pl.Trainer(gpus=8) (if you have GPUs)

# # train using Sharded DDP
# from pytorch_lightning.plugins.ddp_sequential_plugin import DDPSequentialPlugin
# trainer = Trainer(gpus=8, accelerator='ddp', plugins='ddp_sharded')

# plugin = DDPSequentialPlugin(balance=[1, 1, 1, 1])
# trainer = Trainer(accelerator='ddp', gpus=4, plugins=[plugin])

# # find the batch size
# trainer.tune(autoencoder)

# tuner = Tuner(trainer)
# # Invoke method
# new_batch_size = tuner.scale_batch_size(model, *extra_parameters_here)

# # Override old batch size (this is done automatically)
# model.hparams.batch_size = new_batch_size


trainer.fit(autoencoder, train_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 50.4 K
1 | decoder | Sequential | 51.2 K
---------------------------------------
101 K     Trainable params
0         Non-trainable params
101 K     Total params
0.407     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

The Trainer automates:

- Epoch and batch iteration
- Calling of optimizer.step(), backward, zero_grad()
- Calling of .eval(), enabling/disabling grads
- Weights loading
- TensorBoard logging (see loggers options)
- Multi-GPU support
- TPU support
- 16-bit precision AMP support

### LOGGERS
Lightning supports the most popular logging frameworks (TensorBoard, etc.)

In [None]:
# def training_step(self, batch, batch_idx):
#     self.log('my_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
#     img = ...
#     log_image(img, self.trainer.log_dir)


# class MyModule(LightningModule):
#     def any_lightning_module_function_or_hook(self):
#         some_img = fake_image()
#         self.logger.experiment.add_image('generated_images', some_img, 0)


# from pytorch_lightning.loggers import MLFlowLogger
# mlf_logger = MLFlowLogger(experiment_name="default", tracking_uri="file:./ml-runs")
# trainer = Trainer(logger=mlf_logger)


# from pytorch_lightning.loggers import TensorBoardLogger
# logger = TensorBoardLogger('tb_logs', name='my_model')


# trainer = pl.Trainer(max_epochs=4, gpus=1, logger=logger, auto_scale_batch_size=True,
#                      default_root_dir="~/Shell_DS/91_lightning-hydra/lightning_logs")

In [19]:
# Serve the 'tb_logs' directory (the TensorBoardLogger save directory used in
# this notebook) with TensorBoard. The command runs in the foreground; the
# recorded output ends with ^C, i.e. it was stopped by interrupting the cell.
!tensorboard --logdir="tb_logs" 

2021-06-18 13:24:18.040667: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.4.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [20]:
# Serve the 'lightning_logs' directory with TensorBoard (foreground process,
# stopped by interrupting the cell).
# NOTE(review): the Trainer in this notebook is given an explicit
# TensorBoardLogger writing to 'tb_logs', so presumably 'lightning_logs' only
# holds runs from sessions that used the default logger -- confirm this is the
# directory you mean to inspect.
!tensorboard --logdir="lightning_logs"

2021-06-18 13:30:10.950855: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.4.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C
