In [1]:
import os
import lightning.pytorch as pl

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import wandb
from lightning.pytorch.loggers import WandbLogger

In [2]:
PATH_DATASETS = os.environ.get("PATH_DATASETS","/users/PLS0129/ysu0053/CSCI4852_6852_F23_DL/data")
BATCH_SIZE = 256 if torch.cuda.is_available() else 64

In [3]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print('Number of GPUs:',torch.cuda.device_count())
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda
Number of GPUs: 1

Tesla V100-PCIE-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [4]:
# Define the Lightning module
class MNISTLightning(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log('val_loss', loss)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log('test_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(mnist_train, batch_size=32, shuffle=True)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(mnist_val, batch_size=32)
    
    # Add a test dataloader
    def test_dataloader(self):
        return torch.utils.data.DataLoader(mnist_test, batch_size=32, shuffle=False)

In [5]:
# Init our model
mnist_model = MNISTLightning()

# Init DataLoader from MNIST Dataset
mnist_full = datasets.MNIST(PATH_DATASETS, train=True, download=True, transform=transforms.ToTensor())
mnist_train, mnist_val = random_split(mnist_full, [50000, 10000])
#train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE,num_workers=2)
#val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE,num_workers=2)

mnist_test = datasets.MNIST(PATH_DATASETS, train=False, download=True, transform=transforms.ToTensor())
#test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE,num_workers=2)

# Initialize wandb
wandb.init(project='mnist_mlp')
settings=wandb.Settings(silent="True")

# Create the WandbLogger
wandb_logger = WandbLogger()

# Initialize a trainer
trainer = pl.Trainer(max_epochs=5, logger=wandb_logger) #    accelerator="auto",devices=1,

# Train the model ⚡
trainer.fit(mnist_model)

#Test
trainer.test()
# Close wandb run
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malazar[0m. Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 401 K 
1 | fc2  | Linear | 131 K 
2 | fc3  | Linear | 2.6 K 
--------------------------------
535 K     Trainable params
0         Non-trainable params
535 K     Total params
2.143     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 1563/1563 [00:10<00:00, 152.30it/s, v_num=5e5e]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/313 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/313 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 1/313 [00:00<00:01, 270.74it/s][A
Validation DataLoader 0:   1%|          | 2/313 [00:00<00:01, 219.20it/s][A
Validation DataLoader 0:   1%|          | 3/313 [00:00<00:01, 207.21it/s][A
Validation DataLoader 0:   1%|▏         | 4/313 [00:00<00:01, 202.88it/s][A
Validation DataLoader 0:   2%|▏         | 5/313 [00:00<00:01, 197.88it/s][A
Validation DataLoader 0:   2%|▏         | 6/313 [00:00<00:01, 194.41it/s][A
Validation DataLoader 0:   2%|▏         | 7/313 [00:00<00:01, 182.17it/s][A
Validation DataLoader 0:   3%|▎         | 8/313 [00:00<00:01, 181.72it/s][A
Validation DataLoader 0:   3%|▎         | 9/313 [00:00<00:01, 181.41it/s][A
Validation DataLoader 0:   3%|▎         | 10/313 [00:00<00:01, 179.87it/s][

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 1563/1563 [00:12<00:00, 130.14it/s, v_num=5e5e]


  rank_zero_warn(
Restoring states from the checkpoint path at ./lightning_logs/2s4x5e5e/checkpoints/epoch=4-step=7815.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at ./lightning_logs/2s4x5e5e/checkpoints/epoch=4-step=7815.ckpt
SLURM auto-requeueing enabled. Setting signal handlers.
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 313/313 [00:01<00:00, 219.63it/s]




0,1
epoch,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▇▇▇▇▇▇▇█
test_loss,▁
train_loss,█▄▅▂▂▁▂▁▁▂▂▁▁▄▁▁▁▂▁▁▂▂▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▂
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▄▁▂▂

0,1
epoch,5.0
test_loss,0.07697
train_loss,0.06331
trainer/global_step,7815.0
val_loss,0.09032


In [6]:
trainer.test(mnist_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
SLURM auto-requeueing enabled. Setting signal handlers.


Testing DataLoader 0: 100%|██████████| 313/313 [00:01<00:00, 244.56it/s]

UsageError: Run (2s4x5e5e) is finished. The call to `log` will be ignored. Please make sure that you are using an active run.