In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..

/home/ubuntu/s22-unet-vae


In [3]:
import sys

sys.path.append('.')
sys.path.append('./models')
sys.path.append('./data')
sys.path.append('./notebooks')
sys.path.append('./utils')

In [4]:
!pip install -q -r requirements.txt

In [5]:
from lightning.pytorch import Trainer, seed_everything
import lightning as pl
from lightning.pytorch.callbacks import (ModelCheckpoint, 
                                         LearningRateMonitor, 
                                         RichModelSummary,
                                         EarlyStopping)
from config import UNetConfig, load_config, update_config
import torch
import os
from lit_unet import LitUNet
from datamodule import DataModule
from callbacks import ClassAccuracyLoss, PlotExampleCallback
import wandb
from lightning.pytorch.loggers import WandbLogger

os.environ["WANDB_NOTEBOOK_NAME"] = "./notebooks/train-unet-model-1.ipynb"
wandb.init(settings=wandb.Settings(_service_wait=300))

[34m[1mwandb[0m: Currently logged in as: [33makv1000[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112214966669854, max=1.0…

In [6]:
torch.cuda.empty_cache()
pl.seed_everything(42, workers=True)

Seed set to 42


42

In [7]:
config = UNetConfig()
json_data = load_config("training_1.json")
config = update_config(config, json_data)
print(config)

UNetConfig(in_channels=3, out_channels=3, num_filters=64, num_layers=4, batch_size=16, learning_rate=0.001, epochs=25, root_dir='./data', height=240, width=240, optimizer='adam', loss_function='cross_entropy', channel_reduction_method='max_pool', channel_expansion_method='transposed_conv', softmax_dim=1, dropout_rate=0.5, augmentation=True)


In [8]:
config.batch_size = 32
print(config)

UNetConfig(in_channels=3, out_channels=3, num_filters=64, num_layers=4, batch_size=32, learning_rate=0.001, epochs=25, root_dir='./data', height=240, width=240, optimizer='adam', loss_function='cross_entropy', channel_reduction_method='max_pool', channel_expansion_method='transposed_conv', softmax_dim=1, dropout_rate=0.5, augmentation=True)


In [9]:
# initialise the wandb logger and name your wandb project
wandb_logger = WandbLogger(project='s22-unet')

# add your batch size to the wandb config
wandb_logger.experiment.config["batch_size"] = config.batch_size

/opt/conda/envs/pytorch/lib/python3.11/site-packages/lightning/pytorch/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


In [10]:
data_module = DataModule(config)
data_module.setup()

In [11]:
model = LitUNet(config)

In [12]:
callback = [ModelCheckpoint(dirpath="checkpoints/",
                                                         save_top_k=3,
                                                         monitor="val_loss",
                                                         mode="min",
                                                         filename="model-{epoch:02d}-{val_loss:.2f}-{val_loss:4f}",
                                                         save_last=True,
                                                         verbose=True,),
                              ClassAccuracyLoss(),
                              LearningRateMonitor(logging_interval='step', log_momentum=True)]

In [13]:
trainer = pl.Trainer(precision="16-mixed",
                     max_epochs= config.epochs,
                     accelerator="cuda",
                     devices="auto",
                     logger = wandb_logger,
                     callbacks=callback,
                    check_val_every_n_epoch=3,
                    num_sanity_val_steps=2)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [14]:
from lightning.pytorch.tuner import Tuner

# Create a Tuner
tuner = Tuner(trainer)

# Finding the learning rate
lr_finder =tuner.lr_find(model,datamodule=data_module, min_lr=1e-4, max_lr=1, num_training=trainer.max_epochs,attr_name='learning_rate')

# Plot with
fig = lr_finder.plot(suggest=True)
fig.show()
# Pick point based on plot, or get suggestion
new_lr = lr_finder.suggestion()
print(f"{new_lr=}")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/25 [00:00<?, ?it/s]

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 30 but got size 60 for tensor number 1 in the list.

In [None]:
new_lr = model.learning_rate