In [1]:
import os
import torch
import pytorch_lightning as L
import matplotlib.pyplot as plt
from omegaconf import DictConfig

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default image interpolation: 'nearest' (set after the backend magic above).
plt.rcParams['image.interpolation'] = 'nearest'

# Load the autoreload extension and use mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Report the available accelerator. The device-specific queries are only valid
# when CUDA is present, so they are guarded to keep this cell runnable on a
# CPU-only machine (they raise otherwise).
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
    # Total memory of device 0, in bytes.
    print(torch.cuda.get_device_properties(0).total_memory)

# Number of logical CPUs reported by the OS
print(os.cpu_count())

# Number of threads PyTorch uses for parallelizing CPU operations
print(torch.get_num_threads())

True
Tesla V100-SXM2-32GB
34072559616
72
36


In [6]:
import os

# List the contents of the data folder (resolved relative to the current working
# directory). os.listdir returns entries in arbitrary order, so sort them to keep
# the displayed output stable across runs and file systems.
data_folder = os.path.join(os.getcwd(), "data")
sorted(os.listdir(data_folder))

['RUGD', '.gitkeep', 'RUGD_old']

In [7]:
from model import RoadModel
from pytorch_lightning.loggers import WandbLogger

from src import RoadDataModule, LogPredictionsCallback, val_checkpoint, regular_checkpoint


def main(cfg: DictConfig) -> None:
    """Train the road-segmentation model described by ``cfg`` and log to Weights & Biases.

    Args:
        cfg: Configuration object forwarded unchanged to ``RoadModel`` and
            ``RoadDataModule``.

    Notes:
        * The Lightning accelerator is derived from the same CUDA check that
          selects the torch device, so the CPU fallback actually works instead
          of failing on a hard-coded ``accelerator="gpu"``.
        * The W&B run is always closed when training ends (successfully or not),
          so re-running the notebook cell starts a fresh run rather than
          silently reusing the one that is still in progress.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Keep the Trainer's accelerator consistent with the device chosen above.
    accelerator = "gpu" if device.type == "cuda" else "cpu"

    model = RoadModel(cfg, device)
    datamodule = RoadDataModule(cfg)

    wandb_logger = WandbLogger(project="road-segmentation", name="baseline", log_model='all')

    trainer = L.Trainer(
        max_epochs=5,
        accelerator=accelerator,
        devices=1,
        logger=wandb_logger,
        callbacks=[
            LogPredictionsCallback(),
            val_checkpoint,
            regular_checkpoint,
        ],
    )

    # Assume failure until fit() returns, so a crashed run is marked as failed.
    exit_code = 1
    try:
        trainer.fit(model, datamodule=datamodule)
        # trainer.test(model, datamodule=datamodule)
        # trainer.predict(model, datamodule=datamodule)
        exit_code = 0
    finally:
        # Close the W&B run so the next WandbLogger does not attach to it.
        wandb_logger.experiment.finish(exit_code=exit_code)

In [8]:
from hydra import compose, initialize

# Compose the "config" configuration from the "conf" directory and start training
# while the Hydra context opened by `initialize` is still active.
#
# NOTE(review): the recorded run of this cell aborted during the sanity-check pass
# with a CUDA device-side assert in nll_loss2d (`t >= 0 && t < n_classes`), i.e. the
# loss received a target value outside the valid class range. Presumably the label
# masks contain indices the model head has no output channel for (or an unhandled
# "ignore" value) — verify the label encoding in the data module against the
# model's number of classes. After such an assert the CUDA context is likely
# unusable; restart the kernel before re-running (TODO confirm).
with initialize(version_base=None, config_path="conf"):
    config = compose(config_name="config")
    main(config)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/mnt/appl/software/PyTorch-Lightning/2.1.0-foss-2022a-CUDA-11.7.0/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:389: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                   | Params
-----------------------------------------------------
0 | model     | DeepLabV3              | 39.6 M
1 | criterion | CrossEntropyLoss       | 0     
2 | accuracy  | MulticlassAccuracy     | 0     
3 | jaccard   | MulticlassJaccardIndex | 0     
-----------------------------------------------------
39.6 M    Trainable params
0         Non-trainable params
39.6 M    Total params
158.536   Total estimated model params size (MB)
SLU

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/tmp/eb-build/PyTorch/2.0.1/foss-2022a-CUDA-11.7.0/pytorch-v2.0.1/aten/src/ATen/native/cuda/NLLLoss2d.cu:103: nll_loss2d_forward_kernel: block: [0,0,0], thread: [768,0,0] Assertion `t >= 0 && t < n_classes` failed.
/tmp/eb-build/PyTorch/2.0.1/foss-2022a-CUDA-11.7.0/pytorch-v2.0.1/aten/src/ATen/native/cuda/NLLLoss2d.cu:103: nll_loss2d_forward_kernel: block: [0,0,0], thread: [769,0,0] Assertion `t >= 0 && t < n_classes` failed.
/tmp/eb-build/PyTorch/2.0.1/foss-2022a-CUDA-11.7.0/pytorch-v2.0.1/aten/src/ATen/native/cuda/NLLLoss2d.cu:103: nll_loss2d_forward_kernel: block: [0,0,0], thread: [770,0,0] Assertion `t >= 0 && t < n_classes` failed.
/tmp/eb-build/PyTorch/2.0.1/foss-2022a-CUDA-11.7.0/pytorch-v2.0.1/aten/src/ATen/native/cuda/NLLLoss2d.cu:103: nll_loss2d_forward_kernel: block: [0,0,0], thread: [771,0,0] Assertion `t >= 0 && t < n_classes` failed.
/tmp/eb-build/PyTorch/2.0.1/foss-2022a-CUDA-11.7.0/pytorch-v2.0.1/aten/src/ATen/native/cuda/NLLLoss2d.cu:103: nll_loss2d_forward_kernel: blo

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
