In [1]:
import sys
import matplotlib.pyplot as plt

import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# from torchview import draw_graph
from network import SampleModel
from dataset import SampleDataset
from sklearn.model_selection import train_test_split
from pathlib import Path
from pa228_tools import train, validate
import glob


In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Computing with {}!'.format(device))

Computing with cuda!


In [3]:
def label_loss_prep(yb, dev): # takes batch
    label_dict = {
        (0, 0, 0) : 0,
        (128, 64, 128) : 1,
        (70, 70, 70) : 2,
        (153, 153, 153) : 3, 
        (107, 142, 35) : 4,
        (70, 130, 180) : 5,
        (220, 20, 60) : 6,
        (0, 0, 142) : 7
        }
    
    class_masks = torch.zeros(yb.shape[0], 8, yb.shape[1], yb.shape[2])

    for color, class_index in label_dict.items():
        color_mask = torch.all(yb == torch.tensor(color).view(1, 1, 1, 3).to(dev), dim=-1).float()
        class_masks[:, class_index, :, :] = color_mask
    
    return class_masks

def loss_batch(model, loss_func, xb, yb, dev, opt=None):
    print('enter batch')
    xb, yb = xb.to(dev), yb.to(dev)
    print('loss')
    loss = loss_func(model(xb), label_loss_prep(yb, dev))
    print('done loss')
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)


def train(model, train_dl, loss_func, dev, opt):
        
        model.train()
        loss, size = 0, 0
        for b_idx, (xb, yb) in tqdm(enumerate(train_dl), total=len(train_dl), leave=False):
            print('batching')
            b_loss, b_size = loss_batch(model, loss_func, xb, yb, dev, opt)
            print('done batching')

            loss += b_loss * b_size
            size += b_size
            
        return loss / size
    
    
def validate(model, valid_dl, loss_func, dev, opt=None):
        
        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                *[loss_batch(model, loss_func, xb, yb, dev) for xb, yb in valid_dl]
            )
            
        return np.sum(np.multiply(losses, nums)) / np.sum(nums)

In [4]:
def fit(net, batch_size, epochs, trainloader, validloader, loss_fn, optimizer, device):
    train_losses = []
    validation_losses = []

    for epoch in tqdm(range(epochs), 'epochs'):
        print('training')
        loss = train(net, trainloader, loss_fn, device, optimizer)
        print('validating')
        val_loss = validate(net, validloader, loss_fn, device)

        train_losses.append(loss)
        validation_losses.append(val_loss)
        print(f'epoch {epoch+1}/{epochs}, loss: {loss : .05f}, validation loss: {val_loss:.05f}')

      
    print('Training finished!')
    return train_losses, validation_losses

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Computing with {}!'.format(device))

torch.cuda.current_device()



Computing with cuda!


DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1712608847532/work/aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=, num_gpus=

CUDA call was originally invoked at:

  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/home/xboril/miniconda3/lib/python3.12/asyncio/base_events.py", line 639, in run_forever
    self._run_once()
  File "/home/xboril/miniconda3/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once
    handle._run()
  File "/home/xboril/miniconda3/lib/python3.12/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue
    await self.process_one()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 523, in process_one
    await dispatch(*args)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell
    await result
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 767, in execute_request
    reply_content = await reply_content
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 429, in do_execute
    res = shell.run_cell(
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell
    result = self._run_cell(
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell
    result = runner(coro)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_3741949/3725067675.py", line 4, in <module>
    import torch
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 995, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/torch/__init__.py", line 1478, in <module>
    _C._initExtension(manager_path())
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 995, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/torch/cuda/__init__.py", line 238, in <module>
    _lazy_call(_check_capability)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/torch/cuda/__init__.py", line 235, in _lazy_call
    _queued_calls.append((callable, traceback.format_stack()))


In [6]:

# config dictionary
config = {
'batch_size': 2,
'epoch': 1,
'num_workers': 1,
'dropout': 0.5,
'lr': 0.0001,
'optimizer':'Adam',
'img_size': 128,
'n_classes': 2
}

PATH = Path('{}'.format('data'), 'data_seg_public')
img_dir = PATH / 'img'
mask_dir = PATH / 'mask'
img_files = glob.glob("{}/*/*.png".format(img_dir))
mask_files = glob.glob("{}/*/*.png".format(mask_dir))
df = pd.DataFrame({'img': img_files, 'mask': mask_files})

import albumentations as A
from albumentations.pytorch import ToTensorV2

transforms = A.Compose([
                        A.SmallestMaxSize (512),
                        A.CenterCrop(512, 1024),
                        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
                        ToTensorV2(),
                        ]   
                    )


In [7]:

train_df, valid_df = train_test_split(df, test_size=.3, random_state=2)
traindataset, valdataset = SampleDataset(train_df, transforms=transforms), SampleDataset(valid_df, transforms=transforms)

trainloader = torch.utils.data.DataLoader(traindataset,
                    batch_size=config['batch_size'],
                    shuffle=False,
                    num_workers=config['num_workers'])

valloader = torch.utils.data.DataLoader(valdataset,
                    batch_size=config['batch_size'],
                    shuffle=False,
                    num_workers=config['num_workers'])


In [10]:


net = SampleModel(num_class=8)
# input_sample = torch.zeros((1, 512, 1024))
# draw_network_architecture(net, input_sample)

# define optimizer and learning rate
optimizer = torch.optim.Adam(net.parameters(), lr=config['lr'])

# define loss function
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=2)

# train the network for three epochs
print(device)
tr_losses, val_losses = fit(net, config['batch_size'], config['epoch'], trainloader, valloader, loss_fn, optimizer, device)


cuda


epochs:   0%|          | 0/1 [00:00<?, ?it/s]

training


  0%|          | 0/1143 [00:00<?, ?it/s]

batching


DeferredCudaCallError: CUDA call failed lazily at initialization with error: device >= 0 && device < num_gpus INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1712608847532/work/aten/src/ATen/cuda/CUDAContext.cpp":50, please report a bug to PyTorch. device=, num_gpus=

CUDA call was originally invoked at:

  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "/home/xboril/miniconda3/lib/python3.12/asyncio/base_events.py", line 639, in run_forever
    self._run_once()
  File "/home/xboril/miniconda3/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once
    handle._run()
  File "/home/xboril/miniconda3/lib/python3.12/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue
    await self.process_one()
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 523, in process_one
    await dispatch(*args)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell
    await result
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 767, in execute_request
    reply_content = await reply_content
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 429, in do_execute
    res = shell.run_cell(
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
    return super().run_cell(*args, **kwargs)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell
    result = self._run_cell(
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell
    result = runner(coro)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_3741949/3725067675.py", line 4, in <module>
    import torch
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 995, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/torch/__init__.py", line 1478, in <module>
    _C._initExtension(manager_path())
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 995, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/torch/cuda/__init__.py", line 238, in <module>
    _lazy_call(_check_capability)
  File "/home/xboril/miniconda3/lib/python3.12/site-packages/torch/cuda/__init__.py", line 235, in _lazy_call
    _queued_calls.append((callable, traceback.format_stack()))
