In [20]:
import multiprocessing
import shlex
from os import path as osp, getcwd

import nvidia_smi

In [23]:
# Check GPU: list the GPUs the NVIDIA driver can see (index, model name, UUID).
!nvidia-smi -L

GPU 0: Tesla V100-SXM3-32GB (UUID: GPU-3f66cb17-51c3-5eaa-68ba-1d4713de8b54)


In [13]:
# Choose num workers based on number of available CPUs.
# Two CPUs are left free for the main process and the system; the result is
# clamped to at least 1 so machines with <= 2 CPUs do not produce a zero or
# negative worker count.
n_workers = max(1, multiprocessing.cpu_count() - 2)
n_workers

94

In [25]:
# Choose batch size based on amount of available video memory.
# Heuristic: budget SAMPLES_PER_GB samples per 1e9 bytes of currently free
# memory on GPU 0, then take the largest power of two strictly below that budget.
BYTES_PER_GB = 1e9  # same value as the previous `10e8` literal, written unambiguously
SAMPLES_PER_GB = 4

nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    avail_gpu_memory = info.free  # bytes currently free on the device
finally:
    # Release NVML even if the device query fails.
    nvidia_smi.nvmlShutdown()

batch_size_choices = [2**n for n in range(0,10)]  # 1, 2, 4, ..., 512
batch_size_pre_alignment = (avail_gpu_memory / BYTES_PER_GB) * SAMPLES_PER_GB
# Fall back to the smallest choice when free memory is too low for any
# candidate to qualify, instead of raising IndexError on an empty list.
batch_size = max(
    (x for x in batch_size_choices if x < batch_size_pre_alignment),
    default=batch_size_choices[0],
)
batch_size

128

In [26]:
# Training config variables.
# Every path is resolved against the directory the notebook kernel runs in.
dataset_path, train_script, model_path, pretrained_weights = (
    osp.join(getcwd(), *relative_parts)
    for relative_parts in (
        ("..", "branching_retinal_mix_dataset"),            # dataset lives one level up
        ("train.py",),                                       # training entry point
        ("models/segformerB0_tubes.py",),                    # model config script
        ("pretrained/segformer_b0/last_checkpoint.pth",),    # initial weights
    )
)
exp_name = "segformerB0_tubes"

In [27]:
# Run training
train_args = [
    model_path,
    f"--pretrained_weights={pretrained_weights}",
    f"--dataset_path={dataset_path}",
    "--gpus=0",
    f"--workers={n_workers}",
    f"--batch-size={batch_size}",
    f"--exp-name={exp_name}",
    #"--resume-exp=000",
    #"--resume-prefix=42",
    #"--start-epoch=43"
]
execute_training_cmd = f"python {train_script} {' '.join(train_args)}"
!{execute_training_cmd}

Number of GPUs: 1
Run experiment with config:
{   'CHECKPOINTS_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB0_tubes/004_segformerB0_tubes/checkpoints'),
    'EXP_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB0_tubes/004_segformerB0_tubes'),
    'LOGS_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB0_tubes/004_segformerB0_tubes/logs'),
    'VIS_PATH': PosixPath('/nfs/hpc/share/wigginno/branching/ClickSEG/experiments/segformerB0_tubes/004_segformerB0_tubes/vis'),
    'batch_size': 128,
    'dataset_path': '/nfs/hpc/share/wigginno/branching/branching_retinal_mix_dataset',
    'device': device(type='cuda', index=0),
    'distributed': False,
    'exp_name': 'segformerB0_tubes',
    'gpu_ids': [0],
    'gpus': '0',
    'local_rank': 0,
    'model_path': '/nfs/hpc/share/wigginno/branching/ClickSEG/models/segformerB0_tubes.py',
    'multi_gpu': False,
    'ngpus': 1,
    'pretrained_w

In [None]:
# start time: 4:30pm
# end time: 