In [1]:
from cards_class import CardsDataset, unpackBoundigBox, show, collate_fn
import torch
import time
import torch.backends.cudnn as cudnn
import torch.optim
from torch.utils.data import DataLoader
from model import SSD300, MultiBoxLoss
from utils import *

In [2]:
# Build the cards dataset from the three project paths below
# (presumably: image folder, per-image annotation folder, class-name list — confirm in cards_class).
dataset = CardsDataset(
    'data/images',
    'data/txt_cards',
    'data/general_labels/classes.txt',
)
# Inverse label lookup exposed by the dataset (presumably class name -> index — TODO confirm).
label_map = dataset.invLabels
# One entry per class in dataset.labels, background included.
num_classes = len(dataset.labels)

In [5]:
# Show the index -> class-name mapping, then how many classes it holds.
for heading, value in (('Classes:\n', dataset.labels),
                       ('Number of classes:\n', num_classes)):
    print(heading, value)

Classes:
 {0: 'background', 1: 'a_heart', 2: 'a_diamond', 3: 'a_club', 4: 'a_spade', 5: '2_heart', 6: '2_diamond', 7: '2_club', 8: '2_spade', 9: '3_heart', 10: '3_diamond', 11: '3_club', 12: '3_spade', 13: '4_heart', 14: '4_diamond', 15: '4_club', 16: '4_spade', 17: '5_heart', 18: '5_diamond', 19: '5_club', 20: '5_spade', 21: '6_heart', 22: '6_diamond', 23: '6_club', 24: '6_spade', 25: '7_heart', 26: '7_diamond', 27: '7_club', 28: '7_spade', 29: '8_heart', 30: '8_diamond', 31: '8_club', 32: '8_spade', 33: '9_heart', 34: '9_diamond', 35: '9_club', 36: '9_spade', 37: '10_heart', 38: '10_diamond', 39: '10_club', 40: '10_spade', 41: 'j_heart', 42: 'j_diamond', 43: 'j_club', 44: 'j_spade', 45: 'q_heart', 46: 'q_diamond', 47: 'q_club', 48: 'q_spade', 49: 'k_heart', 50: 'k_diamond', 51: 'k_club', 52: 'k_spade', 53: 'joker'}
Number of classes:
 54


In [6]:
# Train on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
# Report which device was selected; this can be the CPU, so the label says "device", not "GPU".
print('Available device:\n', device)

Availabele GPU:
 cpu


In [13]:
# show(dataset[100], 1)

tensor([46, 46,  1,  1], dtype=torch.int32)


In [8]:
# Learning parameters

# -- Checkpointing --
checkpoint = None  # path to a model checkpoint (e.g. 'cards.pth.tar'); None starts from scratch

# -- Data loading --
batch_size = 2  # samples per batch
workers = 4  # number of DataLoader worker processes

# -- Schedule --
iterations = 12000  # total number of training iterations
decay_lr_at = [8000, 10000]  # iteration counts after which the learning rate is decayed
decay_lr_to = 0.1  # the learning rate is decayed to this fraction of its current value
print_freq = 50  # print the training status every this many batches

# -- Optimizer --
lr = 1e-4  # base learning rate
momentum = 0.9  # SGD momentum
weight_decay = 5e-4  # weight decay
# Gradient clipping threshold, or None to disable. Clip if gradients explode, which may happen
# at larger batch sizes (sometimes at 32); the symptom is a sorting error in the MultiBox loss.
grad_clip = None

# Turn on cuDNN benchmark mode.
cudnn.benchmark = True

In [9]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
    One epoch's training.

    Reads the notebook-level settings ``device``, ``grad_clip`` and ``print_freq``.
    An empty ``train_loader`` is a no-op: no batches are processed and nothing is printed.

    :param train_loader: DataLoader for training data, yielding (images, boxes, labels) batches
    :param model: model
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number (used only in the status line)
    """
    model.train()  # training mode enables dropout

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    start = time.time()

    # Batches
    for i, (images, boxes, labels) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to default device
        images = images.to(device)  # (batch_size (N), 3, 300, 300)
        boxes = [cxcy_to_xy(b).to(device) for b in boxes]
        labels = [l.to(device) for l in labels]

        # Forward prop.
        predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients, if necessary
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, len(train_loader),
                                                                  batch_time=batch_time,
                                                                  data_time=data_time, loss=losses))

        # Free the batch tensors (their histories may be stored) before the next batch is
        # fetched. Doing this inside the loop, rather than once after it, also keeps an
        # empty loader from hitting a `del` of names that were never bound.
        del predicted_locs, predicted_scores, loss, images, boxes, labels


In [10]:
def main():
    """
    Build (or restore) the SSD300 model and its optimizer, then train and checkpoint it.

    Reads the notebook-level settings ``checkpoint``, ``num_classes``, ``lr``, ``momentum``,
    ``weight_decay``, ``batch_size``, ``workers``, ``iterations``, ``decay_lr_at``,
    ``decay_lr_to``, ``device`` and ``dataset``. It publishes ``start_epoch`` and ``epoch``
    as globals. The settings themselves are never rebound, so the cell can be re-run
    without first re-running the configuration cell.

    :raises ValueError: if the dataset holds fewer samples than one batch
    """
    global start_epoch, epoch

    # Initialize model or load checkpoint
    if checkpoint is None:
        start_epoch = 0
        model = SSD300(n_classes=num_classes)
        # Initialize the optimizer, with twice the default learning rate for biases, as in the original Caffe repo
        biases = []
        not_biases = []
        for param_name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            if param_name.endswith('.bias'):
                biases.append(param)
            else:
                not_biases.append(param)
        optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                                    lr=lr, momentum=momentum, weight_decay=weight_decay)
    else:
        # NOTE(security): torch.load unpickles arbitrary objects; only load trusted checkpoints.
        # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
        # The result is kept in a local so the global `checkpoint` path is not clobbered.
        saved_state = torch.load(checkpoint, map_location=device)
        start_epoch = saved_state['epoch'] + 1
        print('Loaded checkpoint from epoch %d.' % start_epoch)
        model = saved_state['model']
        optimizer = saved_state['optimizer']

    # Move to default device
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

    train_dataset = dataset

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=workers, collate_fn=collate_fn,
                              drop_last=True)

    # Convert the iteration-based schedule into epochs.
    batches_per_epoch = len(train_dataset) // batch_size
    if batches_per_epoch == 0:
        raise ValueError('Dataset has %d samples, fewer than batch_size=%d; no full batch can be formed.'
                         % (len(train_dataset), batch_size))

    epochs = iterations // batches_per_epoch
    print('Number of epochs:\n', epochs)
    # Local list: the global `decay_lr_at` stays expressed in iterations.
    decay_epochs = [it // batches_per_epoch for it in decay_lr_at]
    print('Decaying learning rate at epochs:\n', decay_epochs)

    # Epochs
    for epoch in range(start_epoch, epochs):

        # Decay learning rate at particular epochs
        if epoch in decay_epochs:
            adjust_learning_rate(optimizer, decay_lr_to)

        # One epoch's training
        train(train_loader=train_loader, model=model, criterion=criterion,
              optimizer=optimizer, epoch=epoch)

        # Save checkpoint every second epoch
        if epoch % 2 == 0:
            save_checkpoint('cards.pth.tar', epoch, model, optimizer)

In [11]:
# Start training with the settings defined in the cells above.
main()


Loaded base model.

Number of epochs:
 60
Decaying learning rate at epochs:
 [40, 50]
Epoch: [0][0/200]	Batch Time 6.936 (6.936)	Data Time 1.610 (1.610)	Loss 29.0324 (29.0324)	


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x10d029160>
Traceback (most recent call last):
  File "/Users/acano/.virtualenvs/py3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/Users/acano/.virtualenvs/py3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/local/opt/python@3.8/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/loc

Traceback (most recent call last):
  File "/Users/acano/.virtualenvs/py3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-263240bbee7e>", line 1, in <module>
    main()
  File "<ipython-input-10-fee3d661165c>", line 56, in main
    train(train_loader=train_loader, model=model,criterion=criterion,
  File "<ipython-input-9-902505160ddf>", line 34, in train
    loss.backward()
  File "/Users/acano/.virtualenvs/py3/lib/python3.8/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/Users/acano/.virtualenvs/py3/lib/python3.8/site-packages/torch/autograd/__init__.py", line 130, in backward
    Variable._execution_engine.run_backward(
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/acano/.virtualenvs/py3/lib/p

TypeError: object of type 'NoneType' has no len()