In [1]:
import os
import random

import cv2
import numpy as np

import torch
from torch.utils.data import DataLoader
from torchvision import models

from src.resnet_yolo import resnet50
from yolo_loss import YoloLoss
from src.dataset import VocDetectorDataset
from src.eval_voc import evaluate
from src.predict import predict_image
from src.config import VOC_CLASSES, COLORS
from kaggle_submission import output_submission_csv

import matplotlib.pyplot as plt
import collections

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Initialization

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# YOLO network hyperparameters
#   S: width/height of the network output grid (larger than the paper's 7x7
#      because a different backbone network is used)
#   B: number of bounding box predictions per grid cell
S = 14
B = 2

To implement Yolo we will rely on a pretrained classifier as the backbone for our detection network. PyTorch offers a variety of models which are pretrained on ImageNet in the [`torchvision.models`](https://pytorch.org/docs/stable/torchvision/models.html) package. In particular, we will use the ResNet50 architecture as a base for our detector. This is different from the base architecture in the Yolo paper and also results in a different output grid size (14x14 instead of 7x7).

Models are typically pretrained on ImageNet since the dataset is very large (> 1 million images) and widely used. The pretrained model provides a very useful weight initialization for our detector, so that the network is able to learn quickly and effectively.

In [4]:
# Set `load_network_path` to a checkpoint file to resume from a previously
# trained detector; leave it as None to start from the pretrained backbone.
load_network_path = None #'checkpoints/best_detector.pth' 
pretrained = True

# use to load a previously trained network
if load_network_path is not None:
    print('Loading saved network from {}'.format(load_network_path))
    net = resnet50().to(device)
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU machine can still be loaded on a CPU-only one.
    net.load_state_dict(torch.load(load_network_path, map_location=device))
else:
    print('Load pre-trained model')
    net = resnet50(pretrained=pretrained).to(device)

Load pre-trained model




In [5]:
# Optimisation settings
batch_size = 24
learning_rate = 1e-3
# NOTE(review): with a single epoch, the epoch-based milestones used by the
# training cell (learning-rate drops at 30/40, checkpoints at 5..40) are never
# reached — raise this for a full training run.
num_epochs = 1

# Yolo loss component coefficients (as given in Yolo v1 paper)
lambda_noobj = 0.5
lambda_coord = 5

## Reading Pascal Data

Since Pascal is a small dataset (5000 in train+val) we have combined the train and val splits to train our detector. This is not typically a good practice, but we will make an exception in this case to be able to get reasonable detection results with a comparatively small object detection dataset.

The train dataset loader also uses a variety of data augmentation techniques, including random shift, scaling, crop, and flips. Data augmentation is slightly more complicated for detection datasets since the bounding box annotations must be kept consistent throughout the transformations.

Since the output of the detector network we train is an SxSx(B*5+C) tensor, we use an encoder to convert the original bounding box coordinates into relative grid bounding box coordinates corresponding to the expected output. We also use a decoder, which allows us to convert in the opposite direction, back into image coordinate bounding boxes.

In [6]:
# Locations of the training images and of the annotation list describing them.
annotation_file_train = 'data/voc2007.txt'
file_root_train = 'data/VOCdevkit_2007/VOC2007/JPEGImages/'

# Dataset in training mode, built for an S x S output grid.
train_dataset = VocDetectorDataset(
    root_img_dir=file_root_train,
    dataset_file=annotation_file_train,
    train=True,
    S=S,
)

# Batches are reshuffled every epoch and fetched by two worker processes.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
print('Loaded %d train images' % len(train_dataset))

Initializing dataset
Loaded 5011 train images


In [7]:
#train_dataset[0][0].size()

In [8]:
# Locations of the test images and of the annotation list describing them.
annotation_file_test = 'data/voc2007test.txt'
file_root_test = 'data/VOCdevkit_2007/VOC2007test/JPEGImages/'

# Dataset in non-training mode (train=False), built for the same S x S grid.
test_dataset = VocDetectorDataset(
    root_img_dir=file_root_test,
    dataset_file=annotation_file_test,
    train=False,
    S=S,
)

# Fixed order (no shuffling) so the test loss is computed over the same batches each time.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)
print('Loaded %d test images' % len(test_dataset))

Initializing dataset
Loaded 4950 test images


In [9]:
# Sanity check: fetch the first training sample and show the shape of its
# first element (the image tensor fed to the network).
data = train_dataset[0]
data[0].shape

torch.Size([3, 448, 448])

## Set up training tools

In [10]:
# Detection loss configured with the grid size, boxes per cell and the two
# loss-term weights chosen above.
criterion = YoloLoss(S, B, lambda_coord, lambda_noobj)

# SGD with momentum and weight decay over every parameter of the network.
sgd_options = dict(lr=learning_rate, momentum=0.9, weight_decay=5e-4)
optimizer = torch.optim.SGD(net.parameters(), **sgd_options)

## Train detector

In [11]:
# Train the detector for `num_epochs` epochs. After every epoch the average
# test loss is computed and checkpoints are written under 'checkpoints/'.

# Create the checkpoint directory up front so that a missing folder cannot make
# torch.save() fail only after a whole epoch of training has been spent.
os.makedirs('checkpoints', exist_ok=True)

best_test_loss = np.inf
# Reset the starting learning rate here so that re-running this cell does not
# continue from a value an earlier run already decayed.
learning_rate = 1e-3
for epoch in range(num_epochs):
    net.train()

    # Update learning rate late in training
    if epoch == 30 or epoch == 40:
        learning_rate /= 10.0

    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))

    # Running sum of every loss component over the batches seen this epoch.
    total_loss = collections.defaultdict(float)

    for i, batch in enumerate(train_loader):
        # Move every tensor of the batch to the training device.
        images, target_boxes, target_cls, has_object_map = (item.to(device) for item in batch)
        pred = net(images)
        loss_dict = criterion(pred, target_boxes, target_cls, has_object_map)

        for key in loss_dict:
            total_loss[key] += loss_dict[key].item()

        optimizer.zero_grad()
        loss_dict['total_loss'].backward()
        optimizer.step()

        if (i+1) % 2 == 0:
            outstring = 'Epoch [%d/%d], Iter [%d/%d], Loss: ' % ((epoch+1, num_epochs, i+1, len(train_loader)))
            # key[:-5] drops the last five characters of each key (the '_loss'
            # suffix of e.g. 'total_loss'); values are running per-batch means.
            outstring += ', '.join( "%s=%.3f" % (key[:-5], val / (i+1)) for key, val in total_loss.items() )
            print(outstring)

    # Leave training mode before any evaluation, so the mAP below is not computed
    # with train-time layer behaviour should evaluate() not switch modes itself.
    net.eval()

    # evaluate the network on the test data
    if (epoch + 1) % 5 == 0:
        test_aps = evaluate(net, test_dataset_file=annotation_file_test, img_root=file_root_test)
        print(epoch, test_aps)
    with torch.no_grad():
        test_loss = 0.0
        # evaluate() is opaque from here; re-assert eval mode in case it changed it.
        net.eval()
        for batch in test_loader:
            images, target_boxes, target_cls, has_object_map = (item.to(device) for item in batch)

            pred = net(images)
            loss_dict = criterion(pred, target_boxes, target_cls, has_object_map)
            test_loss += loss_dict['total_loss'].item()
        # Mean of the per-batch total loss over the test loader.
        test_loss /= len(test_loader)

    # Keep a separate copy of the weights with the lowest test loss so far.
    if best_test_loss > test_loss:
        best_test_loss = test_loss
        print('Updating best test loss: %.5f' % best_test_loss)
        torch.save(net.state_dict(),'checkpoints/best_detector.pth')

    # Milestone snapshots at selected epochs.
    if (epoch+1) in [5, 10, 20, 30, 40]:
        torch.save(net.state_dict(),'checkpoints/detector_epoch_%d.pth' % (epoch+1))

    # Always keep the most recent weights.
    torch.save(net.state_dict(),'checkpoints/detector.pth')
    
    



Starting epoch 1 / 1
Learning Rate for this epoch: 0.001


KeyboardInterrupt: 

In [20]:
# Debug probe: list the distinct values stored in loss_dict[0].
# NOTE(review): the training cell indexes loss_dict by string keys
# ('total_loss'); the integer key 0 presumably came from a temporary debug
# version of the loss — confirm, or remove this cell so Run All works.
torch.unique(loss_dict[0])

tensor([0.0000e+00, 1.7372e-05, 4.7404e-03, 6.7832e-03, 7.3681e-03, 1.3472e-02,
        1.5916e-02, 1.6597e-02, 2.1372e-02, 2.1911e-02, 2.6144e-02, 4.9533e-02,
        6.5423e-02, 6.8032e-02, 8.0205e-02, 8.7367e-02, 8.7651e-02, 9.7907e-02,
        1.0544e-01, 1.2060e-01, 1.2316e-01, 1.2525e-01, 1.2571e-01, 1.3381e-01,
        1.6844e-01, 2.1772e-01, 2.6636e-01, 2.6671e-01, 2.7329e-01, 2.7466e-01,
        2.9724e-01, 3.0925e-01, 3.4175e-01, 5.1153e-01, 5.2065e-01],
       grad_fn=<Unique2Backward0>)

In [22]:
# Debug probe: maximum over axis 0 of loss_dict[0], with the position of each maximum.
# NOTE(review): the name suggests loss_dict[0] holds IoU values — confirm.
# Depends on kernel state that the training cell as written does not produce.
best_ious, indices = loss_dict[0].max(axis=0)

In [23]:
# Debug probe: same reduction over axis 0, applied to loss_dict[1].
# NOTE(review): presumably the IoU values for a second predicted box — confirm.
# Depends on kernel state that the training cell as written does not produce.
best_ious1, indices1 = loss_dict[1].max(axis=0)

In [41]:
# Debug probe: element-wise, keep whichever of best_ious / best_ious1 is larger
# (best_ious1 is taken when the two are equal).
torch.where(best_ious>best_ious1, best_ious, best_ious1)

tensor([0.8757, 0.8889, 0.6517, 0.6768, 0.6541, 0.7656, 0.8501, 0.8496, 0.8214,
        0.7904, 0.8950, 0.4768, 0.6404, 0.4575, 0.5089, 0.5202, 0.4178, 0.9146,
        0.9174, 0.7651, 0.4877, 0.8935, 0.4157, 0.0000, 0.1159, 0.4158, 0.8456,
        0.4766, 0.5479, 0.6580, 0.7750, 0.7364, 0.8230, 0.7871, 0.8028, 0.8055,
        0.9586, 0.8892, 0.6093, 0.7072, 0.8016, 0.8976, 0.7458, 0.8259, 0.8083,
        0.8825, 0.2664, 0.1440, 0.7668, 0.8360, 0.8989, 0.8927, 0.7580, 0.8162,
        0.8262, 0.8589, 0.9297, 0.8100, 0.0000, 0.8880, 0.8685],
       grad_fn=<WhereBackward0>)

In [42]:
# Debug probe: element-wise, keep the index that belongs to the larger of
# best_ious / best_ious1 (indices1 is taken when the two are equal).
torch.where(best_ious>best_ious1, indices, indices1)

tensor([41, 29,  7, 29,  7,  7, 34, 41, 51, 56, 34, 60, 10, 17,  0,  0, 17, 22,
         2, 14,  0, 57,  0,  0, 16, 39, 27, 60,  0, 60, 21, 13, 55, 51, 23, 42,
        41, 34, 29, 29, 36, 57, 55, 55, 54, 53, 17, 17,  7,  5, 38, 38, 57, 27,
        58, 37, 41, 17,  0, 29, 56])

In [55]:
# Split the first 5*B channels of the last dimension of `pred` into B
# consecutive slices of 5 channels each.
pred_boxes_list = [
    pred[:, :, :, start:start + 5]
    for start in range(0, 5 * B, 5)
]

In [65]:
# Debug probe: display loss_dict[0].
# NOTE(review): relies on an integer key that the training cell (string keys
# such as 'total_loss') does not show — leftover scratch state; confirm or remove.
loss_dict[0]

tensor([[0.5828, 0.3051, 0.1334,  ..., 0.0000, 0.3042, 0.3401],
        [0.5977, 0.4429, 0.3755,  ..., 0.0000, 0.4907, 0.6432],
        [0.3859, 0.7938, 0.3831,  ..., 0.0000, 0.7020, 0.5306],
        ...,
        [0.5095, 0.2601, 0.1627,  ..., 0.0000, 0.2805, 0.3626],
        [0.8592, 0.5038, 0.2219,  ..., 0.0000, 0.5060, 0.5657],
        [0.1002, 0.1140, 0.2042,  ..., 0.0000, 0.1261, 0.1465]],
       grad_fn=<DivBackward0>)

In [63]:
# Debug probe: flatten the first box slice into rows of 5 values.
# NOTE(review): the saved output is a torch.Size rather than a tensor, so this
# cell was presumably last executed with a trailing shape query — confirm.
pred_boxes_list[0].reshape(-1,5)

torch.Size([4704, 5])

In [51]:
# Debug probe: boolean mask of where best_ious exceeds best_ious1, reshaped
# into a single column.
(best_ious>best_ious1).view(-1,1)

tensor([[ True],
        [ True],
        [ True],
        [False],
        [ True],
        [ True],
        [False],
        [ True],
        [ True],
        [False],
        [False],
        [ True],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [ True],
        [ True],
        [ True],
        [ True],
        [False],
        [ True],
        [False],
        [ True],
        [False],
        [ True],
        [ True],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [False],
        [ True],
        [False],
        [False],
        [ True],
        [False],
        [False],
        [False],
        [ True],
        [ True],
        [False],
        [ True],
        [False],
        [ True],
        [False],
        [False],
        [ True],
        [ True],
        [False

In [38]:
# Debug probe: display loss_dict[0] (same expression as other scratch cells).
# NOTE(review): relies on an integer key that the training cell does not show —
# leftover scratch state; confirm or remove.
loss_dict[0]

tensor([[0.5828, 0.3051, 0.1334,  ..., 0.0000, 0.3042, 0.3401],
        [0.5977, 0.4429, 0.3755,  ..., 0.0000, 0.4907, 0.6432],
        [0.3859, 0.7938, 0.3831,  ..., 0.0000, 0.7020, 0.5306],
        ...,
        [0.5095, 0.2601, 0.1627,  ..., 0.0000, 0.2805, 0.3626],
        [0.8592, 0.5038, 0.2219,  ..., 0.0000, 0.5060, 0.5657],
        [0.1002, 0.1140, 0.2042,  ..., 0.0000, 0.1261, 0.1465]],
       grad_fn=<DivBackward0>)

In [18]:
# Debug probe: main diagonal of the 2-D tensor stored in loss_dict[0].
# NOTE(review): presumably pairs entry k of one set with entry k of the other — confirm.
torch.diagonal(loss_dict[0])

tensor([0.5828, 0.4429, 0.3831, 0.1597, 0.4085, 0.0806, 0.2437, 0.2397, 0.3404,
        0.2890, 0.1753, 0.0232, 0.0481, 0.0827, 0.0427, 0.0684, 0.3327, 0.1620,
        0.2098, 0.1820, 0.0513, 0.0462, 0.0287, 0.0000, 0.0077, 0.0195, 0.4321,
        0.0193, 0.1757, 0.0956, 0.1360, 0.1942, 0.5743, 0.6685, 0.2721, 0.0000,
        0.2642, 0.1374, 0.0669, 0.1340, 0.7149, 0.3059, 0.4018, 0.4385, 0.1808,
        0.1872, 0.0415, 0.0110, 0.1244, 0.4775, 0.5676, 0.2276, 0.2669, 0.5403,
        0.3837, 0.2315, 0.4759, 0.1495, 0.0000, 0.5060, 0.1465],
       grad_fn=<DiagonalBackward0>)

In [19]:
# Debug probe: display loss_dict[0] (same expression as other scratch cells).
# NOTE(review): relies on an integer key that the training cell does not show —
# leftover scratch state; confirm or remove.
loss_dict[0]

tensor([[0.5828, 0.3051, 0.1334,  ..., 0.0000, 0.3042, 0.3401],
        [0.5977, 0.4429, 0.3755,  ..., 0.0000, 0.4907, 0.6432],
        [0.3859, 0.7938, 0.3831,  ..., 0.0000, 0.7020, 0.5306],
        ...,
        [0.5095, 0.2601, 0.1627,  ..., 0.0000, 0.2805, 0.3626],
        [0.8592, 0.5038, 0.2219,  ..., 0.0000, 0.5060, 0.5657],
        [0.1002, 0.1140, 0.2042,  ..., 0.0000, 0.1261, 0.1465]],
       grad_fn=<DivBackward0>)

In [26]:
# Debug probe: maximum over dimension 0 of loss_dict[0]; returns both the
# values and their indices.
# NOTE(review): the saved output is all zeros, unlike the other dumps of
# loss_dict[0], so it was presumably produced from a different kernel state — confirm.
loss_dict[0].max(0)

torch.return_types.max(
values=tensor([0., 0., 0.,  ..., 0., 0., 0.], grad_fn=<MaxBackward0>),
indices=tensor([0, 0, 0,  ..., 0, 0, 0]))

In [43]:
# Debug probe: for the first element of `pred`, take channels 10..29 of the
# last dimension and show the first row.
# NOTE(review): with B = 2 the box slices use channels 0..9, so 10..29 are
# presumably the per-class scores — confirm.
pred[0][:, :, 10:30][0]

tensor([[0.5615, 0.2149, 0.7674, 0.5361, 0.4301, 0.8496, 0.4098, 0.1755, 0.7524,
         0.7043, 0.6310, 0.2661, 0.7576, 0.4861, 0.3564, 0.5420, 0.7012, 0.6674,
         0.7293, 0.1961],
        [0.4558, 0.4398, 0.6796, 0.7049, 0.3708, 0.4593, 0.6923, 0.6599, 0.6557,
         0.6388, 0.4195, 0.5832, 0.3296, 0.3826, 0.4654, 0.7341, 0.8972, 0.8687,
         0.7484, 0.2679],
        [0.5733, 0.3183, 0.4129, 0.4595, 0.2009, 0.4039, 0.6262, 0.5722, 0.4986,
         0.4203, 0.4959, 0.3535, 0.8088, 0.1897, 0.4407, 0.6583, 0.8531, 0.8413,
         0.7367, 0.2966],
        [0.2879, 0.1305, 0.5989, 0.3842, 0.2623, 0.8088, 0.3100, 0.3260, 0.8073,
         0.2666, 0.3595, 0.1531, 0.5925, 0.7725, 0.3180, 0.8951, 0.7841, 0.6974,
         0.3814, 0.3868],
        [0.4664, 0.2636, 0.6590, 0.3526, 0.2709, 0.4842, 0.6744, 0.5104, 0.4779,
         0.2805, 0.3080, 0.2807, 0.4932, 0.6143, 0.2547, 0.8753, 0.6869, 0.8794,
         0.6697, 0.4137],
        [0.3784, 0.2892, 0.5769, 0.3842, 0.3653, 0.6776, 0.5

# View example predictions

In [None]:
# Draw the detections for one randomly chosen test image.
net.eval()

# select random image from test set
image_name = random.choice(test_dataset.fnames)
image_path = os.path.join(file_root_test, image_name)
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports an unreadable or missing file by returning None instead
    # of raising; fail here with the path rather than inside cvtColor.
    raise FileNotFoundError('Could not read image: {}'.format(image_path))
# OpenCV loads images as BGR; matplotlib expects RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

print('predicting...')
result = predict_image(net, image_name, root_img_directory=file_root_test)
for left_up, right_bottom, class_name, _, prob in result:
    # One fixed colour per class, looked up by the class's position in VOC_CLASSES.
    color = COLORS[VOC_CLASSES.index(class_name)]
    cv2.rectangle(image, left_up, right_bottom, color, 2)
    label = class_name + str(round(prob, 2))
    text_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
    # Anchor the label just above the top-left corner of the box.
    p1 = (left_up[0], left_up[1] - text_size[1])
    # Filled rectangle (thickness -1) as a background for the label text.
    cv2.rectangle(image, (p1[0] - 2 // 2, p1[1] - 2 - baseline), (p1[0] + text_size[0], p1[1] + text_size[1]),
                  color, -1)
    cv2.putText(image, label, (p1[0], p1[1] + baseline), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1, 8)

plt.figure(figsize = (15,15))
plt.imshow(image)


## Evaluate on Test

To evaluate detection results we use mAP (mean average precision): the average precision is computed for each class, and the mean is taken over all classes.

In [None]:
# Evaluate the current `net` on the test split; the result is kept in
# `test_aps` for the submission cell.
test_aps = evaluate(
    net,
    test_dataset_file=annotation_file_test,
    img_root=file_root_test,
)

### Cell added to get intermediate mAP values for students

In [None]:
# Evaluate each checkpoint written by the training cell. That cell saves into
# the 'checkpoints/' directory, so the paths are built relative to it.
checkpoint_names = ['detector_epoch_%d.pth' % epoch for epoch in [5, 10, 20, 30, 40]]+['detector.pth']
network_paths = [os.path.join('checkpoints', name) for name in checkpoint_names]
# A dedicated loop variable keeps the `load_network_path` setting of the
# network-loading cell from being overwritten.
for checkpoint_path in network_paths:
    if not os.path.exists(checkpoint_path):
        # Milestone checkpoints only exist if training ran that many epochs.
        print('Skipping missing checkpoint {}'.format(checkpoint_path))
        continue
    print('Loading saved network from {}'.format(checkpoint_path))
    net_loaded =  resnet50().to(device)
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    net_loaded.load_state_dict(torch.load(checkpoint_path, map_location=device))
    # Pass img_root like the other evaluate() calls in this notebook.
    evaluate(net_loaded, test_dataset_file=annotation_file_test, img_root=file_root_test)


In [None]:
# Write the results held in `test_aps` to the submission CSV.
submission_csv = 'my_new_solution.csv'
output_submission_csv(submission_csv, test_aps)