In [4]:
import torch
import torchvision

## Data Preparation
    This notebook trains Faster R-CNN on the Pascal VOC dataset. For training with other datasets, refer to the README of this repository.
    Only the detection annotations of the VOC dataset are used here. The VOC dataset has 20 object classes.

In [5]:
# Label names indexed by class id: index 0 is reserved for the background,
# indices 1-20 are the 20 Pascal VOC object categories.
classes = [
    '__background__',
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]

In [0]:
import os
import sys
import tarfile
import collections
from torchvision.datasets import VisionDataset

if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET

from PIL import Image
from torchvision.datasets.utils import download_url, check_integrity, verify_str_arg

# Download metadata for each supported VOC release, keyed by year:
#   url      - location of the trainval archive
#   filename - name the archive is saved under (matches the last URL component)
#   md5      - checksum passed to the download helper for integrity verification
#   base_dir - directory, relative to the dataset root, that the archive extracts to
DATASET_YEAR_DICT = {
    '2012': {
        'url': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',
        'filename': 'VOCtrainval_11-May-2012.tar',
        'md5': '6cd6e144f989b92b3379bac3b3de84fd',
        'base_dir': os.path.join('VOCdevkit', 'VOC2012')
    },
    '2011': {
        'url': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2011/VOCtrainval_25-May-2011.tar',
        'filename': 'VOCtrainval_25-May-2011.tar',
        'md5': '6c3384ef61512963050cb5d687e5bf1e',
        'base_dir': os.path.join('TrainVal', 'VOCdevkit', 'VOC2011')
    },
    '2010': {
        'url': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar',
        'filename': 'VOCtrainval_03-May-2010.tar',
        'md5': 'da459979d0c395079b5c75ee67908abb',
        'base_dir': os.path.join('VOCdevkit', 'VOC2010')
    },
    '2009': {
        'url': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2009/VOCtrainval_11-May-2009.tar',
        'filename': 'VOCtrainval_11-May-2009.tar',
        'md5': '59065e4b188729180974ef6572f6a212',
        'base_dir': os.path.join('VOCdevkit', 'VOC2009')
    },
    '2008': {
        'url': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2008/VOCtrainval_14-Jul-2008.tar',
        # Must be the 2008 archive name: reusing the 2012 name would make the two
        # years overwrite each other when downloaded into the same root.
        'filename': 'VOCtrainval_14-Jul-2008.tar',
        'md5': '2629fa636546599198acfcfbfcf1904a',
        'base_dir': os.path.join('VOCdevkit', 'VOC2008')
    },
    '2007': {
        'url': 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',
        'filename': 'VOCtrainval_06-Nov-2007.tar',
        'md5': 'c52e279531787c972589f7e41ab4ae64',
        'base_dir': os.path.join('VOCdevkit', 'VOC2007')
    }
}




In [None]:
# Defining Dataset Class
class VOCDetection(VisionDataset):
    """`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Detection Dataset.

    Each sample is an ``(image, target)`` pair. ``image`` is an RGB PIL image and
    ``target`` is a dict of tensors built from the image's XML annotation (see
    :meth:`__getitem__`). Label ids are looked up in the module-level ``classes`` list.

    Args:
        root (string): Root directory of the VOC Dataset.
        year (string, optional): The dataset year; must be a key of
            ``DATASET_YEAR_DICT`` (2007 to 2012).
        image_set (string, optional): Select the image_set to use, ``train``,
            ``trainval`` or ``val`` (``test`` is additionally accepted for 2007).
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and
            its target as entry and returns a transformed version. Pass ``None`` (not
            ``False``) to disable it.

    Raises:
        KeyError: if ``year`` is not a key of ``DATASET_YEAR_DICT``.
        RuntimeError: if the extracted dataset directory is not found under ``root``.
    """

    # Corner order of every row of target["boxes"].
    _BOX_KEYS = ('xmin', 'ymin', 'xmax', 'ymax')

    def __init__(self,
                 root,
                 year='2012',
                 image_set='train',
                 download=False,
                 transform=None,
                 target_transform=None,
                 transforms=None):
        super(VOCDetection, self).__init__(root, transforms, transform, target_transform)
        self.year = year
        year_info = DATASET_YEAR_DICT[year]
        self.url = year_info['url']
        self.filename = year_info['filename']
        self.md5 = year_info['md5']

        valid_sets = ["train", "trainval", "val"]
        if year == "2007":
            valid_sets.append("test")
        self.image_set = verify_str_arg(image_set, "image_set", valid_sets)

        voc_root = os.path.join(self.root, year_info['base_dir'])
        image_dir = os.path.join(voc_root, 'JPEGImages')
        annotation_dir = os.path.join(voc_root, 'Annotations')

        if download:
            download_extract(self.url, self.root, self.filename, self.md5)

        if not os.path.isdir(voc_root):
            raise RuntimeError('Dataset not found or corrupted.' +
                               ' You can use download=True to download it')

        # The split file lists one image id per line.
        split_f = os.path.join(voc_root, 'ImageSets/Main', self.image_set + '.txt')
        with open(split_f, "r") as f:
            # Skip blank lines so they do not turn into bogus ".jpg"/".xml" paths.
            file_names = [line.strip() for line in f if line.strip()]

        self.images = [os.path.join(image_dir, x + ".jpg") for x in file_names]
        self.annotations = [os.path.join(annotation_dir, x + ".xml") for x in file_names]

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is a dict with the keys

            * ``boxes``: float32 tensor of shape ``(N, 4)`` in
              ``[xmin, ymin, xmax, ymax]`` order (``(0, 4)`` when there are no objects),
            * ``labels``: int64 tensor of shape ``(N,)`` with indices into ``classes``,
            * ``image_id``: int64 tensor of shape ``(1,)`` parsed from the file name,
            * ``area``: float32 tensor of shape ``(N,)`` with the box areas,
            * ``iscrowd``: int64 tensor of shape ``(N,)``, all zeros.
        Raises:
            ValueError: if an object name is not present in ``classes``.
        """
        img = Image.open(self.images[index]).convert('RGB')
        raw_target = self.parse_voc_xml(
            ET.parse(self.annotations[index]).getroot())
        annotation = raw_target["annotation"]

        # parse_voc_xml always yields a list here; the dict case is tolerated so a
        # differently shaped parse result (single object) still works.
        objects = annotation.get("object", [])
        if isinstance(objects, dict):
            objects = [objects]

        # Read the corners by name instead of relying on the XML child order.
        boxes = [[float(obj["bndbox"][key]) for key in self._BOX_KEYS]
                 for obj in objects]
        labels = [classes.index(obj["name"]) for obj in objects]

        # reshape keeps an (N, 4) layout even when the image has no objects.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": torch.tensor([int(annotation["filename"].split(".")[0])]),
            # NOTE(review): area/iscrowd follow the target layout of the torchvision
            # detection tutorial; COCO-style evaluation helpers presumably read them
            # (their absence is the likely cause of the KeyError seen during
            # evaluation) - confirm against ../src/engine.py.
            "area": (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]),
            "iscrowd": torch.zeros((len(objects),), dtype=torch.int64),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of images listed in the selected split file."""
        return len(self.images)

    def parse_voc_xml(self, node):
        """Recursively convert an XML element into nested dicts.

        A leaf element becomes ``{tag: stripped_text}``. An element with children
        becomes ``{tag: {child_tag: value}}`` where ``value`` is a list when the same
        child tag occurs several times. The ``object`` entry of an ``annotation``
        element is always a list, whatever the number of objects.
        """
        voc_dict = {}
        children = list(node)
        if children:
            def_dic = collections.defaultdict(list)
            for dc in map(self.parse_voc_xml, children):
                for ind, v in dc.items():
                    def_dic[ind].append(v)
            if node.tag == 'annotation':
                # Extra nesting so the single-element unwrapping below leaves the
                # list of objects intact.
                def_dic['object'] = [def_dic['object']]
            voc_dict = {
                node.tag:
                    {ind: v[0] if len(v) == 1 else v
                     for ind, v in def_dic.items()}
            }
        if node.text:
            text = node.text.strip()
            if not children:
                voc_dict[node.tag] = text
        return voc_dict


def download_extract(url, root, filename, md5):
    """Download the archive at ``url`` into ``root`` and unpack it there.

    Args:
        url (string): Address of the tar archive.
        root (string): Directory that receives both the archive and its contents.
        filename (string): Name the archive is stored under inside ``root``.
        md5 (string): Checksum forwarded to ``download_url``.
    """
    download_url(url, root, filename, md5)
    archive_path = os.path.join(root, filename)
    with tarfile.open(archive_path, "r") as archive:
        archive.extractall(path=root)

In [4]:
# Instantiate the dataset class once to check whether everything is working fine.
# transforms must be a callable or None: False is "not None", so __getitem__ would
# try to call it and fail on the first sample.
datadict = VOCDetection(".", year="2007", download=True, transforms=None)

Using downloaded and verified file: ./VOCtrainval_06-Nov-2007.tar


In [9]:
# importing useful python scripts for training and utility functions.
import sys
sys.path.append("../src/")

import transforms as T
import utils
from engine import train_one_epoch, evaluate

    
def get_transform(train):
    """Build the transform pipeline applied to each (image, target) pair.

    Args:
        train (bool): When truthy, ``T.RandomHorizontalFlip(0.5)`` is appended after
            the tensor conversion (0.5 is presumably the flip probability).

    Returns:
        A ``T.Compose`` wrapping the selected transforms.
    """
    steps = [T.ToTensor()]
    if train:
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)


In [0]:
# model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# data_loader = torch.utils.data.DataLoader(datadict, batch_size=2, shuffle=True, num_workers=4, collate_fn=utils.collate_fn)
# images,targets = next(iter(data_loader))
# images = list(image for image in images)
# targets = [{k: v for k, v in t.items()} for t in targets]
# output = model(images,targets)   # Returns losses and detections

In [13]:
# print(output)

{'loss_classifier': tensor(2.1501, grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0.4055, grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.0180, grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0361, grad_fn=<DivBackward0>)}


In [0]:
# # For inference
# model.cpu()
# model.eval()
# x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
# predictions = model(x) 
# print(predictions)

In [6]:
def faster_rcnn_model(num_classes, device):
    """Create a pre-trained Faster R-CNN whose box head predicts ``num_classes`` classes.

    Args:
        num_classes (int): Number of output classes, background included.
        device (torch.device): Device the model is moved to.

    Returns:
        The model with its box predictor replaced, already on ``device``.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    model.to(device)

    return model

In [0]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

# defining functions for training.
def main():
    """Fine-tune Faster R-CNN on VOC2007 and evaluate it after every epoch.

    The samples are shuffled once; the last 50 shuffled indices form the held-out
    test split and the remaining ones the training split.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Two dataset objects over the same files, so the held-out split is not
    # affected by the training-only augmentation.
    dataset = VOCDetection(".", year="2007", transforms=get_transform(train=True))
    dataset_test = VOCDetection(".", year="2007", transforms=get_transform(train=False))

    # Both subsets index the full datasets with disjoint slices of one permutation.
    # (Wrapping the training Subset again with these indices would point past its
    # end or at the wrong samples.)
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=4, shuffle=True, num_workers=8,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=4, shuffle=False, num_workers=8,
        collate_fn=utils.collate_fn)

    num_classes = 21  # classes + background
    model = faster_rcnn_model(num_classes, device)

    # defining parameters for training: only optimise tensors that require gradients
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler: multiply the learning rate by 0.1 every 3 epochs
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    num_epochs = 10

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")



In [18]:
# Run the training and per-epoch evaluation loop defined in main().
main()

Epoch: [0]  [  0/613]  eta: 0:15:39  lr: 0.000013  loss: 2.9484 (2.9484)  loss_classifier: 2.7104 (2.7104)  loss_box_reg: 0.1975 (0.1975)  loss_objectness: 0.0109 (0.0109)  loss_rpn_box_reg: 0.0295 (0.0295)  time: 1.5324  data: 0.6506  max mem: 4603
Epoch: [0]  [ 10/613]  eta: 0:07:35  lr: 0.000095  loss: 3.0651 (2.9660)  loss_classifier: 2.7525 (2.6548)  loss_box_reg: 0.2853 (0.2678)  loss_objectness: 0.0156 (0.0166)  loss_rpn_box_reg: 0.0208 (0.0267)  time: 0.7559  data: 0.0652  max mem: 6370
Epoch: [0]  [ 20/613]  eta: 0:07:36  lr: 0.000176  loss: 2.3115 (2.3566)  loss_classifier: 2.0950 (2.0694)  loss_box_reg: 0.2212 (0.2441)  loss_objectness: 0.0115 (0.0141)  loss_rpn_box_reg: 0.0238 (0.0291)  time: 0.7308  data: 0.0064  max mem: 7210
Epoch: [0]  [ 30/613]  eta: 0:07:24  lr: 0.000258  loss: 0.9785 (1.8460)  loss_classifier: 0.6815 (1.5649)  loss_box_reg: 0.2078 (0.2383)  loss_objectness: 0.0069 (0.0118)  loss_rpn_box_reg: 0.0241 (0.0310)  time: 0.7667  data: 0.0075  max mem: 7210


KeyError: ignored

In [0]:
!nvidia-smi

Mon Mar  9 20:53:50 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru