In [0]:
!pip install numpy==1.17.4
!pip freeze
import os
import numpy as np
import torch
from PIL import Image

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.datasets import VOCDetection

from engine import train_one_epoch, evaluate
import utils
import transforms as T

import xml.etree.ElementTree as ET

absl-py==0.9.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.1.0
asgiref==3.2.7
astor==0.8.1
astropy==4.0.1.post1
astunparse==1.6.3
atari-py==0.2.6
atomicwrites==1.3.0
attrs==19.3.0
audioread==2.1.8
autograd==1.3
Babel==2.8.0
backcall==0.1.0
beautifulsoup4==4.6.3
bleach==3.1.4
blis==0.4.1
bokeh==1.4.0
boto3==1.12.38
botocore==1.15.38
Bottleneck==1.3.2
branca==0.4.0
bs4==0.0.1
CacheControl==0.12.6
cachetools==3.1.1
catalogue==1.0.0
certifi==2020.4.5.1
cffi==1.14.0
chainer==6.5.0
chardet==3.0.4
click==7.1.1
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.4.0
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.2.0
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cupy-cuda101==6.5.0
cvxopt==1.2.4
cvxpy==1.0.29
cycler==0.10.0
cymem==2.0.3
Cython==0.29.16
daft==0.0.4
dask==2.12.0
dataclasses==0.7
datascience==0.10.6
decorator==4.4.2
defusedxml==0.6.0
descartes==1.1.0
dill==0.3.1.1
distributed==1.25.3
Django==3.0.5
dlib==19.18.0
docopt==0.6.2
docutils==0.15.2

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(3, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()

In [0]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# load a pre-trained model for classification and return
# only the features
# backbone = torchvision.models.mobilenet_v2(pretrained=True).features
# print(backbone)
backbone = net#torchvision.models.mobilenet_v2(pretrained=True).features
print(backbone)
# FasterRCNN needs to know the number of
# output channels in a backbone. For mobilenet_v2, it's 1280
# so we need to add it here
backbone.out_channels = 16

# let's make the RPN generate 5 x 3 anchors per spatial
# location, with 5 different sizes and 3 different aspect
# ratios. We have a Tuple[Tuple[int]] because each feature
# map could potentially have different sizes and
# aspect ratios
anchor_generator = AnchorGenerator(sizes=((32,),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# let's define what are the feature maps that we will
# use to perform the region of interest cropping, as well as
# the size of the crop after rescaling.
# if your backbone returns a Tensor, featmap_names is expected to
# be [0]. More generally, the backbone should return an
# OrderedDict[Tensor], and in featmap_names you can choose which
# feature maps to use.
roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0', '1'],
                                                output_size=7,
                                                sampling_ratio=2)

# put the pieces together inside a FasterRCNN model
model = FasterRCNN(backbone,
                   num_classes=21,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

Net(
  (conv1): Conv2d(3, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
)


In [0]:
# Sample code from the TorchVision 0.3 Object Detection Finetuning Tutorial
# http://pytorch.org/tutorials/intermediate/torchvision_tutorial.html

import os
import numpy as np
import torch
from PIL import Image

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

from engine import train_one_epoch, evaluate
import utils
import transforms as T

from torchvision.datasets import VOCDetection


import torch
from tqdm import tqdm
import torchvision

import transforms as T
from  torch.utils.tensorboard import SummaryWriter

class PrepareInstance(object):
    CLASSES = (
        "__background__ ", "aeroplane", "bicycle", "bird", "boat",
        "bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
        "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
        "sofa", "train", "tvmonitor",
    )
    def __call__(self, image, target):
        anno = target['annotation']
        h, w = anno['size']['height'], anno['size']['width']
        boxes = []
        classes = []
        area = []
        iscrowd = []
        objects = anno['object']
        if not isinstance(objects, list):
            objects = [objects]
        for obj in objects:
            bbox = obj['bndbox']
            bbox = [int(bbox[n]) - 1 for n in ['xmin', 'ymin', 'xmax', 'ymax']]
            boxes.append(bbox)
            classes.append(self.CLASSES.index(obj['name']))
            iscrowd.append(int(obj['difficult']))
            area.append((bbox[2] - bbox[0]) * (bbox[3] - bbox[1]))

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        classes = torch.as_tensor(classes)
        area = torch.as_tensor(area)
        iscrowd = torch.as_tensor(iscrowd)

        image_id = anno['filename'][5:-4]
        image_id = torch.as_tensor([int(image_id)])

        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        return image, target



def get_voc(root, image_set, train=True, transforms=None):
    t = [PrepareInstance()]

    if transforms is not None:
        t.append(transforms)
    t.append(T.ToTensor())
    if train:
      t.append(T.RandomHorizontalFlip(0.5))
    transforms = T.Compose(t)

    dataset = torchvision.datasets.VOCDetection(root, '2007', image_set, transforms=transforms, download=True)

    return dataset


def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

In [0]:
# dataset = get_voc('.', 'trainval')
# # define training and validation data loaders
# print(dataset.images)
import os

os.mkdir('./proposals_dataset')
for i in CLASSES:
  os.mkdir('./proposals_dataset/{}'.format(i))

In [0]:
# import matplotlib.pyplot as plt
# CLASSES = (
#         "__background__ ", "aeroplane", "bicycle", "bird", "boat",
#         "bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
#         "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
#         "sofa", "train", "tvmonitor",
#     )
# # for i in range(len(dataset.images)):
# with open("./proposals/classes.txt", 'w') as file: 
#   with open("./proposals/train.txt", 'w') as train_file:
#     for i in range(len(dataset.images)):
#       img, target = dataset.__getitem__(i)
#       print(img.shape)
#       for j in range(len(target['boxes'])):
#         bbox = target['boxes'][j]
#         label = target['labels'][j]
#         fname = './proposals/{}_{}.jpg'.format(i,j)
#         img_tensor = img.permute(1, 2, 0)[int(bbox[1]):int(bbox[3]),int(bbox[0]):int(bbox[2]),:].numpy()
#         # plt.imshow(img_tensor)
#         # plt.show()
#         plt.imsave(fname, img_tensor)
#         file.write(str(label.item()) + '\n')
#         train_file.write(fname+'\n')

In [0]:
# !zip -r ./proposals.zip ./proposals
# from google.colab import files
# files.download("./proposals.zip")

In [0]:
from engine import train_one_epoch, evaluate
import utils

def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)
  
from engine import train_one_epoch, evaluate
import utils


def main():
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 21
    # use our dataset and defined transformations

    dataset = get_voc('.', 'trainval')
    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=2, shuffle=True, num_workers=2,
        collate_fn=utils.collate_fn)
    
    dataset_test = get_voc('.', 'trainval', train=False)
    # define training and validation data loaders
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=2,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    # model = get_model_instance_segmentation(num_classes)

    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    num_epochs = 10

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")

In [0]:
main()

Using downloaded and verified file: ./VOCtrainval_06-Nov-2007.tar
Using downloaded and verified file: ./VOCtrainval_06-Nov-2007.tar
Epoch: [0]  [   0/2506]  eta: 0:42:05  lr: 0.000010  loss: 10.3168 (10.3168)  loss_classifier: 3.0476 (3.0476)  loss_box_reg: 0.0040 (0.0040)  loss_objectness: 0.6931 (0.6931)  loss_rpn_box_reg: 6.5721 (6.5721)  time: 1.0077  data: 0.1281  max mem: 407
Epoch: [0]  [  10/2506]  eta: 0:06:45  lr: 0.000060  loss: 10.2307 (9.8209)  loss_classifier: 3.0468 (3.0472)  loss_box_reg: 0.0000 (0.0005)  loss_objectness: 0.6931 (0.6931)  loss_rpn_box_reg: 6.4841 (6.0801)  time: 0.1624  data: 0.0178  max mem: 516
Epoch: [0]  [  20/2506]  eta: 0:04:52  lr: 0.000110  loss: 9.8658 (9.6832)  loss_classifier: 3.0403 (3.0388)  loss_box_reg: 0.0000 (0.0003)  loss_objectness: 0.6930 (0.6930)  loss_rpn_box_reg: 6.1533 (5.9511)  time: 0.0730  data: 0.0072  max mem: 516
Epoch: [0]  [  30/2506]  eta: 0:04:08  lr: 0.000160  loss: 9.8323 (9.7658)  loss_classifier: 3.0179 (3.0268)  lo