In [None]:
# default_exp pylightning_frcnn

# Exploring Object Detection using COCO Dataset

## Pytorch Lightning & Torch Vision

Let's start by using what's already in Torch Vision, see [example](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html).
To force some learning, I shall attempt to port the example to [Pytorch-Lightning](https://github.com/PyTorchLightning/pytorch-lightning), as I've read that it removes a lot of boilerplate code and standardizes Pytorch usage, and that it makes advanced features like gradient accumulation and multi-GPU, multi-node training simple.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import json, os, requests, sys, tarfile, torch, torchvision
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import torch.nn.functional as F
import pytorch_lightning as pl

from collections import defaultdict
from IPython.utils import io
from pathlib import Path
from PIL import Image

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import *    

from torch import nn
from torch import optim
from torch.utils.data import DataLoader, random_split

from torchvision import transforms
from torchvision.datasets import CocoDetection
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

from tqdm import tqdm

In [None]:
# Record the interpreter and library versions this notebook was run with, to help reproduce results.
print(f"Python ver {sys.version}, torch {torch.__version__}, torchvision {torchvision.__version__}, pytorch_lightning {pl.__version__}")

%matplotlib inline

## Download a Sample of COCO Data

The full COCO Dataset is huge (~50GB?). For my self education exploring object detection, with the intention of using pretrained model in transfer learning, it is not practical to deal with dataset this big as my first project.  Luckily, the kind folks at [FastAI](https://fast.ai) have prepared some convenient subsets, the medium size 3GB https://s3.amazonaws.com/fast-ai-coco/coco_sample.tgz seems like a good candidate.  The 800KB "http://files.fast.ai/data/examples/coco_tiny.tgz" on the other hand seems way too small, thus may not have enough data for adequate training.

However, to allow faster iteration, let's start with the tiny dataset just to test drive the whole process...

In [None]:
#export

def fetch_data(url:str, datadir: Path, tgt_fname:str, chunk_size:int=8*1024):
    """Download a tar archive and unpack it.

    The archive at `url` is streamed to `datadir/tgt_fname` with a progress bar,
    then every member is extracted into `datadir`.

    Args:
        url: Address of the archive to download.
        datadir: Directory receiving the archive and its extracted contents;
            created (with parents) when it does not exist yet.
        tgt_fname: File name to store the archive under, inside `datadir`.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If an archive member would be extracted outside `datadir`.
    """
    datadir = Path(datadir)
    datadir.mkdir(parents=True, exist_ok=True)  # open(dest, 'wb') does not create directories
    dest = datadir/tgt_fname
    print(f"Downloading from {url} to {dest}...")

    nbytes = 0
    with requests.get(url, stream=True, timeout=10) as response:
        # Fail loudly instead of saving an HTML error page as if it were the archive.
        response.raise_for_status()
        # content-length is optional (e.g. chunked responses); tqdm accepts total=None.
        content_len = response.headers.get('content-length')
        total = int(content_len) if content_len is not None else None
        with open(dest, 'wb') as f, tqdm(total=total) as pbar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                nbytes += len(chunk)
                pbar.update(len(chunk))

    extracted = []
    root = datadir.resolve()
    with tarfile.open(dest, 'r') as tar:
        for item in tar:
            # The archive comes from the network: refuse members that escape datadir.
            target = (root/item.name).resolve()
            if target != root and root not in target.parents:
                raise ValueError(f"Refusing to extract {item.name!r} outside of {datadir}")
            tar.extract(item, datadir)
            extracted.append(item.name)

    print(f"Downloaded {nbytes} from {url} to {dest}, extracted in {datadir}: {extracted[:3]},...,{extracted[-3:]}")

In [None]:
# export
# Dataset selection: the tiny COCO subset published by fast.ai.
froot = "coco_tiny"
fname = f"{froot}.tgz"
url = f"http://files.fast.ai/data/examples/{fname}"

# If using the bigger Coco subset, use these values
# froot = "coco_sample"
# fname = f"{froot}.tgz"
# url = f"https://s3.amazonaws.com/fast-ai-coco/{fname}"

# The archive is written into this directory, so make sure it exists on a fresh checkout.
datadir = Path("workspace")
datadir.mkdir(parents=True, exist_ok=True)

# Skip the download when the annotation file read by the next cells is already present,
# so that "Restart & Run All" does not hit the network every time.
if not (datadir/froot/'train.json').exists():
    fetch_data(url, datadir, fname, chunk_size=1024*1024)

## Check Annotations

Let's load and inspect the annotation file that comes with the coco tiny dataset...

In [None]:
# export

# Parse the annotation file shipped inside the extracted dataset folder into a plain dict.
json_fname = datadir/froot/'train.json'
with open(json_fname, 'r') as json_f:
    train_json = json.load(json_f)

In [None]:
# Show the category list, the first image record, and every annotation attached to that first image.
train_json['categories'], train_json['images'][0], [a for a in train_json['annotations'] if a['image_id']==train_json['images'][0]['id'] ]

## Digest the Dataset for useful Stats

Do some basic analysis of the data to get numbers like total images, boxes, and average box count per image...

In [None]:
#export
class CocoDatasetStats():
    """Lookup tables and summary statistics derived from a COCO-style annotation dict.

    Attributes:
        num_cats: number of entries in ann['categories'].
        num_imgs: number of entries in ann['images'].
        num_bboxs: number of entries in ann['annotations'].
        cat2name: category id -> category name.
        lbl2cat: model label -> category id, labels 1..num_cats in category order;
            label 0 is the background entry.
        cat2lbl: category id -> model label (0 -> 0 for background).
        img2fname: image id -> image file name.
        imgs: list of {'id', 'file_name'} dicts, one per image id.
        img2cat2ibs: image id -> category id -> set of (anno_id, x, y, w, h).
        img2liibs: image id -> list of (cat_id, img_id, anno_id, x, y, w, h).
        cat2iibs: category id -> set of (img_id, anno_id, x, y, w, h).
        avg_ncats_per_img: distinct categories per image, averaged over num_imgs.
        avg_nboxs_per_img: boxes per image, averaged over num_imgs.
        avg_nboxs_per_cat: boxes per category, averaged over num_cats.
        chn_means: per-channel mean of the per-image channel means, on the raw
            0-255 pixel scale (not the [0, 1] scale produced by ToTensor()).
        chn_stds: sample standard deviation of those per-image channel means,
            same 0-255 scale. NOTE: this is the spread between images, not the
            per-pixel standard deviation.

    Args:
        ann: annotation dict with 'categories', 'images' and 'annotations' lists.
        img_dir: directory holding the image files. When omitted, falls back to
            the notebook-level `datadir/froot/'train'`.
    """
    def __init__(self, ann:dict, img_dir=None):
        # Local import: partial (unlike a lambda) keeps the nested defaultdict picklable,
        # which matters when a Dataset holding these stats is sent to DataLoader workers.
        from functools import partial

        if img_dir is None:
            img_dir = datadir/froot/'train'
        img_dir = Path(img_dir)

        self.num_cats = len(ann['categories'])
        self.num_imgs = len(ann['images'])
        self.num_bboxs = len(ann['annotations'])

        # build cat id to name
        self.cat2name = { c['id']: c['name'] for c in ann['categories'] }

        # expected labels w 0 = background; real categories get labels 1..num_cats
        self.lbl2cat = { l: c for l, c in enumerate(self.cat2name, start=1) }
        self.cat2lbl = { c: l for l, c in self.lbl2cat.items() }
        # NOTE: unlike the other values (plain category ids) the background entry
        # is an (id, name) tuple; kept as-is for existing consumers.
        self.lbl2cat[0] = (0, 'background')
        self.cat2lbl[0] = 0

        # img_id to file map
        self.img2fname = { img['id']:img['file_name'] for img in ann['images'] }
        self.imgs = [ { 'id':img_id, 'file_name':img_fname } for (img_id, img_fname) in self.img2fname.items() ]

        # build up the maps for later analysis; annotation ids are the
        # 0-based positions in ann['annotations']
        self.img2cat2ibs = defaultdict(partial(defaultdict, set))
        self.img2liibs = defaultdict(list)
        self.cat2iibs = defaultdict(set)
        for anno_id, a in enumerate(ann['annotations']):
            img_id = a['image_id']
            cat_id = a['category_id']
            (x, y, w, h) = a['bbox']
            ib = (anno_id, x, y, w, h)
            iib = (img_id, *ib)
            liib = (cat_id, *iib)
            self.img2cat2ibs[img_id][cat_id].add(ib)
            self.cat2iibs[cat_id].add(iib)
            self.img2liibs[img_id].append(liib)

        ncats_total = sum(len(c2ibs) for c2ibs in self.img2cat2ibs.values())
        nboxs_total = sum(len(ibs) for c2ibs in self.img2cat2ibs.values() for ibs in c2ibs.values())
        nboxs_by_cat = sum(len(iibs) for iibs in self.cat2iibs.values())

        # Guard the averages so an empty image/category list yields 0.0 instead of ZeroDivisionError.
        self.avg_ncats_per_img = ncats_total/self.num_imgs if self.num_imgs else 0.0
        self.avg_nboxs_per_img = nboxs_total/self.num_imgs if self.num_imgs else 0.0
        self.avg_nboxs_per_cat = nboxs_by_cat/self.num_cats if self.num_cats else 0.0

        # compute per channel mean and std deviation of the image means using Welford's method
        n = 0
        mean = np.zeros((3,))
        M2 = np.zeros((3,))

        for fname in self.img2fname.values():
            n = n + 1
            # Close the file promptly and force 3 channels so grayscale/RGBA/palette
            # images contribute a well-formed (3,) mean.
            with Image.open(img_dir/fname) as im:
                ia = np.asarray(im.convert('RGB'))
            x = np.mean(ia,axis=(0,1))
            delta = x - mean
            mean = mean + delta/n
            M2 = M2 + delta*(x - mean)

        # The sample variance needs at least two images; report zero spread otherwise.
        variance = M2/(n - 1) if n > 1 else np.zeros((3,))

        self.chn_means = mean
        self.chn_stds = np.sqrt(variance)
        

In [None]:
#export 

# Building the stats opens every training image once to compute the per-channel statistics.
stats = CocoDatasetStats(train_json)

# Headline numbers: dataset size and the average density of categories/boxes.
print(
    f"Categories {stats.num_cats}, Images {stats.num_imgs}, Boxes {stats.num_bboxs}, "
    f"avg cats/img {stats.avg_ncats_per_img:.1f}, avg boxs/img {stats.avg_nboxs_per_img:.1f}, avg boxs/cat {stats.avg_nboxs_per_cat:.1f}.")

print(f"Image means by channel {stats.chn_means}, std.dev by channel {stats.chn_stds}")

Print out number of boxes per category to see if the distribution is not too unbalanced...

In [None]:
# Map (category id, category name) -> number of annotated boxes in that category.
{ (cid, stats.cat2name[cid]): len(iibs) for cid, iibs in stats.cat2iibs.items() }

Hmm, seems like a lot of chairs and books, but vases, tvs, couches and remotes are roughly 10x less. This may be a problem. 
But we have enough to test drive the training pipeline. 

To make it more balanced, maybe I can combine vase+tv+couch+remote into a new category 'others' later?

Anyway, we can also switch to a bigger dataset (e.g. the 3GB coco-sample) once we are happy with code.

## Look at Images

Let's look at an image.

In [None]:
# Look up the first image listed in the annotation file, together with its boxes grouped by category.
img_pos = 0
img_id = train_json['images'][img_pos]['id']
cat2ibs = stats.img2cat2ibs[img_id]
img_fname = stats.img2fname[img_id]
img = Image.open(datadir/froot/'train'/img_fname)
plt.figure(figsize=(16,10))
plt.imshow(img)
# The trailing tuple is the cell output: id, file name and the raw box data for cross-checking.
img_id, img_fname, cat2ibs

## Overlay Boxes and Labels from Annotation

Let's overlay bounding boxes and labels over the image to confirm our understanding of the data.

In [None]:
#export

def bbox_to_rect(ibbox, color):
    """Turn an (id, x, y, w, h) box into an unfilled matplotlib rectangle outlined in `color`."""
    top_left = (ibbox[1], ibbox[2])
    outline = dict(fill=False, edgecolor=color, linewidth=2)
    return plt.Rectangle(xy=top_left, width=ibbox[3], height=ibbox[4], **outline)

def label_for_bbox(ibbox, label, color):
    """Write '<box id>.<label>' in `color` at the (x, y) position of an (id, x, y, w, h) box."""
    x, y = ibbox[1], ibbox[2]
    caption = f"{ibbox[0]}.{label}"
    return plt.text(x, y, caption, color=color, fontsize=16)

def overlay_img_bbox(img:Image, cat2ibs: dict, cat2name: dict):
    """Show `img` with every box outlined and captioned, one colour per category.

    Args:
        img: image to display (anything `plt.imshow` accepts).
        cat2ibs: category id -> iterable of (id, x, y, w, h) boxes.
        cat2name: category id -> display name; ids missing from it are
            captioned with the id itself instead of raising.
    """
    from itertools import cycle

    # Only 10 Tableau colours exist; cycle through them so that more than
    # 10 categories reuse colours instead of raising KeyError.
    cat2color = dict(zip(cat2ibs.keys(), cycle(mcolors.TABLEAU_COLORS.keys())))

    plt.figure(figsize=(16,10))
    ax = plt.imshow(img).axes
    for cid, ibs in cat2ibs.items():
        color = cat2color[cid]
        name = cat2name.get(cid, str(cid))
        for ib in ibs:
            label_for_bbox(ib, name, color)
            ax.add_patch(bbox_to_rect(ib, color))

Pick some random image to test drive the box overlay code...

In [None]:
# Hard-coded image id: it must be present in stats.img2fname, otherwise the lookup below raises KeyError.
img_id = 129739
img = Image.open(datadir/froot/'train'/stats.img2fname[img_id])
overlay_img_bbox(img, stats.img2cat2ibs[img_id], stats.cat2name)

Looks good enough.

## Wrap Data Loading Logic using Pytorch-Lightning


Now we need to define a DataModule to encapsulate all the data loading logic. At first I thought I could reuse `CocoDetection()` from torchvision, but it and the downstream cocoapi expect the JSON annotation file to be in this [format](https://cocodataset.org/#format-data):
```
annotation{
    "id": int,
    "image_id": int,
    "category_id": int,
    "segmentation": RLE or [polygon],
    "area": float,
    "bbox": [x,y,width,height],
    "iscrowd": 0 or 1,
}
```

Tiny Coco's train.json file only has a subset of the above fields:
```
"annotations": [
    {
      "image_id": 542959,
      "bbox": [
        32.52,
        86.34,
        8.53,
        9.41
      ],
      "category_id": 62
    },
    ...
]
``` 

Thus we will need to make a Dataset to handle it properly.

In [None]:
# export
    
class SubCocoDataset(torchvision.datasets.VisionDataset):
    """
    Simulate what torchvision.CocoDetection() returns for target given fastai's coco subsets
    Args:
        root (string): Root directory where images are downloaded to.
        stats (CocoDatasetStats): Provides img2fname, img2liibs and cat2lbl for the subset.
        transform (callable, optional): A function/transform that  takes in an PIL image
            and returns a transformed version. E.g, ``transforms.ToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(self, root, stats, transform=None, target_transform=None, transforms=None):
        super(SubCocoDataset, self).__init__(root, transforms, transform, target_transform)
        self.stats = stats
        self.img_ids = list(stats.img2fname.keys())

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: Tuple (image, target). target is a dict of tensors:
                boxes [N, 4] as (x1, y1, x2, y2), labels [N], image_id (scalar),
                area [N], iscrowd [N] (all zeros), ids [N] (annotation ids).
        Raises:
            IndexError: if index is out of range (standard sequence protocol).
        """
        # Let the list raise IndexError: returning (None, None) would break both
        # collation and plain `for sample in dataset` iteration.
        img_id = self.img_ids[index]
        img_fname = self.stats.img2fname[img_id]

        with Image.open(os.path.join(self.root, img_fname)) as im:
            img = im.convert('RGB')

        boxes, labels, areas, ids = [], [], [], []
        for cat_id, _, anno_id, x, y, w, h in self.stats.img2liibs.get(img_id, []):
            boxes.append([x, y, x+w, y+h])  # COCO xywh -> corner format
            labels.append(self.stats.cat2lbl[cat_id])
            areas.append(w*h)
            ids.append(anno_id)

        # Explicit dtypes/shapes keep images without any annotation well-formed
        # (boxes [0, 4], integer labels) instead of failing on torch.tensor(None).
        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor(img_id),
            "area": torch.tensor(areas, dtype=torch.float32),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64),  # one flag per box
            "ids": torch.tensor(ids, dtype=torch.int64),
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        else:
            if self.transform is not None: img = self.transform(img)
            if self.target_transform is not None: target = self.target_transform(target)

        return img, target

    def __len__(self):
        # Must agree with the list __getitem__ indexes into.
        return len(self.img_ids)

Test the SubCocoDataset

In [None]:
# Only convert images to tensors here (no normalisation).
tfm = transforms.Compose([ transforms.ToTensor() ])
# Collate by transposing the batch: a list of (image, target) pairs becomes (images, targets),
# which avoids stacking images of different sizes into one tensor.
fcoll = lambda batch: tuple(zip(*batch))

dataset = SubCocoDataset(datadir/froot/'train', stats, transform=tfm)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=1, collate_fn=fcoll)
# For Training
images,targets = next(iter(data_loader))
(images), (targets)

Now wrap the Dataset into a DataModule...

In [None]:
#export

class SubCocoDataModule(LightningDataModule):
    """Train/validation DataLoaders over a SubCocoDataset.

    Args:
        root: directory containing the image files.
        stats (CocoDatasetStats): dataset statistics and lookup tables.
        bs: batch size used by both loaders.
        workers: number of DataLoader worker processes.
        split_ratio: fraction of the images assigned to the training split;
            the remainder is used for validation. The split is seeded (42),
            so it is identical across runs.
    """

    def __init__(self, root, stats, bs=32, workers=4, split_ratio=0.8):
        super().__init__()
        self.dir = root
        self.bs = bs
        self.workers = workers
        self.stats = stats
        self.split_ratio = split_ratio

        # stats.chn_means / chn_stds are measured on raw 0-255 pixel values, while
        # ToTensor() rescales pixels to [0, 1]; put both on the same scale before
        # handing them to Normalize.
        means = (np.asarray(stats.chn_means, dtype=np.float64)/255.0).tolist()
        stds = (np.asarray(stats.chn_stds, dtype=np.float64)/255.0).tolist()
        # NOTE(review): torchvision detection models also normalise their inputs
        # internally - confirm whether this extra Normalize step is wanted at all.
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(means, stds)
        ])

        # prepare the dataset and a reproducible train/validation split
        dataset = SubCocoDataset(self.dir, self.stats, transform=transform)
        num_items = len(dataset)
        num_train = int(self.split_ratio*num_items)
        self.train, self.val = random_split(dataset, (num_train, num_items-num_train), generator=torch.Generator().manual_seed(42))
        print(self.train, self.val)

    def collate_fn(self, batch):
        """Transpose a list of (image, target) pairs into (images, targets) tuples."""
        return tuple(zip(*batch))

    def train_dataloader(self):
        # Reshuffle the training samples every epoch.
        return DataLoader(self.train, batch_size=self.bs, shuffle=True, num_workers=self.workers, collate_fn=self.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val, batch_size=self.bs, num_workers=self.workers, collate_fn=self.collate_fn)

Test the DataModule

In [None]:
#export

# Pull a single batch from the training loader to confirm that loading and collation work.
tiny_coco_dm = SubCocoDataModule(datadir/froot/'train', stats, bs=2)
tdl=tiny_coco_dm.train_dataloader()
images, targets = next(iter(tdl))
# Both tuples should have one entry per sample in the batch.
len(images), len(targets)

## Neural Network Architecture Model

There are many object detection models to choose from.

To break my analysis paralysis from researching and deciding between the various models and architectures, I decided to push ahead with what TorchVision provides out of the box, i.e. Faster R-CNN with a pretrained ResNet backbone.

Luckily there is a tutorial to follow
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html

First, let's just use the raw model directly and verify the outputs

In [None]:
#export

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Labels produced by stats.cat2lbl run 1..num_cats with 0 reserved for background,
# so the new head needs num_cats + 1 outputs.
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, stats.num_cats + 1)

tdl = tiny_coco_dm.train_dataloader()
images, targets = next(iter(tdl))
images = list(images)
targets = [dict(t) for t in targets]

# how to get both in 1 step?
# Training mode (set explicitly so the cell can be re-run): the model returns the loss dict.
model.train()
losses = model(images,targets)

# Eval mode: the model returns detections; no gradients are needed for that.
model.eval()
with torch.no_grad():
    predict = model(images)
(predict, losses)

## Metrics

In my initial attempt to port the example Faster RCNN code to Pytorch-Lightning, I soon realized that computing metrics was going to be tricky. In fact, I didn't even know which metrics to use!  Luckily someone pointed me to this well written [article by Rafael Padilla](https://github.com/rafaelpadilla/Object-Detection-Metrics) which explains the metrics used in object detection today.

I decided to repurpose [CocoAPI](https://github.com/cocodataset/cocoapi/) which has builtin metric evaluation instead of rolling my own evaluation metrics.

It was painful, as the Coco API was not written with easy extension in mind.  For example, it tightly couples JSON file loading (and its format) with metrics computation, so I had to look under the covers at its implementation and then use it in a way that is probably not appropriate.  It also tightly couples the computation of metrics with the batching of data.

I ended up just writing a wrapper that uses a separate Coco object for each ground truth and prediction, as if the whole epoch has only 1 sample. 

In [None]:
# export

class SubCocoWrapper():
    """Wrap one image's ground truth and predictions as two indexed COCO objects.

    Args:
        categories: list of {'id', 'name'} dicts describing the label set.
        p: prediction dict for one image with 'boxes' (corner format
            x1, y1, x2, y2), 'scores' and 'labels'.
        t: target dict for the same image with 'image_id', 'boxes' (corner
            format x1, y1, x2, y2) and 'labels'.

    Both COCO objects store boxes in COCO's (x, y, width, height) layout and
    carry the 'area' and 'iscrowd' fields; values are converted to plain Python
    numbers so they can be used as dictionary keys by the COCO index.
    """
    def __init__(self, categories, p, t):
        # see https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py
        # capture_output hides the progress messages printed by pycocotools
        with io.capture_output() as captured:
            img_id = int(t["image_id"]) # could be tensor, cast to int
            images = [ {'id': img_id, 'file_name': f"{img_id:012d}.jpg"} ]

            gt_boxes = self._to_list(t["boxes"])
            gt_labels = self._to_list(t["labels"])
            gt_anns = [
                self._annotation(ann_id, img_id, label, box)
                for ann_id, (box, label) in enumerate(zip(gt_boxes, gt_labels), start=1)
            ]
            self.target = self._make_coco(images, categories, gt_anns)

            # [ {'boxes': tensor([[100.5,  39.7, 109.1,  52.7], ...], device='cuda:0'),
            #    'labels': tensor([1, 1, 1], device='cuda:0'),
            #    'scores': tensor([0.7800, 0.7725, 0.7648], device='cuda:0')}, ...]
            dt_boxes = self._to_list(p["boxes"])
            dt_labels = self._to_list(p["labels"])
            dt_scores = self._to_list(p["scores"])
            dt_anns = []
            for ann_id, (box, label, score) in enumerate(zip(dt_boxes, dt_labels, dt_scores), start=1):
                ann = self._annotation(ann_id, img_id, label, box)
                ann['score'] = float(score)
                dt_anns.append(ann)
            self.prediction = self._make_coco(images, categories, dt_anns)

    @staticmethod
    def _to_list(values):
        """Convert a tensor (any device) or a sequence into plain Python lists."""
        if hasattr(values, "detach"):
            values = values.detach().cpu()
        return values.tolist() if hasattr(values, "tolist") else list(values)

    @staticmethod
    def _annotation(ann_id, img_id, label, box):
        """Build one COCO annotation dict from a corner-format (x1, y1, x2, y2) box."""
        x1, y1, x2, y2 = box
        w, h = x2 - x1, y2 - y1
        return {
            # Ids are 1-based per image: COCOeval records matches by annotation id
            # and treats 0 as "unmatched".
            'id': ann_id,
            'image_id': img_id,
            'category_id': int(label),
            'bbox': [x1, y1, w, h],
            'area': w*h,
            'iscrowd': 0,
        }

    @staticmethod
    def _make_coco(images, categories, annotations):
        """Create a COCO object from in-memory lists and build its lookup index."""
        coco = COCO()
        coco.dataset["images"] = images
        coco.dataset["categories"] = categories
        coco.dataset["annotations"] = annotations
        coco.createIndex()
        return coco

    def targetCoco(self):
        """Return the COCO object holding the ground-truth annotations."""
        return self.target

    def predictionCoco(self):
        """Return the COCO object holding the predicted annotations."""
        return self.prediction

Using the above wrapper, I can then compute the metric for the 1 sample epoch to get the metrics I want for the image
* Mean Average Precision (MAP)
* Mean Average Recall (MAR) 
* over a range of Intersection over Union (IOU) values from 50% to 95%
* then combined it using [F1](https://en.wikipedia.org/wiki/F1_score)

In [None]:
# export

class FRCNN(LightningModule):
    """Faster R-CNN (ResNet-50 FPN, pretrained) with a box-predictor head sized for our labels.

    Args:
        lbl2cat: mapping of model label -> category id, including the
            background label; its length determines the number of classes.
    """
    def __init__(self, lbl2cat):
        super(FRCNN, self).__init__()
        self.model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.categories = [ {'id': lid, 'name': f"{cid}" } for lid, cid in lbl2cat.items() ]
        self.num_classes = len(self.categories)

        # get number of input features for the classifier
        self.in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        # replace the pre-trained head with a new one
        self.model.roi_heads.box_predictor = FastRCNNPredictor(self.in_features, self.num_classes)

    def forward(self, images, targets=None):
        """Delegate to the wrapped detector so the module itself can be called for inference."""
        return self.model(images, targets)

    def training_step(self, train_batch, batch_idx):
        """Sum the detector's individual losses into the single loss to optimise."""
        x, y = train_batch
        losses = self.model(x, y)
        loss = sum(losses.values())
        logs = {'train_loss': loss}
        return {'loss': loss, 'log': logs} # should add 'acc' accuracy e.g. MAP, MAR etc

    def metrics(self, preds, targets):
        """Return a [len(preds), 1] tensor with one F1-style score per image.

        The score is the harmonic mean of COCO AP and AR (IoU=0.50:0.95,
        area=all, maxDets=100); it is 0 when either value is zero or
        undefined.
        """
        accu = torch.zeros((len(preds), 1))
        for i, (p,t) in enumerate(zip(preds, targets)):
            subcoco = SubCocoWrapper(self.categories, p, t)
            cocoeval = COCOeval(subcoco.targetCoco(), subcoco.predictionCoco(), "bbox")
            with io.capture_output() as captured:
                cocoeval.evaluate()
                cocoeval.accumulate()
                cocoeval.summarize()
            precision = float(cocoeval.stats[0]) # Average Precision (AP) @[ IoU=0.50:0.95 | area=all | maxDets=100 ]
            recall = float(cocoeval.stats[8]) # Average Recall (AR) @[ IoU=0.50:0.95 | area=all | maxDets=100 ]
            # COCOeval reports -1 for undefined metrics; a zero would divide by zero below.
            if precision > 0 and recall > 0:
                accu[i] = 2/(1/precision + 1/recall) # see https://en.wikipedia.org/wiki/F1_score
        return accu

    def validation_step(self, val_batch, batch_idx):
        """Score the detections of one validation batch against its targets."""
        # validation runs the model in eval mode, so the output is predictions, not losses.
        # Targets are not passed to the model, which keeps ys untouched for the metric.
        xs, ys = val_batch
        preds = self.model(xs)
        accu = self.metrics(preds, ys)
        return {'val_acc': accu} # should add 'val_acc' accuracy e.g. MAP, MAR etc

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def validation_epoch_end(self, outputs):
        """Average the per-image scores gathered over the validation epoch."""
        # outputs is a list with what validation_step returned for each batch:
        # [{'val_acc': scores_batch_0}, {'val_acc': scores_batch_1}, ...]
        # cat (not stack): the last batch may hold fewer images than the others.
        avg_acc = torch.cat([ o['val_acc'] for o in outputs ]).mean()
        # 'log' is the key the logger reads, matching training_step above.
        tensorboard_logs = {'val_acc': avg_acc}
        return {'avg_val_acc': avg_acc, 'val_acc': avg_acc, 'log': tensorboard_logs}


Let's see if we can train for a few epochs...

In [None]:
frcnn_model = FRCNN(stats.lbl2cat)
tiny_coco_dm = SubCocoDataModule(datadir/froot/'train', stats, bs=2) # on my small GPU W/ 4GB VRAM, I can only fit bs=2
# Checkpoint on the metric the model actually reports: validation_epoch_end returns
# 'avg_val_acc' (higher is better); no 'val_loss' is ever produced.
chkpt_cb = ModelCheckpoint(
    filepath='model/tiny-coco.ckpt',
    verbose=True,
    monitor='avg_val_acc',
    mode='max'
)
# train; fall back to CPU when no GPU is available instead of failing
num_gpus = 1 if torch.cuda.is_available() else 0
trainer = Trainer(gpus=num_gpus, max_epochs=20, checkpoint_callback=chkpt_cb, accumulate_grad_batches=8)
trainer.fit(frcnn_model, tiny_coco_dm)

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/ --host "0.0.0.0" 

## Inference

Time to see how well the model can perform.

In [None]:
# dataset[0] is an (image, target) pair, so `img` is a tuple here; later cells use img[0] for the image tensor.
img = dataset[0]
img

In [None]:
# Use the detector that was just trained (frcnn_model), not the raw `model`
# from the architecture section, whose replacement head was never trained.
frcnn_model.eval()
device = next(frcnn_model.parameters()).device  # the trainer may have left the weights on the GPU
# NOTE(review): `dataset` applies only ToTensor(), while the training DataModule
# also normalises - confirm the model sees the same preprocessing here.
with torch.no_grad():
    pred = frcnn_model.model([img[0].to(device)])
pred

In [None]:
def digest_pred(l2c, pred, cutoff=0.4):
    """Group confident detections by category in the (id, x, y, w, h) layout used for plotting.

    Args:
        l2c: mapping of model label -> category key.
        pred: one prediction dict with 'scores' [N], 'labels' [N] and
            'boxes' [N, 4] given as corners (x1, y1, x2, y2).
        cutoff: detections with a score not above this value are dropped.

    Returns:
        defaultdict(list): category key -> list of (index, x, y, w, h), where
        index counts the kept detections in their original order.
    """
    keep = pred['scores'] > cutoff
    labels = pred['labels'][keep].detach().tolist()
    boxes = pred['boxes'][keep].detach().tolist()

    c2ibs = defaultdict(list)
    for i, (lbl, (x1, y1, x2, y2)) in enumerate(zip(labels, boxes)):
        # Corner format -> width/height, which is what bbox_to_rect draws.
        c2ibs[l2c[lbl]].append((i, x1, y1, x2 - x1, y2 - y1))
    return c2ibs

In [None]:
# pred is a list with one entry per input image; only one image was passed, so take pred[0].
pred_c2ibs = digest_pred(stats.lbl2cat, pred[0])
# Convert the image tensor back to a PIL image for plotting.
pimg = torchvision.transforms.ToPILImage()(img[0])

overlay_img_bbox(pimg, pred_c2ibs, stats.cat2name)

## Reconstitute from Saved Model
Now let's reload from checkpoint to see if all works...

In [None]:
# load_from_checkpoint needs a real file path (it does not expand wildcards),
# so pick the most recently written checkpoint in model/.
ckpt_paths = sorted(Path("model").glob("*.ckpt"), key=lambda p: p.stat().st_mtime)
if not ckpt_paths:
    raise FileNotFoundError("No checkpoint (*.ckpt) found under model/")

# lbl2cat is a required constructor argument of FRCNN, so supply it when rebuilding the module.
pretrained_model = FRCNN.load_from_checkpoint(str(ckpt_paths[-1]), lbl2cat=stats.lbl2cat)
pretrained_model.freeze()
# Call the wrapped detector directly, on whatever device the restored weights live on.
device = next(pretrained_model.parameters()).device
pred = pretrained_model.model([img[0].to(device)])