# Predicting Bounding Boxes and Classifications

We start by pulling in some code from Part 1:

In [None]:
from fastai.vision.all import *

import matplotlib.colors as mcolors
import matplotlib.cm as cmx
from matplotlib import patches, patheffects

# Side length (in pixels) of the square images used throughout: bounding boxes are
# rescaled to this size and the plotting grid spans 0..SIZE on both axes.
SIZE=224

# display an image in such a way that we can layer on some additional annotations
def show_img(im, figsize=None, ax=None):
    """Show `im` on a matplotlib axis with an unlabelled 8-tick grid and return the axis.

    im:      the image to display (anything FastAI's `show_image` accepts).
    figsize: figure size used only when a new figure has to be created.
    ax:      existing axis to draw on; a new figure/axis is created when it is None.
    """
    # Compare against None explicitly instead of relying on the truthiness of an axis object
    if ax is None: _,ax = plt.subplots(figsize=figsize)
    show_image(im,ax) # We use this FastAI method to make life a little easier
    # Same tick positions on both axes so the grid cells line up with the SIZE x SIZE image
    ticks = np.linspace(0, SIZE, 8)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.grid()
    # Keep the grid lines but hide the numeric tick labels
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    return ax

# outline a matplotlib artist in black; gives text and boxes contrast against the image
def draw_outline(o, lw):
    """Apply a black stroke of width `lw` behind the artist `o`, then draw `o` normally."""
    stroke = patheffects.Stroke(linewidth=lw, foreground='black')
    o.set_path_effects([stroke, patheffects.Normal()])

# write a bold, outlined label at the given position so it stays readable on top of the image
def draw_text(ax, xy, txt, sz=14, color='white'):
    """Draw `txt` on `ax` at position `xy` (top-aligned) with font size `sz` and a thin black outline."""
    text_style = dict(verticalalignment='top', color=color, fontsize=sz, weight='bold')
    label = ax.text(*xy, txt, **text_style)
    draw_outline(label, 1)

# draw an unfilled, outlined rectangle; `b` holds the corner first and the width/height last
def draw_rect(ax, b, color='white'):
    """Add an unfilled rectangle to `ax`: first two entries of `b` are the anchor, last two the extent."""
    anchor = b[:2]
    extent = b[-2:]
    box = patches.Rectangle(anchor, *extent, fill=False, edgecolor=color, lw=2)
    draw_outline(ax.add_patch(box), 4)

def get_cmap(N):
    """Return a callable that maps a value in [0, N-1] to an RGBA colour from the 'Set3' colormap."""
    norm = mcolors.Normalize(vmin=0, vmax=N-1)
    mappable = cmx.ScalarMappable(norm=norm, cmap='Set3')
    return mappable.to_rgba

# build a palette of distinct colours for rendering our bounding boxes
num_colr = 12
cmap = get_cmap(num_colr)
# one RGBA tuple per palette index
colr_list = [cmap(float(idx)) for idx in range(num_colr)]

# Grab our dataset
path = untar_data(URLs.PASCAL_2007)
# imgs: image file names; lbl_bbox: one (bounding boxes, labels) pair per image, in the same order
imgs,lbl_bbox = get_annotations(path/'train.json')

# utility function that takes a bounding box in the form of x1,y1,x2,y2 and returns its area (w*h)
def area(b):
    """Return width times height of the box `b`, given as (x1, y1, x2, y2)."""
    x1, y1, x2, y2 = b[0], b[1], b[2], b[3]
    width = x2 - x1
    height = y2 - y1
    return width * height

# pair each bounding box with its class label, order the pairs from largest to smallest box area,
# and return the first pair (the largest object)
def get_largest(boxes):
    """Return the (bbox, label) pair with the largest bbox area from a (bboxes, labels) annotation."""
    candidates = L(zip(*boxes))
    # A stable descending sort keeps the earliest annotation when two boxes share the same area
    by_area_desc = sorted(candidates, key=lambda pair: area(pair[0]), reverse=True)
    return by_area_desc[0]
    
# walk all of the training annotations and keep only the largest object of each image
lrg_bbox = list(map(get_largest, lbl_bbox))
# Given the image file name and its (bounding box, label) pair, scale the bounding box to SIZE x SIZE
def squish_bbox(img_file,labeled_bbox):
    """Rescale a bounding box from the original image resolution to a SIZE x SIZE image.

    img_file:     file name of the image inside the `train` folder of the dataset.
    labeled_bbox: (bbox, label) pair where bbox is (x1, y1, x2, y2) in original pixels.

    Returns [scaled_bbox, label]. The x and y coordinates are scaled independently,
    so the aspect ratio is not preserved (the image is "squished").
    """
    bbox,label = labeled_bbox
    p = path/f'train/{img_file}'
    # Only the dimensions are needed; the context manager closes the file straight away
    # instead of leaving one open handle per training image.
    with Image.open(p) as img:
        h,w = img.shape
    yscale,xscale = h/SIZE,w/SIZE
    # Floor division snaps the scaled coordinates down to whole pixels
    scaled_bbox = (bbox[0]//xscale,bbox[1]//yscale,bbox[2]//xscale,bbox[3]//yscale)
    return [scaled_bbox,label]
    
# scale all of the bounding boxes in our 'largest' dataset using a list comprehension.
lrg_bbox_scaled = [squish_bbox(img_file,labeled_bbox) for img_file,labeled_bbox in zip(imgs,lrg_bbox)]

# map each image file name to its scaled [bbox, label] entry so it can be looked up by name
img2lrgbboxscaled = dict(zip(imgs,lrg_bbox_scaled))


This again gives a largest object dataset, along with appropriately scaled bounding boxes:

In [None]:
# take the second key (an image file name) of the lookup table and show it with its [bbox, label] entry
k = L(img2lrgbboxscaled)[1]; k, img2lrgbboxscaled[k]

# Visualisation

Let's start by looking at an image from the dataset, along with the bounding box and label:

In [None]:
def show_lrg_item(im, lbl_bbox, figsize=None, ax=None):
    """Show an image together with the bounding box and label of its largest object.

    im:       the image to display.
    lbl_bbox: [bbox, label] pair where bbox is (x1, y1, x2, y2).
    figsize:  figure size used only when a new figure has to be created.
    ax:       existing axis to draw on; a new figure/axis is created when it is None.
    """
    # Compare against None explicitly instead of relying on the truthiness of an axis object
    if ax is None: _, ax = plt.subplots(figsize=figsize)
    ax = show_img(im, ax=ax)
    bbox, label = lbl_bbox[0], lbl_bbox[1]
    # Convert corner form (x1, y1, x2, y2) into (x, y, width, height) for drawing
    rect = (*bbox[:2], bbox[2]-bbox[0]+1, bbox[3]-bbox[1]+1)
    draw_rect(ax, rect, color=colr_list[0])
    draw_text(ax, rect[:2], label, color=colr_list[0])

# pick the second image of the dataset and look its box up with the same key we open the file with,
# so this cell does not depend on `k` from an earlier cell
img_file = L(img2lrgbboxscaled)[1]
img_bbox_scaled = img2lrgbboxscaled[img_file]

# resize (squish) the image to SIZE x SIZE so that it matches the scaled bounding box
img_scaled = Image.open(path/'train'/img_file).resize((SIZE, SIZE))
show_lrg_item(img_scaled, img_bbox_scaled)

# Predicting Multiple Things
Now we want the bounding box, and the class label. The model needs to be able to access both during training and validation. 

To create the `DataLoaders` we need an array of getter functions which return the appropriate data. All the functions in the getters take the image file name as input. The first returns the full path, the second returns a tensor containing the four bounding box coordinates and the third returns the string class label for the largest object:

In [None]:
# Each getter receives an image file name and returns one element of the training sample.
def _get_img_path(o):
    # full path of the image on disk
    return path/'train'/o

def _get_lrg_bbox(o):
    # the four scaled bounding box coordinates of the largest object, as a float tensor
    return FloatTensor(img2lrgbboxscaled[o][0])

def _get_lrg_label(o):
    # class label of the largest object
    return img2lrgbboxscaled[o][1]

getters = [_get_img_path, _get_lrg_bbox, _get_lrg_label]
k, getters[0](k), getters[1](k), getters[2](k)

Now that we have more than two blocks in the `DataBlock`, we need to specify `n_inp = 1` so FastAI can understand that there is one input and two output blocks:

In [None]:
# The bounding boxes were rescaled to SIZE x SIZE with independent x and y factors (a "squish"),
# so the images have to be resized the same way for the boxes to line up with them.
item_tfms = [Resize(SIZE, method='squish')]

# One input block (the image) followed by two target blocks: the four box coordinates
# as a regression target and the class label as a category.
dblock = DataBlock(
    blocks=(ImageBlock, RegressionBlock(n_out=4), CategoryBlock),
    getters=getters,
    item_tfms=item_tfms,
    n_inp=1
)
dls = dblock.dataloaders(imgs, bs=128); len(dls.vocab), dls.vocab

We can view the batches. As in the last part, we use a custom function to see the bounding boxes. Note that when using the proper FastAI API this won't be necessary:

In [None]:
def bb_hw(a):
    """Convert a box from corner form (x1, y1, x2, y2) to an array (x, y, width, height)."""
    left, top, right, bottom = a[0], a[1], a[2], a[3]
    return np.array([left, top, right - left + 1, bottom - top + 1])

def show_batch(dls):
    """Plot up to nine images of one batch in a 3x3 grid with their bounding boxes and labels."""
    batch = dls.one_batch()
    batch_imgs, batch_boxes, batch_cats = batch[0], batch[1], batch[2]

    grid_axes = subplots(3,3)[1].flat
    for img, box, cat, ax in zip(batch_imgs[:9], batch_boxes[:9], batch_cats, grid_axes):
        show_img(img, ax=ax)
        # convert the box once and reuse it for both the rectangle and the label position
        rect = bb_hw(box.tolist())
        draw_rect(ax, rect, color=colr_list[0])
        draw_text(ax, rect[:2], dls.vocab[cat], color=colr_list[0])

# render one batch from the DataLoaders with our custom viewer
show_batch(dls)

We will continue to use the general `resnet34` backbone, but at this point the `vision_learner` API is not able to infer an appropriate head on its own. We will define a custom head. This is very similar to the ones we have seen before, but it needs to output the appropriate number of activations: in this case four for the bounding box and one for each possible class label:

In [None]:
# four activations for the bounding box plus one per class label
n_head_out = 4 + len(dls.vocab)

head_layers = [
    Flatten(),
    nn.ReLU(),
    nn.Dropout(0.5),
    # 25088 input features; presumably the flattened backbone output (512*7*7) - confirm against the body
    nn.Linear(25088, 256),
    nn.ReLU(),
    nn.BatchNorm1d(256),
    nn.Dropout(0.5),
    nn.Linear(256, n_head_out),
]
head_reg4 = nn.Sequential(*head_layers)

Clearly now some of the output activations are predicting the box, and some are predicting the class label. The loss function, not the model architecture, defines how the activations are used and will have to discriminate between those used for the bounding box and those used for the class label. The former is a regression problem, which can use an L1 loss measure. The class is a single-label classification suitable for cross entropy. As the loss function needs to return a scalar, we just sum these two values. Note that because the cross entropy loss is on a different scale, we multiply the cross entropy loss by a scaling factor. This is a hyperparameter which can be tuned:

In [None]:
# weight applied to the classification term so that it is comparable to the box term
CROSS_ENTROPY_SCALE = 20

def detn_loss(input, bb_t, c_t):
    """Combined loss: L1 on the bounding box plus scaled cross entropy on the class.

    input: model activations; columns 0-3 are the box, the remaining columns the class scores.
    bb_t:  target bounding boxes.
    c_t:   target class indices.
    """
    box_acts = input[:, :4]
    class_acts = input[:, 4:]
    # squash the box activations into [0, 1], then stretch them to pixel coordinates
    box_pred = box_acts.sigmoid() * SIZE

    box_loss = F.l1_loss(box_pred, bb_t)
    class_loss = F.cross_entropy(class_acts, c_t)
    return box_loss + class_loss * CROSS_ENTROPY_SCALE

It also helps to have some custom metrics. Remember the loss is a function for the computer, and the metric is a function for the user:

In [None]:
def detn_l1(input, bb_t, c_t):
    """Metric: L1 distance between the predicted and target bounding boxes (class target unused)."""
    # same box decoding as the loss: sigmoid, then scale to pixel coordinates
    box_pred = input[:, :4].sigmoid() * SIZE
    box_err = F.l1_loss(box_pred, bb_t)
    return box_err.data

def detn_ce(input, bb_t, c_t):
    """Metric: unscaled cross entropy of the class activations (box target unused)."""
    class_acts = input[:, 4:]
    class_err = F.cross_entropy(class_acts, c_t)
    return class_err.data

def detn_acc(input, bb_t, c_t):
    """Metric: classification accuracy of the class activations (box target unused)."""
    class_acts = input[:, 4:]
    return accuracy(class_acts, c_t)

Now we can create the learner

In [None]:
# resnet34 body with our custom head, trained with the combined loss and reporting all three metrics
learn = vision_learner(
    dls,
    resnet34,
    loss_func=detn_loss,
    custom_head=head_reg4,
    metrics=[detn_l1, detn_ce, detn_acc],
)
# the second element of the model is the head; display it to check it is the one we passed in
head = learn.model[1]
head

# Training
As before we find an appropriate learning rate

In [None]:
# run the learning rate finder; the `valley` attribute of its result is used as the base learning rate below
lrs = learn.lr_find()
lrs

And train:

In [None]:
# fine-tune the model using the learning rate suggested by the finder
learn.fine_tune(20, base_lr=lrs.valley)

# Analysis

Let's check out some predictions to see how we did. As before we make some helper functions:

In [None]:
# Some functions to visualize the predictions of our model.
def bb_hw(a):
    """Convert a box from corner form (x1, y1, x2, y2) to an array (x, y, width, height)."""
    left, top, right, bottom = a[0], a[1], a[2], a[3]
    return np.array([left, top, right - left + 1, bottom - top + 1])

def show_preds(learner,valid=True):
    """Plot labelled boxes (left column) next to the model's predictions (right column).

    learner: the trained learner whose model and DataLoaders are used.
    valid:   take the batch from the validation set when True, otherwise from the training set.

    Shows up to nine images from one shuffled batch.
    """
    # Use the learner that was passed in rather than a global, so any learner can be inspected
    dl = learner.dls[1 if valid else 0].new(shuffle=True)
    imgs,boxes,labels = dl.one_batch()
    learner.model.eval()
    # Inference only: no need to track gradients for the forward pass
    with torch.no_grad():
        preds = learner.model(imgs)

    fig,axs = subplots(9,2)
    for img,box,label,pred,ax in zip(imgs,boxes,labels,preds,axs):
        # rescale the image to [0, 1] so it displays properly
        img = (img-img.min())/(img.max()-img.min())
        # decode the activations exactly as the loss does: sigmoid * SIZE for the box, argmax for the class
        bb_pred = pred[:4].sigmoid()*SIZE
        cl_pred = pred[4:].argmax()
        true_rect = bb_hw(box.tolist())
        pred_rect = bb_hw(bb_pred.tolist())
        show_img(img,ax=ax[0])
        draw_rect(ax[0],true_rect,color=colr_list[0])
        draw_text(ax[0],true_rect[:2],learner.dls.vocab[label],color=colr_list[0])
        show_img(img,ax=ax[1])
        draw_rect(ax[1],pred_rect,color=colr_list[0])
        draw_text(ax[1],pred_rect[:2],learner.dls.vocab[cl_pred],color=colr_list[0])
    fig.tight_layout()
    axs[0][0].set_title('label')
    axs[0][1].set_title('prediction')
    plt.show()

In [None]:
# compare labels and predictions on a batch from the validation set
show_preds(learn)

The ground truth is on the left, and the predictions are on the right.