# Reload

This is just some code from the first part. It assumes that the dataset was downloaded as in the first notebook and that the COCO-format JSON files were generated. We continue to work with the three-channel composite images generated in part 1.

In [None]:
from fastai.vision.all import *

import matplotlib.colors as mcolors
import matplotlib.cm as cmx
from matplotlib import patches, patheffects

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Side length (in pixels) that images are resized to throughout this notebook.
SIZE = 512

# Dataset root, resolved relative to the current working directory.
path = Path.cwd().joinpath("data")

# render an image on a gridded axis without tick labels, so annotations can be layered on top
def show_img(im, figsize=None, ax=None):
    """Draw `im` on `ax` (creating a new axis when none is given) and return the axis.

    A grid with 8 evenly spaced ticks between 0 and SIZE is drawn on both
    axes, and the tick labels are hidden.
    """
    if not ax:
        _, ax = plt.subplots(figsize=figsize)
    show_image(im, ax)  # fastai helper that paints the image onto the axis
    grid_ticks = np.linspace(0, SIZE, 8)
    ax.set_xticks(grid_ticks)
    ax.set_yticks(grid_ticks)
    ax.grid()
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    return ax

# outline an artist in black; gives text and boxes enough contrast to be readable on any image
def draw_outline(o, lw):
    """Apply a black stroke of width `lw` behind the normal rendering of artist `o`."""
    black_stroke = patheffects.Stroke(linewidth=lw, foreground='black')
    o.set_path_effects([black_stroke, patheffects.Normal()])

# write outlined text at the given position so that it stands out against the image
def draw_text(ax, xy, txt, sz=14, color='white'):
    """Place bold text `txt` on `ax` at position `xy`, top-aligned, with a thin black outline."""
    label_style = dict(verticalalignment='top', color=color, fontsize=sz, weight='bold')
    label = ax.text(*xy, txt, **label_style)
    draw_outline(label, 1)

def draw_rect(ax, b, color='white'):
    """Draw an unfilled, outlined rectangle on `ax`.

    `b` holds the rectangle origin in its first two entries and the width
    and height in its last two entries.
    """
    origin = b[:2]
    width, height = b[-2:]
    rect = patches.Rectangle(origin, width, height, fill=False, edgecolor=color, lw=2)
    draw_outline(ax.add_patch(rect), 4)

def get_cmap(N):
    """Return a function mapping values in [0, N-1] to RGBA colours from the 'Set3' colormap."""
    value_range = mcolors.Normalize(vmin=0, vmax=N - 1)
    mapper = cmx.ScalarMappable(norm=value_range, cmap='Set3')
    return mapper.to_rgba

# palette of distinct colours used when rendering bounding boxes
num_colr = 12
cmap = get_cmap(num_colr)
colr_list = list(map(cmap, map(float, range(num_colr))))

# Load the train and validation annotations, then merge them into a single dataset
train_imgs, train_lbl_bbox = get_annotations(path.joinpath('composite', 'train', 'data.json'))
valid_imgs, valid_lbl_bbox = get_annotations(path.joinpath('composite', 'val', 'data.json'))
# Prefix every file name with its split folder so the split can be recovered from the name later
imgs = [
    *(f"train/{each}" for each in train_imgs),
    *(f"val/{each}" for each in valid_imgs),
]
lbl_bbox = train_lbl_bbox + valid_lbl_bbox

# utility function that takes a bounding box in the form of x1,y1,x2,y2 and returns its area (w*h)
def area(b):
    """Return the area of box `b`, given as (x1, y1, x2, y2)."""
    return (b[2] - b[0]) * (b[3] - b[1])


# pair each bounding box with its object class and return the pair whose box has the largest area
def get_largest(boxes):
    """Return the (bbox, label) pair with the largest bounding-box area.

    `boxes` is a pair of parallel sequences `(bboxes, labels)`. When several
    boxes share the largest area, the first one wins.

    Raises:
        ValueError: if `boxes` contains no bounding boxes.
    """
    # max() finds the largest element in a single pass; there is no need to
    # sort every pair just to keep the first one.
    return max(zip(*boxes), key=lambda pair: area(pair[0]))
    
# for every image, keep only the largest object (bounding box and label)
lrg_bbox = list(map(get_largest, lbl_bbox))

This takes us back to the point where we had a "largest object dataset" which for a given image contains the largest object, and its bounding box.

# Scaling
In this part we are going to look in more depth at the bounding boxes. As these are defined in terms of image coordinates, and we scale images down to `SIZE` we also need to scale the bounding boxes. For now we can do this in the dataset:

In [None]:
# Given the image file name and its labelled bounding box, scale the box to a SIZE x SIZE image
def squish_bbox(img_file, labeled_bbox):
    """Rescale a bounding box from original image coordinates to a SIZE x SIZE image.

    Args:
        img_file: image file name relative to the `composite` folder under `path`.
        labeled_bbox: a `(bbox, label)` pair where `bbox` is (x1, y1, x2, y2).

    Returns:
        `[scaled_bbox, label]`, with each coordinate floor-divided by the
        per-axis scale factor (original dimension / SIZE).
    """
    bbox, label = labeled_bbox
    # Only the dimensions are needed; the context manager closes the file
    # handle right away instead of leaving it open for every image.
    with Image.open(path/"composite"/img_file) as img:
        w, h = img.size  # PIL reports (width, height)
    yscale, xscale = h/SIZE, w/SIZE
    scaled_bbox = (bbox[0]//xscale, bbox[1]//yscale, bbox[2]//xscale, bbox[3]//yscale)
    return [scaled_bbox, label]

# Compare the original and the scaled box for one sample
lrg_bbox[1], squish_bbox(imgs[1], lrg_bbox[1])

Now we can scale all the bounding boxes in the "largest" dataset:

In [None]:
# Scale the largest-object box of every image; file names and boxes are paired up positionally
lrg_bbox_scaled = list(map(squish_bbox, imgs, lrg_bbox))

We can make a `dict` so the dependent variables can be easily recalled given the independent image file name

In [None]:
# Lookup table: image file name -> [scaled bbox, label] of its largest object
img2lrgbboxscaled = {name: scaled for name, scaled in zip(imgs, lrg_bbox_scaled)}
# Pick the second key as a sample and show its entry
k = L(img2lrgbboxscaled)[1]
k, img2lrgbboxscaled[k]

# Visualisation
Now we ignore class labels, and just work with the bounding box for the largest object present:

In [None]:
def show_lrg_item(im, lbl_bbox, figsize=None, ax=None):
    """Show image `im` with the bounding box stored in `lbl_bbox[0]` drawn on top.

    The box is given as (x1, y1, x2, y2) and is converted to
    (x, y, width, height) before drawing.
    """
    if not ax:
        _, ax = plt.subplots(figsize=figsize)
    ax = show_img(im, ax=ax)
    corners = lbl_bbox[0]
    left, top = corners[0], corners[1]
    rect = (left, top, corners[2] - left + 1, corners[3] - top + 1)
    draw_rect(ax, rect, color=colr_list[0])

# Pick the second image as a sample and look up its scaled box by that same file name,
# rather than relying on a variable left over from an earlier cell.
img_file = L(img2lrgbboxscaled)[1]
img_bbox_scaled = img2lrgbboxscaled[img_file]

# Resize the image to SIZE x SIZE so that it matches the scaled bounding box
img_scaled = Image.open(path/'composite'/img_file).resize((SIZE, SIZE))
show_lrg_item(img_scaled, img_bbox_scaled)

# Training a BB Prediction Model
When we made the prior model to classify images by the largest object, fastai itself did a lot. It constructed a model with the resnet34 architecture, and initialised the parameters with a set of pretrained weights. This is *transfer learning*: the weights are obtained by training the model on the ImageNet dataset, which has over a million images across 1,000 classes. The early layers of the model are great for detecting edges or primitive shapes common to any images. This starting point gives a much more efficient descent than using random initialisation.

In [None]:
# Getters: the first maps an image file name to its full path, the second to the
# class label of the largest object in that image.
getters = [lambda o: path/'composite'/o, lambda o: img2lrgbboxscaled[o][1]]

item_tfms = [Resize(SIZE, method='squish'),]
batch_tfms = [Rotate(10), Flip(), Dihedral()]
dblock = DataBlock(blocks=(ImageBlock, CategoryBlock),
                   getters=getters,
                   item_tfms=item_tfms,
                   batch_tfms=batch_tfms,
                   # Items look like "val/<file>". Compare the folder *name*: a Path never
                   # compares equal to a plain string, which would leave the validation set empty.
                   splitter = FuncSplitter(lambda o: Path(o).parent.name == 'val'))
dls = dblock.dataloaders(imgs, bs = 128)
learn = vision_learner(dls,resnet34)

As we have different class labels in this dataset the latter layers are less useful. FastAI's `vision_learner` adapts the model to our dataset by cutting it into two parts. The pretrained part is retained, and referred to as the backbone of our network. The latter part (the head) is discarded, and replaced with a dynamically created sequence of linear layers with twenty output activations to match the number of classes we want to predict. The head is randomly initialised and trained to map from the backbone features to the desired model output.

In [None]:
# The first part of the model is the backbone
backbone = learn.model[0]
backbone

In [None]:
# The second part of the model is the head
head = learn.model[1]
head

The most interesting thing here is the last layer of the head. The `out_features` give one activation for every one of the twenty classes. When we train with CrossEntropy loss the model drives the twenty activations to output probabilities for each of the twenty classes. 

The bounding box problem is distinct. The bounding box contains four numbers: the first two are the upper-left coordinates of the rectangle and the latter two the lower-right coordinates. The network now has to output four continuous numbers. To get continuous numbers we need to use regression. We can continue to use the same `resnet34` backbone but the head needs to change.

## Bounding Box Model
We define a getter for training the model, which given an image file name returns the full path and a tensor with the four bounding box coordinates of the largest object in the image:

In [None]:
# Getters for the regression task: full image path, and the four box coordinates as a float tensor
getters = [
    lambda name: path/'composite'/name,
    lambda name: FloatTensor(img2lrgbboxscaled[name][0]),
]
# Try both getters on the sample key
k, getters[0](k), getters[1](k)

In computer vision we often want to randomly transform the training images by rotation or flipping to increase diversity in the input images and reduce the likelihood of overfitting on the training data. If we do that now we would also need to do the same transformations on the bounding boxes. We neglect that for now, building a model without any image transforms or augmentations. This will be improved below:

In [None]:
# No augmentations here: any geometric transform of the image would have to be applied
# to the bounding box as well.
item_tfms = [Resize(SIZE, method="squish"),]
dblock = DataBlock(blocks=(ImageBlock, RegressionBlock(n_out=4)),
                   getters=getters,
                   item_tfms=item_tfms,
                   # Items look like "val/<file>". Compare the folder *name*: a Path never
                   # compares equal to a plain string, which would leave the validation set empty.
                   splitter = FuncSplitter(lambda o: Path(o).parent.name == 'val'))
dls = dblock.dataloaders(imgs, bs = 12)

Now we can look at a sample batch:

In [None]:
# Render a sample batch using the DataLoaders' built-in display
dls.show_batch()

Now for FastAI the bounding box edges are just numbers. We can visualise:

In [None]:
def bb_hw(a):
    """Convert a box from (x1, y1, x2, y2) to an array (x, y, width, height).

    Width and height are computed inclusively, i.e. `x2 - x1 + 1` and `y2 - y1 + 1`.
    """
    left, top = a[0], a[1]
    width = a[2] - left + 1
    height = a[3] - top + 1
    return np.array([left, top, width, height])

def show_batch(dls):
    """Show up to nine images from one batch of `dls` in a 3x3 grid, each with its target box."""
    batch = dls.one_batch()
    images, boxes = batch[0][:9], batch[1][:9]

    grid = subplots(3, 3)[1]
    for image, target, axis in zip(images, boxes, grid.flat):
        show_img(image, ax=axis)
        rect = bb_hw(target.tolist())
        draw_rect(axis, rect, color=colr_list[0])

show_batch(dls)

Again we use the vision learner API:

In [None]:
# Build a resnet34 learner that is trained with an L1 loss, then inspect its head
learn = vision_learner(dls, resnet34, loss_func=L1LossFlat())
head = learn.model[1]
head

The default loss for regression is MSE, but this heavily penalises larger differences between predictions and targets. An L1 loss is probably more appropriate, so that is what we passed to the learner above. Next we look for a suitable learning rate:

In [None]:
# Run the learning-rate finder and keep its suggestions; the `valley` value is used for fine-tuning below
lrs = learn.lr_find()
lrs

# Training
We use the fine tune method as before:

In [None]:
# Fine-tune with 30 as the epoch count, using the `valley` learning rate found above as the base rate
learn.fine_tune(30, base_lr=lrs.valley)

# Evaluation
Now let's look at the predictions and see how the model is doing:

In [None]:
def show_preds(learner, valid=False):
    """Plot target boxes next to predicted boxes for up to nine images of one batch.

    Args:
        learner: the learner whose model and dataloaders are used.
        valid: take the batch from the validation dataloader when True,
            otherwise from the training dataloader.

    The left column shows the labelled box, the right column the predicted box.
    """
    # Use the learner that was passed in, not whatever global `learn` happens to exist.
    learner.model.eval()
    dl = learner.dls[1 if valid else 0].new(shuffle=True)
    xb, yb = dl.one_batch()
    # Inference only: no gradients are needed for plotting.
    with torch.no_grad():
        preds = learner.model(xb).tolist()

    fig, axs = subplots(9, 2)
    for img, box, pred, ax in zip(xb, yb, preds, axs):
        # Rescale to [0, 1] so the image displays correctly
        img = (img - img.min()) / (img.max() - img.min())
        show_img(img, ax=ax[0])
        draw_rect(ax[0], bb_hw(box.tolist()), color=colr_list[0])
        show_img(img, ax=ax[1])
        draw_rect(ax[1], bb_hw(pred), color=colr_list[0])
    fig.tight_layout()
    axs[0][0].set_title('label')
    axs[0][1].set_title('prediction')
    plt.show()

show_preds(learn)

These results are quite reasonable!

