# Read from Kaggle, unzip file

In [None]:
import urllib.request as urlrequest
from pathlib import Path

# Local data layout: ./data/dataset.zip is the downloaded archive and
# ./data/dataset is the folder the extracted files are renamed to.
base_data_path = Path() / "data"

base_data_path.mkdir(parents = True, exist_ok = True)

dataset_location = base_data_path / "dataset"

zip_path = base_data_path / "dataset.zip"

# Skip the download when the archive is already present.
if not zip_path.exists():

    dataset_url = "https://www.kaggle.com/api/v1/datasets/download/alvarole/asirra-cats-vs-dogs-object-detection-dataset"

    # Download into a sibling ".part" file and rename it at the end, so an
    # interrupted transfer never leaves a truncated dataset.zip behind
    # (which the exists() check above would treat as complete).
    partial_path = zip_path.with_name(zip_path.name + ".part")

    # The context manager closes the connection even if writing fails.
    with urlrequest.urlopen(dataset_url) as response:

        # Value of the Content-Length header (a string, or None if absent).
        download_size = response.getheader("Content-Length")

        with open(partial_path, "wb") as f:
            # Stream in 1 MiB chunks instead of holding the archive in memory.
            while chunk := response.read(1 << 20):
                f.write(chunk)

    partial_path.replace(zip_path)


In [None]:

import zipfile
import os

def unzip():
    '''
    Extract the archive at `zip_path` into `base_data_path` and rename
    the extracted folder to `dataset_location`.

    Raises OSError if the rename fails, e.g. when the expected folder
    is missing from the archive.
    '''

    # name of the folder that is expected to come out of the archive
    inner_file = "Asirra_ cat vs dogs"

    # `archive` rather than `zip`, to keep the builtin usable
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(base_data_path)

    os.rename(base_data_path / inner_file, dataset_location)

# unzip()


# Dataset creation

In [None]:

import itertools
import xml.etree.ElementTree as ET
import torch
from typing import TypedDict


def patched_dataset_paths(dataset_location):

    return itertools.batched(dataset_location.iterdir(), 2)

class Objects(TypedDict):
    '''
    One labelled object of an image, as built by `read_metadata` from an
    `object` element of the annotation file.

    `bndbox`: (xmin,ymin,xmax,ymax)
    '''

    # text of the `name` element; later used as key into the class mapping
    name: str
    # text of the `pose` element
    pose: str
    # `truncated` / `difficult` elements converted with int()
    truncated: int
    difficult: int
    # float tensor built from the children of the `bndbox` element
    bndbox: torch.Tensor

class Metadata(TypedDict):
    '''
    Labeling of one image, as returned by `read_metadata`.

    `size`: (width, height, depth)
    '''

    # integer tensor built from the children of the `size` element
    size: torch.Tensor
    # one entry per `object` element of the annotation file
    objects: list[Objects]

class MetaWithImage(Metadata):
    '''
    `Metadata` extended with the location of the image it describes.
    '''

    # NOTE(review): `get_dataset` stores the entry yielded by
    # `Path.iterdir()` here, i.e. a Path rather than a str -- confirm
    # which type is intended.
    img_path: str

# specific xml reader implementation for the lolz
def read_metadata(xml_file: Path) -> Metadata:
    '''
    Read the labeling of one image from an xml file into a dict.

    `xml_file`:
    - Annotation file with a `size` element and any number of `object`
    elements, each having `name`, `pose`, `truncated`, `difficult` and
    `bndbox` children.

    Returns a dict with
    - `size`: integer tensor of the values under `size`, in document
    order (expected to be width, height, depth).
    - `objects`: one dict per `object` element; its `bndbox` is a float
    tensor of the values under `bndbox`, in document order (expected
    to be xmin, ymin, xmax, ymax).

    Raises ValueError if the file has no `size` element.
    '''

    # strip_text removes the whitespace around element text, so the
    # values below need no further stripping
    with open(xml_file, "r", encoding = "utf-8") as f:
        text = ET.canonicalize(from_file = f, strip_text = True)

    tree = ET.fromstring(text)

    size_elem = tree.find("size")
    if size_elem is None:
        raise ValueError(f"no <size> element in {xml_file}")

    # iterating an element yields its direct children only
    size = torch.tensor([int(child.text) for child in size_elem])

    objects: list[Objects] = [
        dict(
            name = obj.find("name").text,
            pose = obj.find("pose").text,
            truncated = int(obj.find("truncated").text),
            difficult = int(obj.find("difficult").text),
            bndbox = torch.tensor([
                float(child.text)
                for child in obj.find("bndbox")
            ])
        )
        for obj in tree.findall("object")
    ]

    metadata: Metadata = dict(
        size = size,
        objects = objects
    )

    return metadata

def get_dataset(dataset_location) -> list[MetaWithImage]:
    '''
    Read the labeling of every sample found in `dataset_location`.

    Each entry is the result of `read_metadata` for the annotation file,
    extended with `img_path`, the location of the matching image. The
    path is stored as `str`, as declared by `MetaWithImage`.

    Raises ValueError if `patched_dataset_paths` yields a group that is
    not an (image, annotation) pair.
    '''

    return [
        read_metadata(xml_path) | dict(img_path = str(img))
        for img, xml_path in patched_dataset_paths(dataset_location)
    ]

def dataset_splits(
    dataset: list[MetaWithImage] | None = None,
    fractions: tuple[float, ...] = (0.8, 0.1, 0.1),
    generator: torch.Generator | None = None,
):
    '''
    Randomly split `dataset` into one subset per entry of `fractions`.

    `dataset`:
    - Labeling to split; read from `dataset_location` with
    `get_dataset` when None.

    `fractions`:
    - Passed on to `torch.utils.data.random_split` as the lengths.

    `generator`:
    - Optional random generator; pass a seeded one to get the same
    split on every run. The default generator is used when None.

    Returns the list of subsets produced by `random_split`.
    '''

    if dataset is None:
        dataset = get_dataset(dataset_location)

    if generator is None:
        return torch.utils.data.random_split(dataset, fractions)

    return torch.utils.data.random_split(dataset, fractions, generator = generator)



    


In [None]:
import torch
from torchvision.io import read_image
import torchvision.transforms.v2.functional as tvt

class CatsAndDogsDataset(torch.utils.data.Dataset):
    '''
    Dataset of (image, metadata) pairs.

    The metadata is transformed once at construction, the image is read
    from disk and resized on every access.
    '''

    def __init__(self, data: list[MetaWithImage], resize_to = (300,300)):
        '''
        `data`:
        - Labeling of the samples, e.g. one of the subsets returned by
        `dataset_splits`. The entries are not modified.

        `resize_to`:
        - should be square
        '''

        self.resize_to = resize_to
        self.data = [self.metadata_transform(val) for val in data]

    def __len__(self):
        return len(self.data)

    def image_transform(self, img):
        '''
        Resize `img` to `resize_to`.
        '''

        return tvt.resize(img, self.resize_to)

    def metadata_transform(self, metadata: MetaWithImage):
        '''
        Transform the bndbox values to be in the range [0,1] and set
        `size` to the size of the resized image.

        Returns a new dict with new object dicts; `metadata` is left
        untouched, so transforming the same entry again (e.g. by building
        a second dataset from the same data) does not divide the boxes
        twice.
        '''

        resize_x, resize_y = self.resize_to
        width, height, depth = metadata["size"]

        # divisor lined up with (xmin, ymin, xmax, ymax)
        extent = torch.stack([width, height, width, height])

        objects = [
            {**obj, "bndbox": obj["bndbox"]/extent}
            for obj in metadata["objects"]
        ]

        return {
            **metadata,
            "objects": objects,
            "size": torch.tensor([resize_x, resize_y, int(depth)]),
        }

    def __getitem__(self, idx):
        '''
        Return the resized image and the transformed metadata of sample
        `idx`.
        '''

        metadata = self.data[idx]
        # str() so that a Path stored in the metadata is accepted as well
        image = read_image(str(metadata["img_path"]))
        return self.image_transform(image), metadata


In [None]:
# Split the labeling at random, then wrap every split in a dataset
# (which also normalises its bounding boxes).
train_split, validation_split, test_split = (
    CatsAndDogsDataset(split) for split in dataset_splits()
)

print(len(train_split))

# Test datasets with plotting

In [None]:
import matplotlib.pyplot as plt
import torchvision.transforms.v2.functional as vision_transforms
from torchvision.utils import draw_bounding_boxes

def to_plottable(img):
    '''
    Convert `img` to a PIL image, which `plt.imshow` can display.
    '''

    pil_image = vision_transforms.to_pil_image(img)
    return pil_image

def add_bb(img, meta: MetaWithImage):
    '''
    Draw the bounding box of every object in `meta` onto `img`.

    The stored boxes are relative to the image size, so they are scaled
    by the width and height in `meta["size"]` before drawing. Each
    scaled box is also printed. Returns the image with the boxes drawn.
    '''

    width, height, _ = (val.item() for val in meta["size"])
    # multiplier lined up with (xmin, ymin, xmax, ymax)
    to_pixels = torch.tensor([width, height, width, height])

    for labelled in meta["objects"]:
        pixel_box = labelled["bndbox"].reshape((-1, 4))*to_pixels
        print(pixel_box)
        img = draw_bounding_boxes(img, pixel_box, colors = "cyan")

    return img

# Show one randomly picked training sample with its bounding boxes drawn.
plt.figure()
# torch.rand is in [0, 1), so the scaled value truncates to a valid index
im, meta = train_split[int(torch.rand(1).item()*len(train_split))]
im = add_bb(im, meta)
plt.imshow(to_plottable(im))
print(meta["objects"])
plt.show()


# Test IoU calculation

In [None]:
import src.default_box as default_box
import src.utils.math as math_utils


# Compare the first ground truth box of the first training sample
# against a set of generated default boxes.
im, meta = train_split[0]
bndbox = meta["objects"][0]["bndbox"].reshape([-1,4])
width, height, _ = [val.item() for val in meta["size"]]

print(width, height)
# one box center per 8x8 block of the image -- presumably; the meaning
# of the arguments is defined in src.default_box (TODO confirm)
boxes = default_box.default_boxes(
    scale = 0.8,
    centers = default_box.default_box_centers(width//8, height//8)
)

# flatten (centers, ratios, 4) into one row per default box
num_boxes, num_ratios, _ = boxes.shape
boxes = boxes.reshape([num_boxes*num_ratios, 4])

print(bndbox)
print(boxes.shape)
# `boxes` already has this shape, the reshape here is a no-op
iou = math_utils.intersection_over_union(boxes.reshape([num_boxes*num_ratios, 4]), bndbox)
print(iou.shape)
# NOTE(review): this reshape only regroups the elements; whether the
# axes really mean (ratio, ground truth box, center) depends on the
# layout returned by intersection_over_union -- confirm before
# indexing into it.
iou = iou.reshape([num_ratios, len(bndbox), num_boxes])
print(iou.shape)

# number of default boxes reaching each overlap threshold
print([(iou >= val).sum() for val in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]])


# Design network and training setup

See the [SSD paper](https://arxiv.org/abs/1512.02325); [Dive into Deep Learning](https://d2l.ai/) also has good practical material and examples.

SSD consists of a base network followed by successive prediction layers
which generate the class predictions and bounding box offsets at different
scales. 

- The base network downsamples the input, decreasing the width and height
while adding more channels.

- The prediction layers make the predictions
by convolving over their input and outputting a value for each class
and each default box offset. 
    * The output from a layer is further downsampled by e.g. pooling,
    creating a larger receptive field for the next layer. As a result,
    the scale of default boxes should increase further into the net.


## Network

In [None]:
import torch
from math import floor

import src.utils.reshape as reshape

def down_sampler(in_channels, out_channels):
    '''
    Build one down-sampling stage of the base network: a 3x3 convolution
    with padding 1 from `in_channels` to `out_channels`, a ReLU and a
    max pooling with kernel size 2.
    '''

    convolution = torch.nn.Conv2d(
        in_channels = in_channels,
        out_channels = out_channels,
        kernel_size = 3,
        padding = 1
    )
    activation = torch.nn.ReLU()
    pooling = torch.nn.MaxPool2d(kernel_size = 2)

    return torch.nn.Sequential(convolution, activation, pooling)

def prediction_layer(in_channels, num_classes, num_ratios):
    '''
    Build the two prediction heads applied to one feature map.

    Returns a ModuleDict with
    - `class_pred`: 3x3 convolution (padding 1) with
    `num_classes*num_ratios` output channels.
    - `box_pred`: 3x3 convolution (padding 1) with `4*num_ratios`
    output channels.
    '''

    def head(out_channels):
        # both heads keep the spatial size of their input
        return torch.nn.Conv2d(
            in_channels = in_channels,
            out_channels = out_channels,
            kernel_size = 3,
            padding = 1
        )

    heads = torch.nn.ModuleDict()
    heads["class_pred"] = head(num_classes*num_ratios)
    heads["box_pred"] = head(4*num_ratios)

    return heads

def max_pool_change(in_size):
    '''
    Size of one spatial dimension after torch.nn.MaxPool2D(kernel_size = 2)
    has been applied to an input of size `in_size`.

    Written as the general pooling output-size formula with the
    defaults of that layer filled in.
    '''

    kernel_size = 2
    stride = kernel_size
    padding = 0
    dilation = 1

    covered = dilation*(kernel_size - 1)

    return floor((in_size + 2*padding - covered - 1)/stride + 1)

def repeat_apply(func, _input, num):
    '''
    Apply `func` to `_input` `num` times in a row, feeding every result
    into the next call, and return the last result. `_input` is
    returned as is when `num` is zero or negative.
    '''

    result = _input
    for _step in range(num):
        result = func(result)

    return result

def generate_default_boxes(width, scale):
    '''
    Default boxes of the given `scale` for a square feature map with
    `width` positions along each side.
    '''

    centers = default_box.default_box_centers(width, width)

    return default_box.default_boxes(scale = scale, centers = centers)

class SSD(torch.nn.Module):
    '''
    Single shot detector: a down-sampling base network followed by
    prediction layers applied to successively max-pooled feature maps.

    `num_classes`: int

    `num_ratios`: int
    - Number of ratios used for the default boxes.

    `prediction_layers`: torch.nn.ModuleList
    - One ModuleDict (`class_pred`, `box_pred`) per feature map layer.
    Kept in a ModuleList so that the parameters are registered with the
    model, i.e. show up in `parameters()` and `state_dict()` and follow
    `.to()`.

    `default_boxes`: torch.Tensor
    - default boxes for each feature map layer, of shape
    (num_ratios*feature_map_height*feature_map_width,4). Indexing
    over boxes per pixel works by [num_ratios*i:num_ratios*(i+1), 4].
    Registered as a non-persistent buffer: it follows the model across
    devices but is not stored in the state dict.

    `pixels_per_layer`: list of int
    - pixels per feature map layer, useful for iterating over `default_boxes`
    in layer-by-layer manner.
    '''

    def __init__(self, num_classes, num_ratios, in_channels = 3, width = 300):
        '''
        `num_classes`:
        - Number of classes including background "class".

        `num_ratios`:
        - Number of default boxes per feature map pixel.

        `in_channels`:
        - Number of channels of the input image.

        `width`:
        - Side length of the (square) input image the default boxes are
        pre-generated for.
        '''

        super().__init__()

        base_channels = [in_channels,9,27,81]

        self.num_classes = num_classes
        self.num_ratios = num_ratios

        self.max_pool = torch.nn.MaxPool2d(kernel_size = 2)

        self.base_network = torch.nn.Sequential(*[
            down_sampler(base_channels[i], base_channels[i+1])
            for i in range(len(base_channels)-1)
        ])

        # a plain list would hide these parameters from the optimizer
        self.prediction_layers = torch.nn.ModuleList([
            prediction_layer(base_channels[-1], num_classes, num_ratios)
            for _ in range(3)
        ])

        # pre-generate the default boxes
        # ------------------------------
        # side length of the feature map coming out of the base network
        start_size = repeat_apply(max_pool_change, width, len(base_channels)-1)

        scales = default_box.scales(len(self.prediction_layers))

        def_boxes = []
        self.pixels_per_layer = []

        for i in range(len(self.prediction_layers)):

            def_box = generate_default_boxes(start_size, scales[i])
            pixels = def_box.shape[0]
            self.pixels_per_layer.append(pixels)
            # shape to match the output from the box predictions:
            # [pixels*ratios, boxes]
            def_box = (
                def_box
                .flatten(start_dim = 1)
                .reshape([pixels*num_ratios, 4])
            )
            def_boxes.append(def_box)

            # forward() pools the feature map between prediction layers
            start_size = max_pool_change(start_size)

        self.register_buffer(
            "default_boxes",
            torch.vstack(def_boxes),
            persistent = False
        )
        # ===============================

    def forward(self, x):
        '''
        Return dict with keys class_preds, box_preds,
        both a list of torch.Tensor of predictions per feature map layer.

        The class predictions are log-probabilities (log_softmax over
        dim 1); the w/h box offsets (columns 2 and 3) are passed through
        softplus.
        '''

        X = self.base_network(x)

        class_preds = []
        box_preds = []
        for layer in self.prediction_layers:

            # predict classes
            class_pred = reshape.items_per_pixel(
                layer["class_pred"](X),
                self.num_classes
            )
            class_preds.append(
                torch.nn.functional.log_softmax(class_pred, dim=1)
            )

            # predict box offsets
            box_pred = reshape.items_per_pixel(layer["box_pred"](X), 4)
            # restrict w/h offset to > 0 (w/h used to scale width and height)
            box_pred[:,[2,3]] = torch.nn.functional.softplus(box_pred[:,[2,3]])

            box_preds.append(box_pred)

            # larger receptive field for the next prediction layer
            X = self.max_pool(X)

        return dict(
            class_preds = class_preds,
            box_preds = box_preds
        )


In [None]:
from itertools import accumulate


# Build a model with 3 classes (background included) and 5 ratios and
# check that its outputs line up with the pre-generated default boxes.
model = SSD(3, 5)

# test properties
print(model.default_boxes.shape)
print(model.pixels_per_layer)

# single image without batch dimension
dummy_img = torch.rand([3,300,300])

res = model(dummy_img)
# NOTE: reusing double quotes inside the f-string needs Python 3.12+
print(f"{torch.vstack(res["class_preds"]).shape=}")

# NOTE(review): `bpreds` is not used again in this cell
bpreds = res["box_preds"][0]
print(res["box_preds"][0].shape)

print(model.default_boxes[:5,:])

# should match in dimensions
box_preds = torch.vstack(res["box_preds"])
def_boxes = model.default_boxes

box_sum = model.default_boxes + box_preds
print(f"{box_sum.shape=}")

# how to go over feature layers
# running total of pixels: entry i is the pixel count up to and including layer i
layer_indices = list(accumulate(model.pixels_per_layer))
print(layer_indices)

# the rows after the second-to-last boundary must be exactly the
# boxes of the last layer
assert torch.all(
    torch.tensor(box_sum[layer_indices[-2]*model.num_ratios:,:].shape) 
    == torch.tensor([model.pixels_per_layer[-1]*model.num_ratios, 4])
)


## utilities for calculating loss

In [None]:

print(train_split[0])

# Metadata of the first training sample. Every train_split[...] access
# reads and resizes the image, so it is fetched once and reused below.
dat = train_split[0][1]

# class name -> class index, with 0 reserved for the background
class_dict = dict(bkg = 0, cat = 1, dog = 2)

# ground truth boxes, one row per object, and their class indices
bndbox = torch.vstack([ob["bndbox"] for ob in dat["objects"]])
clslist = torch.tensor([class_dict[ob["name"]] for ob in dat["objects"]])

def classes_and_boxes_truth(
    iou: torch.Tensor,
    ground_truth_classes: torch.Tensor,
    threshold = 0.5,
) -> tuple[torch.Tensor, torch.Tensor]:
    '''
    Calculate a tensor indicating which default box is considered
    to overlap which class and which ground truth box.

    Return shape is (number of default boxes), with each element
    a class indicator index (background is zero) or box index (-1 is
    no box).

    `iou`:
    - The intersection-over-union of default boxes and ground truth
    boxes, as per

    > utils.math.intersection_over_union(model.default_boxes, ground_truth_boxes)

    with one row per ground truth box and one column per default box.

    `ground_truth_classes`:
    - Class index of every ground truth box, in row order of `iou`.

    `threshold`:
    - A default box is background unless its best overlap is strictly
    greater than this value.
    '''

    num_default_boxes = iou.shape[-1]

    # without any ground truth box there is nothing to take the maximum
    # over: every default box is background
    if iou.shape[0] == 0:
        classes = torch.zeros(
            num_default_boxes,
            dtype = ground_truth_classes.dtype,
            device = iou.device
        )
        boxes = torch.full(
            (num_default_boxes,),
            -1,
            dtype = torch.long,
            device = iou.device
        )
        return classes, boxes

    # best matching ground truth box for every default box
    matches_max = iou.max(dim=0)
    boxes = matches_max.indices
    classes = ground_truth_classes[boxes]

    background = matches_max.values <= threshold
    boxes[background] = -1
    classes[background] = 0
    return classes, boxes

# Overlap of every default box of the model with the ground truth
# boxes of the sample picked above.
iou = math_utils.intersection_over_union(
    model.default_boxes,
    bndbox,
)

# per default box: matched class index (0 is background) and matched
# ground truth box index (-1 is no match)
classes, boxes = classes_and_boxes_truth(iou, clslist)


In [None]:
# Inspect the class targets and try the class loss on them.
print(iou.shape)
# NOTE(review): the value of this expression is discarded, as it is
# neither printed nor the last expression of the cell
(iou > 0.5).sum()

# class predictions of all feature map layers stacked into one tensor
class_preds = torch.vstack(res["class_preds"])

print(class_preds.shape)

# how many default boxes were assigned to each class
print(classes.shape)
print(classes.sum())
print((classes == 0).sum())
print((classes == 1).sum())
print((classes == 2).sum())

# the model outputs log-probabilities, which is what nll_loss expects
torch.nn.functional.nll_loss(class_preds, classes, reduction = "sum")

In [None]:


# matching predicted boxes to ground truth.
# - Only calculate for boxes which have a matching ground truth?
# - Only calculate for boxes which have a predicted class other than background?
#   - Or should box predictions and class predictions be considered
#   independent of each other? Is the box prediction layer expected
#   to learn to match the ground truth box independent of what the
#   target class actually is? Of course, the two are modeled as
#   separate, so independence should be assumed, I guess.

def calculate_box_loss(
    default_boxes: torch.Tensor,
    predicted_offsets: torch.Tensor,
    ground_truth_boxes: torch.Tensor,
    ground_truth_overlap_index: torch.Tensor
):
    '''
    Calculate a Smooth L1 Loss between predicted object boxes
    and actual ones.

    `default_boxes` (xmin, ymin, xmax, ymax) will
    be offset by `predicted_offsets` (x,y,w,h): x/y is used
    to move the entire box along an axis, allowing any real value;
    w/h >= 0 is used to scale the box, keeping the xmin/ymin stationary
    while increasing/decreasing the distance of xmax/ymax from the
    former. The result of the offset is compared against the ground
    truth box in `ground_truth_boxes` which the default box in question
    matched against (see `classes_and_boxes_truth()`) based on
    `ground_truth_overlap_index`.

    Only default boxes with a match (index other than -1) contribute.
    Returns the summed loss as a scalar tensor, which is zero when
    nothing matched.
    '''

    # calculate only for values which had a match
    match = ground_truth_overlap_index != -1

    # the default boxes are constants, no gradient flows into them
    matched_defaults = default_boxes[match,:].detach()
    matched_offsets = predicted_offsets[match]

    # move boxes: both corners are shifted by (x, y)
    min_corner = matched_defaults[:, 0:2] + matched_offsets[:, 0:2]

    # scale boxes: the width/height of the box is multiplied by (w, h)
    # and measured from the moved min corner. Multiplying xmax/ymax
    # directly would scale their distance from the origin instead.
    extent = (
        (matched_defaults[:, 2:4] - matched_defaults[:, 0:2])
        *matched_offsets[:, 2:4]
    )

    # built out of place so that autograd never sees a tensor being
    # overwritten
    predicted = torch.cat([min_corner, min_corner + extent], dim = 1)

    # match the ground truths to the default boxes based on
    # overlap
    indices = ground_truth_overlap_index[match]
    actual = ground_truth_boxes[indices,:]

    return torch.nn.functional.smooth_l1_loss(
        input = predicted,
        target = actual,
        reduction = "sum"
    )


# Try the box loss on the predictions for the dummy image, using the
# ground truth matching computed above.
calculate_box_loss(
    model.default_boxes,
    box_preds,
    bndbox,
    boxes
)

In [None]:

def calculate_losses(
    default_boxes,
    predicted_offsets,
    predicted_classes,
    ground_truth_boxes,
    matched_classes,
    ground_truth_overlap_index
):
    '''
    Calculate both parts of the detection loss.

    Returns a dict with
    - `l1`: the box loss from `calculate_box_loss`.
    - `nll`: the summed negative log likelihood of `matched_classes`
    under `predicted_classes`.
    '''

    box_loss = calculate_box_loss(
        default_boxes = default_boxes,
        predicted_offsets = predicted_offsets,
        ground_truth_boxes = ground_truth_boxes,
        ground_truth_overlap_index = ground_truth_overlap_index
    )

    class_loss = torch.nn.functional.nll_loss(
        input = predicted_classes,
        target = matched_classes,
        reduction = "sum"
    )

    return {"l1": box_loss, "nll": class_loss}

# Both losses for the dummy prediction against the sample's ground truth.
calculate_losses(model.default_boxes, box_preds, class_preds, bndbox, classes, boxes)


In [None]:

def calculate_loss(
        model: SSD,
        prediction,
        ground_truth_boxes:torch.Tensor,
        ground_truth_classes: torch.Tensor,
        weight: float = 1.0,
        iou = None
):
    '''
    Total loss for `prediction`, the output of a forward pass of `model`.

    `ground_truth_boxes` is (N,4) and holds the bounding boxes
    (xmin, ymin, xmax, ymax) of the objects in the input image;
    `ground_truth_classes` is (N) and holds the true class of each of
    those boxes.

    The result is l1 + weight*nll, where l1 is the loss of the
    predicted boxes and nll the loss of the predicted classes.

    `iou` may be passed in when it is already available from

    > iou = src.utils.math.intersection_over_union(model.default_boxes, target_bndbox)

    and is computed that way otherwise.
    '''

    overlaps = iou
    if overlaps is None:
        overlaps = math_utils.intersection_over_union(
            model.default_boxes,
            ground_truth_boxes
        )

    # targets per default box: class index and ground truth box index
    target_classes, target_boxes = classes_and_boxes_truth(
        overlaps,
        ground_truth_classes
    )

    # predictions of all feature map layers stacked into one tensor each
    stacked_classes = torch.vstack(prediction["class_preds"])
    stacked_offsets = torch.vstack(prediction["box_preds"])

    parts = calculate_losses(
        model.default_boxes,
        stacked_offsets,
        stacked_classes,
        ground_truth_boxes,
        target_classes,
        target_boxes
    )

    return parts["l1"] + weight*parts["nll"]


    
# Total loss of the dummy prediction against the sample's ground truth.
calculate_loss(model, res, bndbox, clslist)
