# Dataset
I am using the following dataset publicly available on [Kaggle](https://www.kaggle.com/datasets/mbkinaci/image-localization-dataset?select=training_images)

To run the notebook, please dowload the dataset and save it in the same directory as the notebook.

In [1]:
import os, sys
from pathlib import Path

try:
    home = os.path.dirname(os.path.realpath(__file__))
except Exception:
    home = os.getcwd()
    
current = home

while 'tinyBackProp' not in os.listdir(current):
    current = Path(current).parent

sys.path.append(str(current))
sys.path.append(os.path.join(str(current)))

In [2]:
data_folder = os.path.join(home, 'training_images')
if not os.path.isdir(data_folder):
    raise ValueError(f"PLEASE MAKE SURE THE DATA IS DOWNLOADED AND UNZIPPED IN THE SAME DIRECTORY AS THE NOTEBOOK")

In [3]:
import shutil
if os.path.exists(os.path.join(data_folder, 'training_images')):
    shutil.rmtree(os.path.join(data_folder, 'training_images'))

Let's prepare the data


In [4]:
import xml.dom.minidom

def get_annotations(xml_file):
    xml_doc = xml.dom.minidom.parse(xml_file)
    bbox = xml_doc.getElementsByTagName('bndbox')[0]
    anns = []
    for c in bbox.childNodes:
        data = c.childNodes
        for d in data:
            if len(d) > 0:
                anns.append(d.nodeValue)

    return anns

In [5]:
# let's display some images
import cv2 as cv
import torch
from typing import Union
import numpy as np
import json
def prepare_files(folder):
    initial_data = os.listdir(folder)
    for f in initial_data:
        if f.startswith('eggplant') and f.endswith('xml'):
            os.remove(os.path.join(folder, f)) 
        
        elif f.endswith('xml'):
            d = get_annotations(os.path.join(folder, f))
            # save the data as json
            x0, y0, x1, y1 = float(d[0]) / 227, float(d[1]) / 227, float(d[2]) / 227, float(d[3]) / 227
            data = {"x0": x0, "y0": y0, "x1": x1, "y1": y1}
            data_file_name = os.path.splitext(f)[0] + ".json" 
            with open(os.path.join(folder, data_file_name), 'w') as fp:
                json.dump(data, fp)
            os.remove(os.path.join(folder, f))
        else:
            image = cv.imread(os.path.join(folder, f))
            if image.shape != (56, 56, 3):
                image = cv.resize(image, (56, 56))
            # save the resized image
            cv.imwrite(os.path.join(folder, f), image)

# prepare_files(data_folder)

In [6]:
for index, f in enumerate(os.listdir(data_folder)):
    if f.endswith('.jpg') and not f.startswith('eggplant'):
        
        anns_file_name = os.path.splitext(f)[0] + ".json" 

        with open(os.path.join(data_folder, anns_file_name), 'r') as fp:
            d = json.load(fp)
            x0, y0, x1, y1 = float(d['x0']), float(d['y0']), float(d['x1']), float(d['y1']) 

            image = cv.imread(os.path.join(data_folder, f))

            x0 = int(x0 * image.shape[1]) 
            y0 = int(y0 * image.shape[0]) 

            x1 = int(x1 * image.shape[1]) 
            y1 = int(y1 * image.shape[0]) 

            cv.rectangle(image, (x0, y0), (x1, y1), (0, 255, 0), 2)
            cv.imshow('image', image)
            cv.waitKey(0)
            cv.destroyWindow(winname='image')    
    if index >= 10:
        break

qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/ayhem18/DEV/My_Kaggle_Repo/kaggle_env/lib/python3.11/site-packages/cv2/qt/plugins"


In [7]:
from sklearn.model_selection import train_test_split
def prepare_data(folder):
    data, labels, boxes = [], [], []
    for f in os.listdir(folder):
        if f.endswith('jpg'):
            # read the image
            im = np.moveaxis(cv.imread(os.path.join(folder, f)), source=2, destination=0) 
            data.append(im)

            if f.startswith('eggplant'):
                boxes.append([-1, -1, -1, -1])    
                labels.append(0)
            else:
                anns_file_name = os.path.splitext(f)[0] + ".json" 
                with open(os.path.join(data_folder, anns_file_name), 'r') as fp:
                    d = json.load(fp)
                    x0, y0, x1, y1 = float(d['x0']), float(d['y0']), float(d['x1']), float(d['y1']) 
                    boxes.append([x0, y0, x1, y1])
                    labels.append((1 if f.startswith('cucumber') else 2))

    return np.asarray(data), np.asarray(labels), np.asarray(boxes)

data, labels, boxes = prepare_data(data_folder)
data = data / 255.0
train_x, test_x, train_y, test_y, train_b, test_b = train_test_split(data, labels, boxes, test_size=0.05, random_state=69, stratify=labels)

In [8]:
print(train_x.shape, train_y.shape, train_b.shape)
print(test_x.shape, test_y.shape, test_b.shape)

(176, 3, 56, 56) (176,) (176, 4)
(10, 3, 56, 56) (10,) (10, 4)


In [9]:
np.unique(train_y, return_counts=True)

(array([0, 1, 2]), array([59, 59, 58]))

# The network

In [10]:
import numpy as np
from torch import nn
from tinyBackProp.network import Network
from tinyBackProp.linear_layer import LinearLayer
from tinyBackProp.conv_layer import ConvLayer
from tinyBackProp.flatten import FlattenLayer
from tinyBackProp.activation_layers import SigmoidLayer, SoftmaxLayer, ReLULayer
from tinyBackProp.losses import MSELoss, CrossEntropyLoss
import torch
torch.manual_seed(69)
np.random.seed(69)

In [11]:
def build_model(cw1: np.ndarray = None
                          , cw2: np.ndarray = None
                          , lw1: np.ndarray = None
                          , lw2: np.ndarray = None
                          , lw3: np.ndarray = None) -> Network:
    
    if cw1 is None:
        t_c1 = nn.Conv2d(in_channels=3, out_channels=5, kernel_size=(7, 7), padding='valid', bias=False) 
        cw1 = t_c1.weight.cpu().detach().numpy()
    
    if cw2 is None:
        t_c2 = nn.Conv2d(in_channels=5, out_channels=10, kernel_size=(7, 7), padding='valid', bias=False)
        cw2 = t_c2.weight.cpu().detach().numpy()

    if lw1 is None:
        t_l1 = nn.Linear(in_features=10 * 44 * 44, out_features=20, bias=False)
        lw1 = t_l1.weight.detach().numpy()

    if lw2 is None:
        t_l2 = nn.Linear(in_features=20 , out_features=3, bias=False)
        lw2 = t_l2.weight.detach().numpy()

    if lw3 is None:
        t_l2 = nn.Linear(in_features=20 , out_features=4, bias=False)
        lw3 = t_l2.weight.detach().numpy()


    conv1 = ConvLayer(in_channels=3, out_channels=5, kernel_size=(7, 7), padding=None, weight_matrix=cw1) 
    conv2 = ConvLayer(in_channels=5, out_channels=10, kernel_size=(7, 7), padding=None, weight_matrix=cw2)
    linear1 = LinearLayer(in_features=10 * 44 * 44, out_features=20, weight_matrix=lw1)
    # linear2 = LinearLayer(in_features=20, out_features=4, weight_matrix=lw2)

    flatten = FlattenLayer()
    relu1, relu2 = ReLULayer(), ReLULayer()
    
    common_part = Network(layers=[conv1, relu1, conv2, relu2, flatten, linear1])

    labels_layer = LinearLayer(in_features=20, out_features=3, weight_matrix=lw2)
    boxes_layer = LinearLayer(in_features=20, out_features=4, weight_matrix=lw3)

    return common_part, labels_layer, boxes_layer

In [12]:
import math

def train(common_part: Network, 
        label_layer: LinearLayer,
        boxes_layer: LinearLayer, 
        data: np.ndarray, 
        labels: np.ndarray,
        boxes: np.ndarray,
        num_epochs: int = 2, 
        batch_size: int = 10, 
        learning_rate: float = 0.1) -> Network:

    ilr = learning_rate
    num_batches = int(math.ceil(len(data) / batch_size))

    # define the loss function
    mse =  MSELoss(reduction='mean')
    cel = CrossEntropyLoss(num_classes=3)

    sigmoid = SigmoidLayer()
    softmax = SoftmaxLayer()

    for e in range(1, num_epochs + 1):
        epoch_loss = 0
        
        np.random.shuffle(data)
        np.random.shuffle(labels)
        np.random.shuffle(boxes)
        
        for i in range(num_batches):
            # print(f"epoch: {e}, batch: {i}...")

            x = data[i * batch_size: (i + 1) * batch_size]
            y = labels[i * batch_size: (i + 1) * batch_size]
            b = boxes[i * batch_size: (i + 1) * batch_size]

            # forward pass
            features = common_part.forward(x)
            y_pred = softmax(label_layer(features))
            b_pred = sigmoid(boxes_layer(features))

            label_loss = cel(y_pred=y_pred, y_true=y)            
            b_loss = mse(y_pred=b_pred * (np.expand_dims(y, axis=1) != 0), 
                         y_true=b * (np.expand_dims(y, axis=1) != 0))

            # print(f"batch: {i}, classification: {round(label_loss, 4)}, localization: {round(b_loss, 4)}")

            epoch_loss += label_loss + b_loss

            # backward pass
            label_grad = cel.grad(y_pred=y_pred, y_true=y, reduction='mean')
            label_grad = softmax.grad(upstream_grad=label_grad)

            b_grad = mse.grad(b_pred, b)
            # select only gradients associated with non zero labels
            b_grad = b_grad * (np.expand_dims(y, axis=1) != 0)
            b_grad = sigmoid.grad(upstream_grad=b_grad)


            # label layer backprop
            g_param = label_layer.param_grad(upstream_grad=label_grad)
            g_x_label = label_layer.grad(upstream_grad=label_grad)
            # update the weights of the label_layers
            label_layer.update(g_param, learning_rate=learning_rate)


            # do the same for box_layer
            g_param = boxes_layer.param_grad(upstream_grad=b_grad)
            g_x_box = boxes_layer.grad(upstream_grad=b_grad)
            boxes_layer.update(g_param, learning_rate=learning_rate)


            # the final gradient to be passed to the common part is the sum of both layers
            upstream_grad = g_x_label + g_x_box
            common_part.backward(upstream_grad, learning_rate=ilr)
                    
        print(f"epoch: {e}: loss: {round(epoch_loss / num_batches, 4)}")

    return common_part, label_layer, boxes_layer


In [14]:
common_part, label_layer, box_layer = build_model()
c, l, b = train(common_part=common_part, 
                label_layer=label_layer, 
                boxes_layer=box_layer, 
                data=train_x,
                labels=train_y, 
                boxes=train_b,
                num_epochs=10)

epoch: 1: loss: 1.6649
epoch: 2: loss: 1.6198
epoch: 3: loss: 1.6132
epoch: 4: loss: 1.615
epoch: 5: loss: 1.5914
epoch: 6: loss: 1.6821
epoch: 7: loss: 1.6163
epoch: 8: loss: 1.6928
epoch: 9: loss: 1.6322
epoch: 10: loss: 1.7105


In [15]:
def predict(common_part, label_layer, box_layer):
    features = common_part.forward(test_x)
    classes = np.argmax(SoftmaxLayer()(label_layer.forward(features)), axis=1)
    boxes = SigmoidLayer()(box_layer.forward(features))

    for im, c, b in zip(test_x, classes, boxes):
        im = np.moveaxis(im, source=0, destination=2) * 255
        im = im.astype(np.uint8).copy()
        # print(im.shape)
        # cv.imshow(f'{cls}', im)
        x0, y0, x1, y1 = b 
        
        x0 = int(x0 * im.shape[1]) 
        y0 = int(y0 * im.shape[0]) 

        x1 = int(x1 * im.shape[1]) 
        y1 = int(y1 * im.shape[0])     
        
        cv.rectangle(im, (x0, y0), (x1, y1), (0, 255, 0), 2)
        cv.imshow(f'{c}', im)
        cv.waitKey(0)
        cv.destroyAllWindows()   

In [18]:
predict(c, l, b)