In [1]:
import torch
import torch.nn as nn

In [2]:
architecture = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1)
]

In [3]:
class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        x = self.leakyrelu(self.batchnorm(self.conv(x)))
        return x

In [4]:
class Yolov1(nn.Module):
    def __init__(self, in_channels=3, **kwargs):
        super(Yolov1, self).__init__()
        self.architecture = architecture
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * split_size * split_size, 512),
            nn.Dropout(0.2),
            nn.LeakyReLU(0.1),
            nn.Linear(512, split_size*split_size*(num_classes+num_boxes*5))
        )


    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == tuple:
                layers.append(
                    CNNBlock(
                        in_channels,
                        out_channels=x[1],
                        kernel_size=x[0],
                        stride = x[2],
                        padding = x[3]
                    )
                )
                in_channels = x[1]
            elif type(x) == str:
                layers.append(
                    nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
                )
            elif type(x) == list:
                conv1 = x[0]
                conv2 = x[1]
                num_repeats = x[2]
                for _ in range(num_repeats):
                    layers += [
                        nn.Conv2d(
                            in_channels,
                            out_channels=conv1[1],
                            kernel_size=conv1[0],
                            stride=conv1[2],
                            padding=conv1[3]
                        ),
                        nn.Conv2d(
                            in_channels=conv1[1],
                            out_channels=conv2[1],
                            kernel_size=conv2[0],
                            stride=conv2[2],
                            padding=conv2[3]
                        )
                    ]
                    in_channels = conv2[1]
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.darknet(x)
        x = torch.flatten(x, start_dim=1)
        return self.fcs(x)

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
model = Yolov1(3, split_size=7, num_boxes=2, num_classes=20).to(device)

In [7]:
qq = torch.zeros((2, 3, 448, 448)).to(device)

In [8]:
model.train(False)
with torch.no_grad():
    print(model.forward(qq).shape)

torch.Size([2, 1470])


In [9]:
from IntersectionOverUnion import intersection_over_union

In [10]:
model.train(False)
with torch.no_grad():
    ex = model.forward(qq)

In [54]:
class YoloLoss(nn.Module):
    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction='sum')
        self.S = S
        self.B = B
        self.C = C
        self.lamda_noobj = 0.5
        self.lamda_coord = 5

    def forward(self, predictions, target):
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B*5)
        iou_b1 = intersection_over_union(predictions[..., 21:25], target[..., 21:25])
        iou_b2 = intersection_over_union(predictions[..., 26:30], target[..., 21:25])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
        iou_maxes, bestbox = torch.max(ious, dim=0)
        bestbox = bestbox.unsqueeze(3)
        exists_box = target[..., 20].unsqueeze(3)

        ### box coordinates
        print(bestbox.shape, predictions[..., 26:30].shape)
        box_predictions = exists_box * (
            (
                bestbox * predictions[..., 26:30] + (1-bestbox)* predictions[..., 21:25]
            )
        )

        box_targets = exists_box * target[..., 21:25]

        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4])* torch.sqrt(torch.abs(box_predictions[..., 2:4]) + 1e-6)

        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(torch.flatten(box_predictions, end_dim=-2),
                            torch.flatten(box_targets, end_dim=-2))


        ### object loss

        pred_box = bestbox * predictions[..., 25:26] + (1-bestbox) * predictions[..., 20:21]
        object_loss = self.mse(
            torch.flatten(exists_box*pred_box),
            torch.flatten(exists_box*target[..., 20:21])
        )
        ### no object loss

        no_object_loss = self.mse(
            torch.flatten((1-exists_box)*predictions[..., 20:21], start_dim=1),
            torch.flatten((1-exists_box)* target[..., 20:21], start_dim=1)
        )

        no_object_loss += self.mse(
            torch.flatten((1-exists_box)*predictions[..., 25:26], start_dim=1),
            torch.flatten((1-exists_box)* target[..., 20:21], start_dim=1)
        )


        ###class loss
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :20], end_dim=-2),
            torch.flatten(exists_box * target[..., :20], end_dim=-2)
        )


        loss = (
            self.lamda_coord*box_loss + self.lamda_noobj*no_object_loss + object_loss + class_loss
        )
        return loss

In [55]:
ex.shape

torch.Size([2, 1470])

In [56]:
ex

tensor([[-0.0439, -0.0296, -0.0421,  ...,  0.0061, -0.0049,  0.0094],
        [-0.0439, -0.0296, -0.0421,  ...,  0.0061, -0.0049,  0.0094]],
       device='cuda:0')

In [57]:
YoloLoss().forward(ex.cpu(), torch.ones(2, 7, 7, 25))

torch.Size([2, 7, 7, 1]) torch.Size([2, 7, 7, 4])


tensor(3975.4619)

In [45]:
torch.zeros(2, 7, 7, 1)* torch.ones(2,7,7,4)

tensor([[[[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [[0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],

         [

In [15]:
ex.shape

torch.Size([2, 1470])

In [73]:
torch.ones(2, 7, 7, 25).shape

torch.Size([2, 7, 7, 25])

In [54]:
ex.shape

torch.Size([2, 1470])

In [52]:
torch.ones(2, 7, 7, 25)

tensor([[[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]],

         ...,

         [[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 

In [26]:
a = torch.tensor(range(120)).reshape(4, 30)
a

tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
          28,  29],
        [ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
          44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,
          58,  59],
        [ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
          74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,
          88,  89],
        [ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
         118, 119]])

In [32]:
q = torch.cat([intersection_over_union(a[:, 21:25], a[:, 21:25]).unsqueeze(0), intersection_over_union(a[:, 21:25], a[:, 21:25]).unsqueeze(0)])

In [37]:
torch.max(q, dim=0)

torch.return_types.max(
values=tensor([1., 1., 1., 1.]),
indices=tensor([0, 0, 0, 0]))

In [44]:
torch.tensor(range(100)).reshape(4, 25)[:, 20]

tensor([20, 45, 70, 95])