In [86]:
from ssdmultibox.datasets import TrainPascalDataset
from torchvision import models
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import torch.nn.functional as F
from ssdmultibox.datasets import Bboxer

In [2]:
model = models.resnet18(pretrained=True)

In [3]:
NUM_FEATURES = model.fc.in_features
NUM_FEATURES

512

In [10]:
dataset = TrainPascalDataset(grid_size=4)
image_id, im, gt_bbs, gt_cats = dataset[1]
image_id, im.shape, gt_bbs.shape, gt_cats.shape

(17, (3, 224, 224), (64,), (16,))

In [16]:
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

item = next(iter(dataloader))
image_ids, ims, bbs, cats = item
image_ids.shape, ims.shape, bbs.shape, cats.shape

(torch.Size([4]),
 torch.Size([4, 3, 224, 224]),
 torch.Size([4, 64]),
 torch.Size([4, 16]))

In [17]:
320 / 64

5.0

In [36]:
model = models.resnet18(pretrained=True)

class AsIsHead(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x
    
model.fc = AsIsHead()

In [37]:
outputs = model(ims)

In [38]:
outputs.shape

torch.Size([4, 512])

# Custom Head

In [12]:
class CustomHead(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv2d_1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2d_2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2d_drop = nn.Dropout2d()
        self.linear = nn.Linear(512, 784)
        self.linear_cats = nn.Linear(320, 16*20)
        self.linear_bbs = nn.Linear(320, 16*4)

    def forward(self, outputs):
        out3 = self.linear(outputs)
        out4 = torch.reshape(out3, (4, 28, 28))
        out5 = out4.unsqueeze(1) # [4, 1, 28, 28]
        out7 = self.conv2d_1(out5) # [4, 10, 24, 24]
        out7_mp = F.relu(F.max_pool2d(out7, kernel_size=2)) # [4, 10, 12, 12]
        out8_mp = F.relu(F.max_pool2d(self.conv2d_2(out7_mp), kernel_size=2)) # [4, 20, 4, 4]
        flat = out8_mp.view(4, 320) # [4, 320]

        cats_pred = self.linear_cats(flat)
        bbs_pred = self.linear_bbs(flat)
        return [bbs_pred, cats_pred]
    
model.fc = CustomHead()
outputs = model(ims)

In [35]:
ims.shape

torch.Size([4, 3, 224, 224])

In [28]:
20 * 4 * 4

320

In [29]:
256*2

512

In [30]:
512-320

192

In [31]:
192 / 16

12.0

In [32]:
conv2d_1 = nn.Conv2d(1, 10, kernel_size=5)
conv2d_2 = nn.Conv2d(10, 20, kernel_size=5)
conv2d_drop = nn.Dropout2d()
linear = nn.Linear(512, 784)
linear_cats = nn.Linear(320, 16*20)
linear_bbs = nn.Linear(320, 16*4)

In [39]:
out3 = linear(outputs)
out4 = torch.reshape(out3, (4, 28, 28))
out5 = out4.unsqueeze(1) # [4, 1, 28, 28]
out7 = conv2d_1(out5) # [4, 10, 24, 24]
out7_mp = F.relu(F.max_pool2d(out7, kernel_size=2)) # [4, 10, 12, 12]
out8_mp = F.relu(F.max_pool2d(conv2d_2(out7_mp), kernel_size=2)) # [4, 20, 4, 4]
out8_mp.shape

torch.Size([4, 20, 4, 4])

In [40]:
out7.shape

torch.Size([4, 10, 24, 24])

In [58]:
out1 = nn.Conv2d(20, 40, kernel_size=3, padding=1)(out8_mp)
out1.shape

torch.Size([4, 40, 4, 4])

In [59]:
40*4*4

640

In [60]:
out2 = F.relu(F.max_pool2d(out1, kernel_size=2))
out2.shape

torch.Size([4, 40, 2, 2])

In [61]:
40*2*2

160

In [63]:
out3 = out2.view(4, -1)
out3.shape

torch.Size([4, 160])

In [64]:
linear128 = nn.Linear(160, 128)
linear64 = nn.Linear(128, 64)
linear16 = nn.Linear(64, 16)

In [65]:
out4 = linear128(out3)
out4.shape

torch.Size([4, 128])

In [66]:
out5 = linear64(out4)
out5.shape

torch.Size([4, 64])

In [67]:
out6 = linear16(out5)
out6.shape

torch.Size([4, 16])

In [73]:
conv2d_1 = nn.Conv2d(1, 10, kernel_size=5)
conv2d_2 = nn.Conv2d(10, 20, kernel_size=5)
conv2d_3 = nn.Conv2d(20, 40, kernel_size=3, padding=1)
conv2d_drop = nn.Dropout2d()
linear = nn.Linear(512, 784)
linear128 = nn.Linear(160, 128)
linear64 = nn.Linear(128, 64)
linear16 = nn.Linear(64, 16)

out3 = linear(outputs)
out4 = torch.reshape(out3, (4, 28, 28))
out5 = out4.unsqueeze(1) # [4, 1, 28, 28]
out7 = conv2d_1(out5) # [4, 10, 24, 24]
out7_mp = F.relu(F.max_pool2d(out7, kernel_size=2)) # [4, 10, 12, 12]
out8 = conv2d_2(out7_mp) # [4, 20, 8, 8]
out8_mp = F.relu(F.max_pool2d(out8, kernel_size=2)) # [4, 20, 4, 4]
out9 = conv2d_3(out8_mp) # [4, 40, 4, 4]
out9_mp = F.relu(F.max_pool2d(out9, kernel_size=2)) # [4, 40, 2, 2]
flat = out9_mp.view(4, -1) # [4, 160]
flat128 = linear128(flat)
flat64 = linear64(flat128)
flat16 = linear16(flat64)

In [74]:
flat64.shape

torch.Size([4, 64])

In [75]:
flat16.shape

torch.Size([4, 16])

In [71]:
out9_mp.shape

torch.Size([4, 40, 2, 2])

In [125]:
class LinearCats(nn.Module):
    """
    Takes a conv of [4, 20, 4, 4] and returns a vector of 320
    """
    def __init__(self):
        super().__init__()
        self.conv2d_3 = nn.Conv2d(20, 40, kernel_size=3, padding=1)
        self.conv2d_4 = nn.Conv2d(40, 80, 3, padding=1)

    def forward(self, x):
        x = F.relu(self.conv2d_3(x)) # [4, 40, 4, 4]
        x = self.conv2d_4(x) # [4, 80, 4, 4]
        x = F.relu(F.max_pool2d(x, 2)) # [4, 80, 2, 2] 
        return x.view(-1, 320) # [4, 320]

class LinearBbs(nn.Module):
    """
    Takes a conv of [4, 20, 4, 4] and returns a vector of 64
    """
    def __init__(self):
        super().__init__()
        self.conv2d_3 = nn.Conv2d(20, 40, kernel_size=3, padding=1)
        self.linear128 = nn.Linear(160, 128)
        self.linear64 = nn.Linear(128, 64)

    def forward(self, x):
        out9 = self.conv2d_3(x) # [4, 40, 4, 4]
        out9_mp = F.relu(F.max_pool2d(out9, kernel_size=2)) # [4, 40, 2, 2]
        flat = out9_mp.view(4, -1) # [4, 160]
        flat128 = F.relu(self.linear128(flat))
        flat64 = F.relu(self.linear64(flat128))
        return flat64
    
class CustomHeadDense(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv2d_1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2d_2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2d_3 = nn.Conv2d(20, 40, kernel_size=3, padding=1)
        self.linear = nn.Linear(512, 784)
        self.linear_cats = LinearCats()
        self.linear_bbs = LinearBbs()

    def forward(self, outputs):
        out3 = self.linear(outputs)
        out4 = torch.reshape(out3, (4, 28, 28))
        out5 = out4.unsqueeze(1) # [4, 1, 28, 28]
        out7 = self.conv2d_1(out5) # [4, 10, 24, 24]
        out7_mp = F.relu(F.max_pool2d(out7, kernel_size=2)) # [4, 10, 12, 12]
        out8 = self.conv2d_2(out7_mp) # [4, 20, 8, 8]
        out8_mp = F.relu(F.max_pool2d(out8, kernel_size=2)) # [4, 20, 4, 4]
        return self.linear_bbs(out8_mp), self.linear_cats(out8_mp)

In [126]:
model.fc = CustomHeadDense()

outputs = model(ims)

In [127]:
pred_bbs, pred_cats = outputs
pred_bbs.shape, pred_cats.shape

(torch.Size([4, 64]), torch.Size([4, 320]))

In [114]:
out2 = F.max_pool2d(pred_cats, 2)
out2.shape

torch.Size([4, 40, 2, 2])

In [115]:
40*2*2

160

In [119]:
out1 = nn.Conv2d(40, 80, 3, padding=1)(pred_cats)

out3 = F.max_pool2d(out1, 2)


torch.Size([4, 80, 4, 4])

In [120]:
out3 = F.max_pool2d(out1, 2)
out3.shape

torch.Size([4, 80, 2, 2])

In [121]:
80*2*2

320

In [111]:
20*4*4

320

In [91]:
bboxer = Bboxer(grid_size=4)
num_classes=20
one_pred_cats = torch.eye(num_classes)[pred_cats.cpu()]
one_pred_cats.shape

RuntimeError: tensors used as indices must be long or byte tensors

In [93]:
pred_cats

tensor([[0.0005, 0.0743, 0.0000, 0.0000, 0.1044, 0.0000, 0.0199, 0.0817, 0.0000,
         0.0457, 0.0000, 0.0165, 0.0000, 0.1043, 0.0000, 0.0635],
        [0.0022, 0.0713, 0.0000, 0.0000, 0.1084, 0.0000, 0.0199, 0.0892, 0.0000,
         0.0461, 0.0000, 0.0195, 0.0000, 0.1068, 0.0000, 0.0602],
        [0.0004, 0.0716, 0.0000, 0.0000, 0.0993, 0.0000, 0.0205, 0.0805, 0.0000,
         0.0489, 0.0000, 0.0195, 0.0000, 0.1094, 0.0000, 0.0645],
        [0.0038, 0.0755, 0.0000, 0.0000, 0.1017, 0.0000, 0.0219, 0.0807, 0.0000,
         0.0496, 0.0000, 0.0130, 0.0000, 0.1063, 0.0000, 0.0657]],
       grad_fn=<ReluBackward>)

In [129]:
from torch import optim

model = models.resnet18(pretrained=True)
model.fc = CustomHead()

# data
dataset = TrainPascalDataset(grid_size=4)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

# optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# process
item = next(iter(dataloader))
image_ids, ims, bbs, cats = item

# zero the parameter gradients
optimizer.zero_grad()

# forward pass
pred_bbs, pred_cats = model(ims)

In [130]:
pred_bbs.shape, pred_cats.shape

(torch.Size([4, 64]), torch.Size([4, 320]))

In [132]:
cats.shape

torch.Size([4, 16])

In [134]:
onehot_cats_bg = torch.eye(num_classes+1)[cats].shape


torch.Size([4, 16, 21])

In [None]:
from ssdmultibox.datasets import Bboxer


class BCE_Loss(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes

    def forward(self, pred, targ):
        t = one_hot_embedding(targ, self.num_classes+1)
        t = V(t[:,:-1].contiguous())#.cpu()
        x = pred[:,:-1]
        w = self.get_weight(x,t)
        return F.binary_cross_entropy_with_logits(x, t, w, size_average=False)/self.num_classes