In [1]:
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


In [8]:
target = []
a_maps = []

In [9]:
with open('../extractCOCO/data.pkl', 'rb') as f:
    data = pickle.load(f)

# with open('../extractCOCO/data_4000-8000.pkl', 'rb') as f:
#     data = pickle.load(f)

# with open('../extractCOCO/data_8000-12000.pkl', 'rb') as f:
#     data = pickle.load(f)

In [10]:

for key in data.keys():
    target.append(torch.Tensor(data[key]['boxe']).to(torch.float32))
    a_maps.append(data[key]['attn_map'])

del data

In [11]:
num_a_maps = sum([1 for i in a_maps for j in i if j is not None])
num_a_maps

7636

In [12]:
t_maps = torch.Tensor(num_a_maps, a_maps[0][0].shape[0], a_maps[0][0].shape[1])
t_targets = torch.Tensor(num_a_maps, 4)

for i, (a_map, t) in enumerate(zip(a_maps, target)):
    for a in a_map:
        t_targets[i] = t
        t_maps[i] = a



In [39]:
bach_size = 32

data = torch.utils.data.TensorDataset(t_maps, t_targets)

train, test = torch.utils.data.random_split(data, [int(0.8*len(data)), len(data)-int(0.8*len(data))])

train_loader = torch.utils.data.DataLoader(train, batch_size=bach_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test, batch_size=bach_size, shuffle=False)

In [37]:
def mse_box(output, target):
    loss = torch.sum(torch.mean((output - target)**2, dim=1))
    return loss

class TransformerEncoderLayer(nn.Module):
    def __init__(self, dim_emb, nhead: int = 2, dim_feedforward: int = 10, dropout: float =0.1, activation = nn.GELU):
        super(TransformerEncoderLayer, self).__init__()
        
        self.attn = nn.MultiheadAttention(dim_emb, nhead, dropout=dropout)
        
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(dim_emb, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, dim_emb)

        self.norm1 = nn.LayerNorm(dim_emb)
        self.norm2 = nn.LayerNorm(dim_emb)
        
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = activation()

    def forward(self, data, src_mask=None, src_key_padding_mask=None):

        # MultiHeadAttention
        x, attn = self.attn(data, data, data, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)
        
        # add & norm
        x = data + self.dropout1(x)
        x = self.norm1(x)
        
        # Implementation of Feedforward model
        x1 = self.linear2(self.dropout(self.activation(self.linear1(x))))
        
        # add & norm
        x = x + self.dropout2(x1)
        x = self.norm2(x)

        return x
    


class MLP(nn.Module):
    def __init__(self, dim_emb, dim_feedforward = 10, dropout=0.1, activation = nn.GELU):
        super(MLP, self).__init__()
        
        self.linear1 = nn.Linear(dim_emb, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, dim_emb)

        self.dropout = nn.Dropout(dropout)

        self.activation = activation()

    def forward(self, data):
        
        x = self.linear2(self.dropout(self.activation(self.linear1(data))))
        
        return x

    

class Net(nn.Module):
    def __init__(
        self, 
        dim_emb, 
        ch_out_conv1d: int = 1, 
        dim_box: int = 4, 
        nhead: int = 1, 
        dim_feedforward: int = 10, 
        dropout: float = 0.1, 
        kernel_conv1d: int = 3,
        stride_conv1d: int = 1,
        activation = nn.GELU
    ):
        super(Net, self).__init__()
        
        self.conv2d = nn.Conv2d(1, ch_out_conv1d, kernel_conv1d, stride=2)
        dim_emb = 255
        self.encoder_layer = TransformerEncoderLayer(dim_emb, nhead, dim_feedforward, dropout, activation)
        self.mlp = MLP(dim_emb, dim_feedforward, dropout, activation)
        self.conv1d = nn.Conv1d(dim_emb, ch_out_conv1d, kernel_conv1d, stride=stride_conv1d)
        self.flatten = nn.Flatten()
        # with cnn2d 253
        # without 510
        # self.linear = nn.Linear(510, dim_feedforward*2)
        # self.linear2 = nn.Linear(dim_feedforward*2, dim_box)

        self.linear = nn.Linear(253, dim_box)


        self.activation = activation()
        
    def forward(self, data):
        data = data.reshape(data.shape[0], 1, data.shape[1], data.shape[2])
        x = self.conv2d(data)
        x = x.reshape(x.shape[0], x.shape[2], x.shape[3])
        # x = data
        x = self.encoder_layer(x)
        x = self.mlp(x)
        x = self.conv1d(x)
        x = self.flatten(x)
        # print(x.shape)
        # mlp
        # x = self.linear2(self.activation(self.linear(x)))
        x = self.linear(x)
        
        return x
    
#  loss nn.CrossEntropyLoss(reduction='mean')
def training(model, train_loader, optimizer, criterion = mse_box, device = 'cuda', epochs = 10):

    sample = 0.0
    cum_loss = 0.0

    model.train()

    for e in range(epochs):

        for batch_idx, (data, target) in enumerate(train_loader):

            data, target = data.to(device), target.to(device)

            output = model(data)

            loss = criterion(output, target.to(torch.float32))
            loss.backward()
            optimizer.step()
            
            optimizer.zero_grad()

            sample += len(data)
            cum_loss += loss.item()

        print(f'Train Epoch: {e} Loss: {cum_loss/sample}')    


def test_fn(model, test_loader, criterion = nn.CrossEntropyLoss(), device = 'cuda'):

    sample = 0.0
    cum_loss = 0.0

    model.eval()

    
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):

            data, target = data.to(device), target.to(device)

            output = model(data)

            loss = criterion(output, target)

            sample += len(data)
            cum_loss += loss.item()

        print(f'Test Loss: {cum_loss/sample}')       




In [17]:
# with cnn2d
# head = 1
net = Net(512).cuda()

training(net, train_loader, optimizer = optim.AdamW(net.parameters(), lr=0.001))

Train Epoch: 0 Loss: 1574908.619781434
Train Epoch: 1 Loss: 1574248.648309594
Train Epoch: 2 Loss: 1574005.2780523493
Train Epoch: 3 Loss: 1573881.9299445911
Train Epoch: 4 Loss: 1573521.4473794613
Train Epoch: 5 Loss: 1572078.1378400596
Train Epoch: 6 Loss: 1570877.7323635863


KeyboardInterrupt: 

In [24]:
# without cnn2d
# head = 1
net = Net(512).cuda()

training(net, train_loader, optimizer = optim.AdamW(net.parameters(), lr=0.001))

Train Epoch: 0 Loss: 1574442.5332351015
Train Epoch: 1 Loss: 1574049.551567872
Train Epoch: 2 Loss: 1570313.8399989426
Train Epoch: 3 Loss: 1568347.3311237567
Train Epoch: 4 Loss: 1567160.905685167
Train Epoch: 5 Loss: 1566364.3843064425
Train Epoch: 6 Loss: 1565790.2630347202
Train Epoch: 7 Loss: 1565357.357050308
Train Epoch: 8 Loss: 1565017.6816413356
Train Epoch: 9 Loss: 1564735.8094123485


In [25]:
# without cnn2d
# increasing the number of neurons in the mlp and head = 2

net = Net(512, dim_feedforward = 100, nhead = 2).cuda()

training(net, train_loader, optimizer = optim.AdamW(net.parameters(), lr=0.001))

Train Epoch: 0 Loss: 1575357.3728614112
Train Epoch: 1 Loss: 1569566.9609323838
Train Epoch: 2 Loss: 1567202.7041158453
Train Epoch: 3 Loss: 1565983.2081504278
Train Epoch: 4 Loss: 1565178.9855691306
Train Epoch: 5 Loss: 1564572.787003213
Train Epoch: 6 Loss: 1564086.8453031445
Train Epoch: 7 Loss: 1563692.671444916
Train Epoch: 8 Loss: 1563372.216937998
Train Epoch: 9 Loss: 1563103.5590490955


In [44]:
# without cnn2d
# increasing the number of neurons in the mlp and head = 2

net = Net(512, dim_feedforward = 10, nhead = 3).cuda()

training(net, train_loader, optimizer = optim.AdamW(net.parameters(), lr=0.001))

Train Epoch: 0 Loss: 1573897.554539129
Train Epoch: 1 Loss: 1573765.478110163
Train Epoch: 2 Loss: 1573449.9140386241
Train Epoch: 3 Loss: 1573503.5356781066
Train Epoch: 4 Loss: 1573538.4815610675
Train Epoch: 5 Loss: 1573444.1662044995
Train Epoch: 6 Loss: 1573469.2474579592
Train Epoch: 7 Loss: 1572854.5262744557
Train Epoch: 8 Loss: 1571978.421947196
Train Epoch: 9 Loss: 1571429.2642144524
