In [46]:
# !pip install pytorch-lightning

In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import pytorch_lightning as pl

In [53]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MLP(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        
        self.linear1 = nn.Linear(d_model, d_model//4)
        self.layer_norm1 = nn.LayerNorm(d_model//4)
        self.linear2 = nn.Linear(d_model//4,  d_model//16)
        self.layer_norm2 = nn.LayerNorm(d_model//16)
        self.linear3 = nn.Linear(d_model//16, d_model//32)
        self.layer_norm3 = nn.LayerNorm(d_model//32)
        self.linear4 = nn.Linear(d_model//32, 4)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        # x = x.to(device)
        x = x.cuda()

        x = self.layer_norm1(self.dropout(self.linear1(x)))
        x = self.layer_norm2(self.dropout(self.linear2(x)))
        x = self.layer_norm3(self.dropout(self.linear3(x)))
        x = self.linear4(x)

        return x


class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()
        
        # embedding matching
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)

        # feedforward
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)


        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # activation
        self.activation = nn.GELU()
        
    def forward(self, text_emb, box_emb):

        # https://arxiv.org/pdf/2002.04745.pdf
        # here we propose the original transformer encoder layer
        # however, we designed the architecture in this way
        # as the authors of the paper did to improve the convergence

        # text_emb = text_emb.to(device)
        # box_emb = box_emb.to(device)

        text_emb = text_emb.cuda()
        box_emb = box_emb.cuda()


        # # Add & Norm
        # text_emb = text_emb + self.dropout1(text_emb)
        # text_emb = self.norm1(text_emb)

        # box_emb = box_emb + self.dropout1(box_emb)
        # box_emb = self.norm1(box_emb)

        # print(text_emb.shape, box_emb.shape)

        # embedding matching
        x , _ = self.self_attn(box_emb, text_emb, box_emb)

        # print(x.shape, box_emb.shape)
        
        # Add & Norm
        x = box_emb + self.dropout1(x)
        x = self.norm1(x)

        # feedforward
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))

        x = box_emb + self.dropout2(x)
        x = self.norm2(x)

        return x




class TransformerEncoder(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super().__init__()
        
        # self.encoder_block = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        # self.transformer_encoder = nn.TransformerEncoder(self.encoder_block, num_layers)

        self.transformer_encoder = [TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout).to(device).type(torch.float32) for _ in range(num_layers)]


    def forward(self, text_emb, box_emb):

        # # matching between the text and the first box
        # x0 = self.transformer_encoder(text_emb, box_emb[:,0,:].unsqueeze(1))
        # # matching between the text and the second box
        # x1 = self.transformer_encoder(text_emb, box_emb[:,1,:].unsqueeze(1))
        
        # # concatenate the two boxes
        # # shape: (batch_size, 2, d_model)
        # x = torch.cat([x0, x1], axis=1)


        # text_emb = text_emb.to(device)
        # box_emb = box_emb.to(device)

        text_emb = text_emb.cuda()
        box_emb = box_emb.cuda()
        
        x0 = text_emb.to(device)
        x1 = text_emb.to(device)

        for layer in self.transformer_encoder:
            # matching between the text and the first box
            x0 = layer(x0, box_emb[:,0,:].unsqueeze(1))
            # matching between the text and the second box
            x1 = layer(x1, box_emb[:,1,:].unsqueeze(1))

        # concatenate the two boxes
        # shape: (batch_size, 2, d_model)
        x = torch.cat([x0, x1], axis=1)

        return x
    


class BoxRegressor(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super().__init__()
        
        self.transformer_encoder = TransformerEncoder(d_model, nhead, num_layers, dim_feedforward, dropout).to(device).type(torch.float32)
        self.mlp_regressor = MLP(1034, dropout).to(device).type(torch.float32)
        self.flatten = nn.Flatten(start_dim=1).to(device)


    def forward(self, text_encoding, box_encoding, box_coords):

        # text_encoding = text_encoding.to(device)
        # box_encoding = box_encoding.to(device)
        # box_coords = box_coords.to(device)

        text_encoding = text_encoding.cuda()
        box_encoding = box_encoding.cuda()
        box_coords = box_coords.cuda()

        # compute the similarity matrix between the text and the boxes encoding
        similarity_matrix = torch.bmm(text_encoding.permute(0, 2, 1), box_encoding)

        # get the index of the top two boxes with the highest score
        top2_indices = torch.topk(similarity_matrix, k=2, dim=-1).indices.squeeze(1)
        top2 = torch.topk(similarity_matrix, k=2, dim=-1).indices.squeeze(1)


        # permute the dimensions to get the top two boxes
        box_encoding = box_encoding.permute(0, 2, 1)
        # get the top two boxes
        top2_boxes = box_encoding[torch.arange(box_encoding.shape[0]).unsqueeze(1), top2_indices]
        # print(top2_boxes.shape)

        # get the top two boxes coordinates
        top2_boxes_coords = box_coords[torch.arange(box_encoding.shape[0]).unsqueeze(1), top2_indices]
        # print(top2_boxes_coords.shape)

        top2_boxes = top2_boxes.to(device)

        # compute the matching between the text and the top two boxes
        out_matching = self.transformer_encoder(text_encoding.permute(0, 2, 1), top2_boxes)

        # concatenate the matching score with the top two boxes coordinates
        matching_score = torch.cat([top2.unsqueeze(2), top2_boxes_coords, out_matching], axis=-1)

        return self.mlp_regressor(self.flatten(matching_score))
    


# def SUM_MSE_loss(pred, target):
#     return (pred - target).pow(2).sum(axis=-1).mean()

from torch.nn import HuberLoss
from torch.nn import MSELoss

# class Net(pl.LightningModule):
#     def __init__(self, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
#         super().__init__()
        
#         self.box_regressor = BoxRegressor(d_model, nhead, num_layers, dim_feedforward, dropout).to(device)
#         self.loss = MSELoss()

#     def forward(self, text_encoding, box_encoding, box_coords):
#         return self.box_regressor(text_encoding, box_encoding, box_coords)

#     def training_step(self, batch, batch_idx):
#         # training_step defined the train loop.
#         # It is independent of forward
#         text_encoding, box_encoding, box_coords, labels = batch

#         # text_encoding = text_encoding.to(device)
#         # box_encoding = box_encoding.to(device)
#         # box_coords = box_coords.to(device)
#         # labels = labels.to(device)

#         text_encoding = text_encoding.cuda()
#         box_encoding = box_encoding.cuda()
#         box_coords = box_coords.cuda()
#         labels = labels.cuda()

#         self = self.to(device)

#         out = self(text_encoding, box_encoding, box_coords)

#         # print(F.mse_loss(1/(self.forward(text_encoding, box_encoding, box_coords).squeeze(1)+1),  1/(labels+1)))

#         # out = out.squeeze(1)
#         # loss = F.mse_loss(1/(out+1), 1/(labels+1))
#         loss = self.loss(out, labels)/labels.shape[0]
#         # loss = SUM_MSE_loss(out, labels)

#         print(loss)

#         # Logging to TensorBoard by default
#         self.log('train_loss', loss)

#         return loss

#     def validation_step(self, batch, batch_idx):

#         text_encoding, box_encoding, box_coords, labels = batch

#         # text_encoding = text_encoding.to(device)
#         # box_encoding = box_encoding.to(device)
#         # box_coords = box_coords.to(device)
#         # labels = labels.to(device)

#         text_encoding = text_encoding.cuda()
#         box_encoding = box_encoding.cuda()
#         box_coords = box_coords.cuda()
#         labels = labels.cuda()

#         out = self(text_encoding, box_encoding, box_coords)

#         print(out.shape)

#         # out = out.squeeze(1)
#         # loss = F.mse_loss(1/(out+1), 1/(labels+1))
#         loss = self.loss(out, labels)/labels.shape[0]
#         # loss = SUM_MSE_loss(out, labels)

#         # Logging to TensorBoard by default
#         self.log('val_loss', loss)

#         return loss

#     def configure_optimizers(self):
#         optimizer = torch.optim.AdamW(self.parameters(), lr=5e-4)
#         scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
#         return [optimizer], [scheduler]


class Net(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super().__init__()
        
        self.box_regressor = BoxRegressor(d_model, nhead, num_layers, dim_feedforward, dropout).to(device)
        self.loss = MSELoss()

    def forward(self, text_encoding, box_encoding, box_coords):
        return self.box_regressor(text_encoding, box_encoding, box_coords)


def training(model, train_loader, val_loader, optimizer, criterion = MSELoss(), device = 'cuda', epochs = 10):

    sample = 0.0
    cum_loss = 0.0

    for e in range(epochs):

        model.train()
        
        for batch_idx, (text_encoding, box_encoding, box_coords, labels) in enumerate(train_loader):

            text_encoding = text_encoding.cuda()
            box_encoding = box_encoding.cuda()
            box_coords = box_coords.cuda()

            labels = labels.cuda()

            output = model(text_encoding, box_encoding, box_coords)

            output = output.squeeze(1)
            labels = labels.squeeze(1)

            # print(output - labels)
            # print(output)
            # print(output.shape, labels.shape)

            loss = criterion(output, 1/labels)
            loss.backward()
            optimizer.step()
            
            optimizer.zero_grad()

            sample += len(text_encoding)
            cum_loss += loss.item()

        test_fn(model, val_loader, MSELoss(), device)

        print(f'Train Epoch: {e} Loss: {cum_loss/sample}')    


def test_fn(model, test_loader, criterion = MSELoss(), device = 'cuda'):

    sample = 0.0
    cum_loss = 0.0

    model.eval()

    
    with torch.no_grad():
        for batch_idx, (text_encoding, box_encoding, box_coords, labels) in enumerate(test_loader):

            text_encoding = text_encoding.cuda()
            box_encoding = box_encoding.cuda()
            box_coords = box_coords.cuda()

            labels = labels.cuda()

            output = model(text_encoding, box_encoding, box_coords)

            output = output.squeeze(1)
            labels = labels.squeeze(1)

            loss = criterion(output, 1/labels)

            print(loss)
            
            sample += len(text_encoding)
            cum_loss += loss.item()

        print(f'Test Loss: {cum_loss/sample}')      
        

import torch
torch.manual_seed(42)

print(device)

# text_encoding = torch.randn(10000, 512, 1).to(device)
# box_encoding = torch.randn(10000, 512, 10).to(device)
# box_coords = torch.randn(10000, 10, 4).to(device)
# target_boxes = torch.randn(10000, 1, 4).to(device)


# # get dataset
# from torch.utils.data import DataLoader, TensorDataset

# batch_size = 256

# dataset = TensorDataset(text_encoding, box_encoding, box_coords, target_boxes)
# train_loader = DataLoader(dataset, batch_size=100, shuffle=True)


# # init model
# model = Net(512, 8, 2, 2048, 0.1).cuda()

# print(model)
# print('number of parameter: ',sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000.0, 'M')

# # most basic trainer, uses good defaults
# trainer = pl.Trainer(accelerator='auto', max_epochs=10)

# # train the model
# trainer.fit(model, train_loader)


cuda


In [54]:
# init model
model = Net(512, 1, 1, 10, 0.1).type(torch.float32).cuda()

print(model)
print('number of parameter: ',sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000.0, 'M')


del dataset_test, dataset_train, dataset_val

dataset_test = TensorDataset(text_encoding_test.type(torch.float32), box_encoding_test.type(torch.float32), box_coords_test.type(torch.float32), target_boxes_test.type(torch.float32))
dataset_val = TensorDataset(text_encoding_val.type(torch.float32), box_encoding_val.type(torch.float32), box_coords_val.type(torch.float32), target_boxes_val.type(torch.float32))
# dataset_train = TensorDataset(text_encoding_train, box_encoding_train, box_coords_train, target_boxes_train)


test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)


training(model, test_loader, val_loader, torch.optim.AdamW(model.parameters(), lr=5e-4), MSELoss(), device, 5)



Net(
  (box_regressor): BoxRegressor(
    (transformer_encoder): TransformerEncoder()
    (mlp_regressor): MLP(
      (linear1): Linear(in_features=1034, out_features=258, bias=True)
      (layer_norm1): LayerNorm((258,), eps=1e-05, elementwise_affine=True)
      (linear2): Linear(in_features=258, out_features=64, bias=True)
      (layer_norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (linear3): Linear(in_features=64, out_features=32, bias=True)
      (layer_norm3): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (linear4): Linear(in_features=32, out_features=4, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (loss): MSELoss()
)
number of parameter:  0.286526 M
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0')
tensor(nan, device='cuda:0'

In [None]:
# load the dataset

import pickle

# load test dataset
file_name = './data/yolo_v8x/yolo_v8x_1_dictionary_full_test.p'
with open(file_name, 'rb') as f:
    data_test = pickle.load(f)

# load val dataset
file_name = './data/yolo_v8x/yolo_v8x_1_dictionary_full_val.p'
with open(file_name, 'rb') as f:
    data_val = pickle.load(f)

# load train dataset
file_name = './data/yolo_v8x/yolo_v8x_1_dictionary_full_train.p'
with open(file_name, 'rb') as f:
    data_train = pickle.load(f)



# text_encoding = torch.randn(10000, 512, 1).to(device)
# box_encoding = torch.randn(10000, 512, 10).to(device)
# box_coords = torch.randn(10000, 10, 4).to(device)
# target_boxes = torch.randn(10000, 1, 4).to(device)



In [None]:
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

def get_data(full_data):

    text_encoding, box_encoding, box_coords, target_boxes = [], [], [], []

    for idx in tqdm(list(full_data)):
        # for _ in range(data['image_emb'].shape[0]):
        for idx_text in range(full_data[idx]['text_emb'].shape[0]):
            
            # number of available crops
            number_of_crop = min(full_data[idx]['image_emb'].shape[0], len(full_data[idx]['df_boxes']))

            if number_of_crop == 0:
                break

            # shape: (number of samples, 512, 1)
            text_encoding.append(full_data[idx]['text_emb'][idx_text].unsqueeze(1))

            number_of_crop = min(full_data[idx]['image_emb'].shape[0], len(full_data[idx]['df_boxes']))

            # shape: (number of samples, 512, number of crop embeddings)
            box_encoding.append(full_data[idx]['image_emb'][:number_of_crop,:].permute(1, 0))

            # shape: (number of samples, number of boxes, 4)
            box_coords.append(torch.stack([torch.tensor(full_data[idx]['df_boxes'].iloc[i][:4]).type(torch.float16) 
                                                    for i in range(number_of_crop)]))
            
            # shape: (number of samples, 1, 4)
            target_boxes.append(torch.tensor(full_data[idx]['bbox_target']).type(torch.float16).unsqueeze(0))

    return torch.stack(text_encoding), torch.stack([torch.nn.functional.pad(b.permute(1, 0), (0, 0, 0, 48 - b.shape[1])).permute(1, 0) for b in box_encoding]), torch.stack([torch.nn.functional.pad(b, (0, 0, 0, 48 - b.shape[0])) for b in box_coords]), torch.stack(target_boxes)



text_encoding_test, box_encoding_test, box_coords_test, target_boxes_test = get_data(data_test)
text_encoding_val, box_encoding_val, box_coords_val, target_boxes_val = get_data(data_val)
text_encoding_train, box_encoding_train, box_coords_train, target_boxes_train = get_data(data_train)



100%|██████████| 5023/5023 [00:10<00:00, 467.57it/s]
100%|██████████| 2573/2573 [00:05<00:00, 478.34it/s]
100%|██████████| 42226/42226 [01:24<00:00, 502.26it/s]


In [None]:
def box_norm_rescale(box_target):
    """ Rescale the box_target 
    Args:
        box_target: (number of samples, 1, 4)

    Returns:
        box_target: (number of samples, 1, 4)

    """
    # convert the box_pred to x1, y1, x2, y2
    box_target[:, 0, 2] = box_target[:, 0, 0] + box_target[:, 0, 2]
    box_target[:, 0, 3] = box_target[:, 0, 1] + box_target[:, 0, 3]

    return box_target

# box rescaling
target_boxes_test = box_norm_rescale(target_boxes_test)
target_boxes_val = box_norm_rescale(target_boxes_val)
target_boxes_train = box_norm_rescale(target_boxes_train)


In [None]:
# dataloaders

from torch.utils.data import DataLoader, TensorDataset

# text_encoding_test = text_encoding_test.cpu().type(torch.float32)
# box_encoding_test = box_encoding_test.cpu().type(torch.float32)
# box_coords_test = box_coords_test.cpu().type(torch.float32)

# text_encoding_val = text_encoding_val.cpu().type(torch.float32)
# box_encoding_val = box_encoding_val.cpu().type(torch.float32)
# box_coords_val = box_coords_val.cpu().type(torch.float32)

# text_encoding_train = text_encoding_train.cpu().type(torch.float32)
# box_encoding_train = box_encoding_train.cpu().type(torch.float32)
# box_coords_train = box_coords_train.cpu().type(torch.float32)


batch_size = 256

dataset_test = TensorDataset(text_encoding_test, box_encoding_test, box_coords_test, target_boxes_test)
dataset_val = TensorDataset(text_encoding_val, box_encoding_val, box_coords_val, target_boxes_val)
dataset_train = TensorDataset(text_encoding_train, box_encoding_train, box_coords_train, target_boxes_train)

test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)


# init model
model = Net(512, 2, 1, 128, 0.1).type(torch.float16).cuda()

print(model)
print('number of parameter: ',sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000.0, 'M')

# most basic trainer, uses good defaults
from pytorch_lightning.callbacks import EarlyStopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=3, verbose=True, mode="min")


trainer = pl.Trainer(accelerator='auto', max_epochs=5)#, callbacks=[early_stop_callback])

# train the model
trainer.fit(model, train_loader, val_loader)

# test the model
# trainer.test(test_dataloaders=test_loader)

# max([b.shape[0] for b in box_coords])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type         | Params
-----------------------------------------------
0 | box_regressor | BoxRegressor | 286 K 
1 | loss          | MSELoss      | 0     
-----------------------------------------------
286 K     Trainable params
0         Non-trainable params
286 K     Total params
1.146     Total estimated model params size (MB)


Net(
  (box_regressor): BoxRegressor(
    (transformer_encoder): TransformerEncoder()
    (mlp_regressor): MLP(
      (linear1): Linear(in_features=1034, out_features=258, bias=True)
      (layer_norm1): LayerNorm((258,), eps=1e-05, elementwise_affine=True)
      (linear2): Linear(in_features=258, out_features=64, bias=True)
      (layer_norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (linear3): Linear(in_features=64, out_features=32, bias=True)
      (layer_norm3): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (linear4): Linear(in_features=32, out_features=4, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (loss): MSELoss()
)
number of parameter:  0.286526 M
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]torch.Size([256, 4])
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 61.85it/s]torch.Size([256, 4])
                                              

  rank_zero_warn(
  rank_zero_warn(
  return F.mse_loss(input, target, reduction=self.reduction)
  rank_zero_warn(


Epoch 0:   0%|          | 0/315 [00:00<?, ?it/s] tensor(inf, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   0%|          | 1/315 [00:00<00:10, 29.01it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   1%|          | 2/315 [00:00<00:10, 30.63it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   1%|          | 3/315 [00:00<00:09, 32.55it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   1%|▏         | 4/315 [00:00<00:09, 32.37it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   2%|▏         | 5/315 [00:00<00:09, 32.48it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   2%|▏         | 6/315 [00:00<00:09, 33.05it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 0:   2%|▏         | 7/315 [00:0

  return F.mse_loss(input, target, reduction=self.reduction)


torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([32, 4])
Epoch 1:   0%|          | 0/315 [00:00<?, ?it/s, v_num=53]          tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   0%|          | 1/315 [00:00<00:11, 26.99it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   1%|          | 2/315 [00:00<00:10, 29.01it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   1%|          | 3/315 [00:00<00:10, 30.38it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   1%|▏    

  return F.mse_loss(input, target, reduction=self.reduction)


tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   2%|▏         | 6/315 [00:00<00:09, 33.08it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   2%|▏         | 7/315 [00:00<00:09, 33.36it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   3%|▎         | 8/315 [00:00<00:09, 33.18it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   3%|▎         | 9/315 [00:00<00:09, 33.55it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   3%|▎         | 10/315 [00:00<00:09, 33.30it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   3%|▎         | 11/315 [00:00<00:09, 33.59it/s, v_num=53]tensor(nan, device='cuda:0', dtype=torch.float16, grad_fn=<DivBackward0>)
Epoch 1:   4%|▍         | 12/315 [00:00<00:08, 33.91it/s, v_num=53]tensor(nan, devic

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
import numpy as np
print(max([b.shape[0] for b in box_coords]), min([b.shape[0] for b in box_coords]), np.array([b.shape[0] for b in box_coords]).mean(), np.array([b.shape[0] for b in box_coords]).std())

48 1 6.852848301124163 4.310411508820321


In [None]:
# text_encoding = torch.randn(10000, 512, 1).to(device)
# box_encoding = torch.randn(10000, 512, 10).to(device)
# box_coords = torch.randn(10000, 10, 4).to(device)
# target_boxes = torch.randn(10000, 1, 4).to(device)

In [None]:
%load_ext tensorboard
%tensorboard --logdir lightning_logs/