In [15]:
!pip install pytorch-lightning



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import pytorch_lightning as pl
import torchmetrics
import torchvision

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
#############################################################
# with multiple textual prompts
#############################################################

############################ 1. MLP

from typing import Any, Optional


from pytorch_lightning.utilities.types import STEP_OUTPUT


class MLP(nn.Module):
    def __init__(self, out_dim=512):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, out_dim)
        self.gelu = nn.GELU()
        
    def forward(self, x):
        x = self.gelu(self.l1(x))
        x = self.gelu(self.l2(x))
        return self.l3(x)


############################ 2. BoxEncoder

class BoxEncoder(pl.LightningModule):
    def __init__(self, out_dim_box=512, latent_dim=512):
        super().__init__()
        
        self.mlp = MLP(out_dim_box)
        
        # self.linear_1 = nn.Linear(3, 768)
        # self.layernorm_1 = nn.LayerNorm(768)
        # self.linear_2 = nn.Linear(768, 256)
        # self.layernorm_2 = nn.LayerNorm(256)
        # self.weighted_sum = nn.Linear(256, 1)

        self.linear_1 = nn.Linear(3, 3)
        self.layernorm_1 = nn.LayerNorm(3)
        self.linear_2 = nn.Linear(3, 3)
        self.layernorm_2 = nn.LayerNorm(3)
        self.weighted_sum = nn.Linear(3, 1)
        
        self.gelu = nn.GELU()
        
    def forward(self, x, box, text_emb):
        
        ##################################################   BOX ENCODER BLOCK
        # x: (batch_size, 512, n_boxes)
        box_emb = self.mlp(box).permute(0, 2, 1).unsqueeze(1)
        
        # x: (batch_size, 3, 512, n_boxes)
        x = torch.cat([x, box_emb], dim=1)              # concat EMB and box
        del box_emb                                     # del box_emb
        
        ##########################################   WEIGHTED COMBINATION BLOCK
        # x: (batch_size, n_boxes, 512, 3)
        x_1 = self.linear_1(x.permute(0, 3, 2, 1))  # linear transformation
        x = self.gelu(x_1)                          # non-linearity
        x = self.layernorm_1(x_1 + x)               # add & norm
        del x_1                                     # del x_1
        x_2 = self.linear_2(x)                      # linear transformation
        x = self.gelu(x_2)                          # non-linearity
        x_2 = self.layernorm_2(x_2 + x)             # add & norm
        del x_2                                     # del x_2
        x = self.weighted_sum(x)                    # get a weight for each box


        ##########################################   CONTRASTIVE SCORE BLOCK
        # x: torch.Size([BATCH_SIZE, N_BOXES, 512, 1]) -> torch.Size([BATCH_SIZE, N_BOXES, 1, 512])
        box_encoding = x.permute(0, 1, 3, 2)
        del x

        # normalize
        box_encoding.norm(dim=-1, keepdim=True)
        text_emb.norm(dim=-1, keepdim=True)


        # Get the batched text encoding
        x = torch.stack([
                torch.stack([
                    torch.stack([
                    # (torch.Size([1, 512]) @ torch.Size([512, 1])) -> torch.Size([1, 1])
                    # [tex_i @ box_1.T, tex_i @ box_2.T,              ..., tex_i @ box_j]
                    text_emb[i][j_tex] @ box_encoding[i][j_box].T 
                            for j_box in range(box_encoding.shape[1])])                             # get the matching score for each box
                        for j_tex in range(text_emb.shape[1])])                                     # stack the matching scores in a matrix
                    for i in range(box_encoding.shape[0])]).squeeze(4).squeeze(3).softmax(dim=-1)   # torch.Size([BATCH_SIZE, N_BOXES, N_IMAGES, 1, 1]) -> torch.Size([BATCH_SIZE, N_BOXES, N_IMAGES])
            
                                                                                                    # with the softmax we are making sure that the scores for
                                                                                                    # each piece of text sum to 1
        # I should sum over the row to get the overall score for each box
        # specifically, I am summing the probabilities of each box being 
        # the target box according to the matching score of the text encoding


        x = x.sum(dim=-2)                                                                           # torch.Size([64, 10, 10]) -> torch.Size([64, 10])

        
        ''' output matrix
            x:  [tex_0 @ box_1.T, tex_0 @ box_2.T, ..., tex_0 @ box_j] 
                [tex_2 @ box_1.T, tex_2 @ box_2.T, ..., tex_2 @ box_j]
                [tex_3 @ box_1.T, tex_3 @ box_2.T, ..., tex_3 @ box_j]
                                        ...
                                        ...
                [tex_i @ box_1.T, tex_i @ box_2.T, ..., tex_i @ box_j]

            I am normalizing over the rows and summing over the columns
            to get the overall score for each box.

            x summarize the matching in between the sentence and the boxes
            x: [sum(tex_[:] @ box_1.T), sum(tex_[:] @ box_2.T), ..., sum(tex_[:] @ box_j)]
        '''

        # get the index of the box with the highest score
        x = torch.argmax(x, dim=-1)                                                                # torch.Size([BATCH_SIZE, N_BOXES]) -> torch.Size([BATCH_SIZE])

        #x = torch.tensor(x, requires_grad=True)

        # get the box with the highest score
        # box = box[torch.arange(box_encoding.shape[0]), x]
        # box = torch.tensor(box[torch.arange(box_encoding.shape[0]), x], requires_grad=True)

        box = torch.autograd.Variable(box[torch.arange(box_encoding.shape[0]), x], requires_grad=True)
    
        return box
    
    def training_step(self, batch, batch_idx):
        cat_emb_text, box, t_emb, y = batch
        # print(cat_emb_text.shape, box.shape, y.shape)
        y_hat = self(cat_emb_text, box, t_emb)
        # https://arxiv.org/pdf/2108.12627
        # loss = F.huber_loss(y_hat, y)
        # https://arxiv.org/abs/1911.08287
        # loss = torchvision.ops.distance_box_iou_loss(y_hat, y)
        # https://arxiv.org/abs/1902.09630
        # loss = torchvision.ops.generalized_box_iou_loss(y_hat, y)
        loss = F.mse_loss(y_hat, y)

        # loss = F.cross_entropy(y_hat, y)

        self.log('train_loss', loss)

        return loss
    
    def validation_step(self, batch, batch_idx):
        cat_emb_text, box, t_emb, y = batch
        # print(cat_emb_text.shape, box.shape, y.shape)
        y_hat = self(cat_emb_text, box, t_emb)
        # https://arxiv.org/pdf/2108.12627
        # loss = F.huber_loss(y_hat, y)
        # https://arxiv.org/abs/1911.08287
        # loss = torchvision.ops.distance_box_iou_loss(y_hat, y)
        # https://arxiv.org/abs/1902.09630
        # loss = torchvision.ops.generalized_box_iou_loss(y_hat, y)
        loss = F.mse_loss(y_hat, y)

        # loss = F.cross_entropy(y_hat, y)

        self.log('val_loss', loss)

        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
        return [optimizer], [scheduler]
        




In [None]:
#### junk code

# torch.manual_seed(42)

# text_encoding = torch.randn(400, 512, 1)
# box_encoding = torch.randn(400, 512, 10)



# cat_encoding_text = torch.cat([
#                         box_encoding.unsqueeze(1), 
#                         torch.stack([
#                             text_encoding.squeeze(2) for _ in range(10)], dim=1).permute(0, 2, 1).unsqueeze(1)
#                             ], dim=1)

# text_encoding = torch.randn(400, 1, 512, 1)

# # text_encoding = torch.stack([text_encoding.squeeze(2) for _ in range(10)], dim=1).permute(0, 2, 1).unsqueeze(1).permute(0, 3, 2, 1)

# box_coords = torch.randn(400, 10, 4)
# box_coords_target = box_coords[:, 0, :]

# print(cat_encoding_text.shape, box_coords.shape, box_coords_target.shape, text_encoding.shape)

# # dataloader

# from torch.utils.data import DataLoader, TensorDataset

# # get dataset
# dataset = TensorDataset(cat_encoding_text, box_coords, text_encoding.permute(0, 1, 3, 2), box_coords_target) # .permute(0, 1, 3, 2)

# # get dataloader
# dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# # split with stratified kfold
# # from sklearn.model_selection import StratifiedKFold

# # skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # kfold = []

# # for fold, (train_idx, val_idx) in enumerate(skf.split(cat_encoding_text, torch.zeros(4000), torch.zeros(4000))):
    
# #     print(f"Fold: {fold}")
    
# #     # get dataloader
# #     train_loader = DataLoader(dataset[train_idx], batch_size=64, shuffle=True)
# #     val_loader = DataLoader(dataset[val_idx], batch_size=64, shuffle=False)
    
# #     # print(train_idx, val_idx)
    
# #     # append the fold
# #     kfold.append((train_loader, val_loader))



# # train the model
# model = BoxEncoder()

# print(model)

# # print number of parameters
# print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# trainer = pl.Trainer(max_epochs=10, accelerator='auto')

# trainer.fit(model, dataloader)

#### preprocess the data

In [3]:
import pickle

# load test dataset
file_name = './data/yolo_v8x/yolo_v8x_1_dictionary_full_test.p'
with open(file_name, 'rb') as f:
    data_test = pickle.load(f)

# load val dataset
file_name = './data/yolo_v8x/yolo_v8x_1_dictionary_full_val.p'
with open(file_name, 'rb') as f:
    data_val = pickle.load(f)

# load train dataset
# file_name = './data/yolo_v8x/yolo_v8x_1_dictionary_full_train.p'
# with open(file_name, 'rb') as f:
#     data_train = pickle.load(f)


In [4]:
### TODO: make the experiment using single prompt 
### TODO: make the experiment using multiple prompts whenever available to make the model more robust
### TODO: do the experiment rephrasing the content in the prompt

SyntaxError: invalid syntax (1977629505.py, line 5)

In [5]:
# load the dataset

from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

def get_data_single_prompt(full_data):

    text_encoding, cat_encoding_text, box_coords, target_boxes = [], [], [], []

    for idx in tqdm(list(full_data)):
        # for _ in range(data['image_emb'].shape[0]):
        for idx_text in range(full_data[idx]['text_emb'].shape[0]):
            
            # number of available crops
            number_of_crop = min(full_data[idx]['image_emb'].shape[0], len(full_data[idx]['df_boxes']))

            if number_of_crop == 0:
                break
            
            text_enc = full_data[idx]['text_emb'][idx_text].unsqueeze(1)
            box_enc = full_data[idx]['image_emb'][:number_of_crop,:].permute(1, 0)

            # shape: (number of samples, 512, 1)
            text_encoding.append(text_enc)

            # shape: (number of samples, 512, number of crop embeddings)
            # box_encoding.append(box_enc)

            # shape: (number of samples, number of boxes, 4)
            box_coords.append(torch.stack([torch.tensor(full_data[idx]['df_boxes'].iloc[i][:4]).type(torch.float16) 
                                                    for i in range(number_of_crop)]))
                        
            # shape: torch.Size(number of samples, 2, 512, number of boxes])
            cat_encoding_text.append(torch.cat([box_enc.unsqueeze(0), torch.stack([text_enc.squeeze(1) for _ in range(number_of_crop)], dim=1).unsqueeze(0)], dim=0))
            
            # shape: (number of samples, 1, 4)
            target_boxes.append(torch.tensor(full_data[idx]['bbox_target']).type(torch.float16).unsqueeze(0))

           # (number of samples, 1, 512, 1)          padd the tensors to stack them togheter
    return torch.stack(text_encoding).unsqueeze(1), torch.stack([torch.nn.functional.pad(b.permute(0, 2, 1), (0, 0, 0, 48 - b.shape[2])).permute(0, 2, 1) for b in cat_encoding_text]), torch.stack([torch.nn.functional.pad(b, (0, 0, 0, 48 - b.shape[0])) for b in box_coords]), torch.stack(target_boxes)



text_encoding_test, box_encoding_test, box_coords_test, target_boxes_test = get_data_single_prompt(data_test)
text_encoding_val, box_encoding_val, box_coords_val, target_boxes_val = get_data_single_prompt(data_val)
# text_encoding_train, box_encoding_train, box_coords_train, target_boxes_train = get_data_single_prompt(data_train)

def box_norm_rescale(box_target):
    """ Rescale the box_target 
    Args:
        box_target: (number of samples, 1, 4)

    Returns:
        box_target: (number of samples, 1, 4)

    """
    # convert the box_pred to x1, y1, x2, y2
    box_target[:, 0, 2] = box_target[:, 0, 0] + box_target[:, 0, 2]
    box_target[:, 0, 3] = box_target[:, 0, 1] + box_target[:, 0, 3]

    return box_target

# box rescaling
target_boxes_test = box_norm_rescale(target_boxes_test)
target_boxes_val = box_norm_rescale(target_boxes_val)
# target_boxes_train = box_norm_rescale(target_boxes_train)



print(text_encoding_test.shape)
print(box_encoding_test.shape)
print(box_coords_test.shape)
print(target_boxes_test.shape)




100%|██████████| 5023/5023 [00:10<00:00, 457.92it/s]
100%|██████████| 2573/2573 [00:05<00:00, 469.02it/s]


torch.Size([9598, 1, 512, 1])
torch.Size([9598, 2, 512, 48])
torch.Size([9598, 48, 4])
torch.Size([9598, 1, 4])


In [6]:
target_boxes_test = target_boxes_test.squeeze(1)
target_boxes_val = target_boxes_val.squeeze(1)

In [20]:
# dataloaders

from torch.utils.data import DataLoader, TensorDataset

batch_size = 512

dataset_test = TensorDataset(box_encoding_test.type(torch.float32), box_coords_test.type(torch.float32), text_encoding_test.type(torch.float32).permute(0, 1, 3, 2), target_boxes_test.type(torch.float32))
dataset_val = TensorDataset(box_encoding_val.type(torch.float32), box_coords_val.type(torch.float32), text_encoding_val.type(torch.float32).permute(0, 1, 3, 2), target_boxes_val.type(torch.float32))
# dataset_train = TensorDataset(text_encoding_train.type(torch.float32), box_encoding_train.type(torch.float32), box_coords_train.type(torch.float32), target_boxes_train.type(torch.float32))

test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=True)
# train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)


# most basic trainer, uses good defaults
from pytorch_lightning.callbacks import EarlyStopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=3, verbose=True, mode="min")


# train the model
model = BoxEncoder().cuda()

print(model)

# print number of parameters
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000.0} M")

trainer = pl.Trainer(accelerator='auto', max_epochs=5, callbacks=[early_stop_callback])

# train the model
trainer.fit(model, test_loader, val_loader)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type      | Params
-------------------------------------------
0 | mlp          | MLP       | 165 K 
1 | linear_1     | Linear    | 12    
2 | layernorm_1  | LayerNorm | 6     
3 | linear_2     | Linear    | 12    
4 | layernorm_2  | LayerNorm | 6     
5 | weighted_sum | Linear    | 4     
6 | gelu         | GELU      | 0     
-------------------------------------------
165 K     Trainable params
0         Non-trainable params
165 K     Total params
0.661     Total estimated model params size (MB)


BoxEncoder(
  (mlp): MLP(
    (l1): Linear(in_features=4, out_features=128, bias=True)
    (l2): Linear(in_features=128, out_features=256, bias=True)
    (l3): Linear(in_features=256, out_features=512, bias=True)
    (gelu): GELU(approximate='none')
  )
  (linear_1): Linear(in_features=3, out_features=3, bias=True)
  (layernorm_1): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
  (linear_2): Linear(in_features=3, out_features=3, bias=True)
  (layernorm_2): LayerNorm((3,), eps=1e-05, elementwise_affine=True)
  (weighted_sum): Linear(in_features=3, out_features=1, bias=True)
  (gelu): GELU(approximate='none')
)
Number of parameters: 0.165288 M
                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 19/19 [00:58<00:00,  3.09s/it, v_num=77]

Metric val_loss improved. New best score: 70670.633


Epoch 1:  26%|██▋       | 5/19 [00:10<00:30,  2.20s/it, v_num=77] 

In [None]:
# BATCH_SIZE = 64

# box_encoding = torch.randn(BATCH_SIZE, 10, 512, 1).permute(0, 1, 3, 2)
# text_encoding = torch.randn(BATCH_SIZE, 10, 1, 512)

# box_encoding.norm(dim=-1, keepdim=True)
# text_encoding.norm(dim=-1, keepdim=True)


# # Get the batched text encoding
# x = torch.stack([
#         torch.stack([
#             torch.stack([
#             # (torch.Size([1, 512]) @ torch.Size([512, 1])) -> torch.Size([1, 1])
#             # [tex_i @ box_1.T, tex_i @ box_2.T, ..., tex_i @ box_j]
#             text_encoding[i][j_tex] @ box_encoding[i][j_box].T 
#                     for j_box in range(box_encoding.shape[1])])                            # get the matching score for each box
#                 for j_tex in range(text_encoding.shape[1])])                                # stack the matching scores in a matrix
#             for i in range(BATCH_SIZE)]).squeeze(4).squeeze(3).softmax(dim=-1)  # torch.Size([BATCH_SIZE, N_BOXES, N_IMAGES, 1, 1]) -> torch.Size([BATCH_SIZE, N_BOXES, N_IMAGES])
    
#                                                                         # with the softmax we are making sure that the scores for
#                                                                         # each piece of text sum to 1

# # I should sum over the row to get the overall score for each box
# # specifically, I am summing the probabilities of each box being 
# # the target box according to the matching score of the text encoding

# x = x.sum(dim=-2) # torch.Size([64, 10, 10]) -> torch.Size([64, 10])


# ''' output matrix
#     x:  [tex_0 @ box_1.T, tex_0 @ box_2.T, ..., tex_0 @ box_j] 
#         [tex_2 @ box_1.T, tex_2 @ box_2.T, ..., tex_2 @ box_j]
#         [tex_3 @ box_1.T, tex_3 @ box_2.T, ..., tex_3 @ box_j]
#                                 ...
#                                 ...
#         [tex_i @ box_1.T, tex_i @ box_2.T, ..., tex_i @ box_j]

#     I am normalizing over the rows and summing over the columns
#     to get the overall score for each box.

#     x summarize the matching in between the sentence and the boxes
#     x: [sum(tex_[:] @ box_1.T), sum(tex_[:] @ box_2.T), ..., sum(tex_[:] @ box_j)]
# '''

# # get the index of the box with the highest score
# x = torch.argmax(x, dim=-1) # torch.Size([BATCH_SIZE, N_BOXES]) -> torch.Size([BATCH_SIZE])

# box = torch.randn(BATCH_SIZE, 10, 4)

# # get the box with the highest score
# box[torch.arange(BATCH_SIZE), x].shape # torch.Size([BATCH_SIZE, N_BOXES, 4]) -> torch.Size([BATCH_SIZE, 4])

torch.Size([64, 4])

In [None]:
# batched mat mul
torch.bmm(box_encoding, text_encoding).squeeze(3).squeeze(2)

RuntimeError: batch1 must be a 3D tensor

In [None]:
import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    print(image_features.shape) # torch.Size([1, 512])
    text_features = model.encode_text(text_inputs)
    print(text_features.shape) # torch.Size([100, 512])

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

print(( image_features @ text_features.T).shape)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")

Files already downloaded and verified
torch.Size([1, 512])
torch.Size([100, 512])
torch.Size([1, 100])

Top predictions:

           snake: 65.48%
          turtle: 12.30%
    sweet_pepper: 3.87%
          lizard: 1.86%
       crocodile: 1.69%


In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.

### Experiments with resnet

The aim of this task is to determine the categories of objects present in the image. Without being so specific...

In [None]:
import torch
from PIL import Image
from torchvision import transforms

import urllib
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)


model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)
model.eval()



input_image = Image.open(filename)

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)


with torch.no_grad():
    output = model(input_batch)
    
    
output

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /Users/riccardotedoldi/.cache/torch/hub/v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/riccardotedoldi/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

tensor([[-9.4537e-02, -1.4596e+00, -1.1330e+00, -1.6520e+00, -2.8367e+00,
          2.5034e-01, -1.0562e+00,  3.0290e+00,  4.8499e+00,  1.4322e-01,
         -3.0074e+00, -1.2253e+00, -1.7812e+00, -2.6095e+00, -2.7430e+00,
         -1.6470e+00, -6.3207e-01,  1.2096e+00, -1.2136e-01, -1.0899e+00,
         -2.0130e+00, -6.3550e-01, -5.5107e-01,  1.1292e+00, -1.7935e+00,
         -4.5523e-01, -6.1320e-01, -6.3687e-01, -1.0281e+00,  1.1140e+00,
         -1.1821e+00, -9.3301e-01, -6.6413e-01, -2.8717e+00, -2.2735e+00,
         -1.1740e+00, -1.2014e+00, -1.5959e+00, -2.8250e+00, -5.8934e-01,
         -3.9307e-01, -2.6001e+00, -1.5767e+00, -2.5568e+00,  5.8025e-01,
         -2.1373e+00, -5.2948e-01, -1.5152e+00, -2.4959e+00, -1.1388e+00,
          1.6644e-01, -4.5050e-01,  1.6418e-01,  2.7721e-01, -1.1867e+00,
         -9.4941e-01, -1.2710e+00, -9.4497e-01, -5.2795e-01, -5.3808e-01,
          1.5079e+00, -1.5966e+00, -2.4035e+00, -1.5511e+00, -3.0469e+00,
         -2.0768e+00, -9.0381e-01, -9.

In [None]:
probabilities = torch.nn.functional.softmax(output[0], dim=0)
# print(probabilities)

# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())

Samoyed 0.8732963800430298
Pomeranian 0.030270840972661972
white wolf 0.019671104848384857
keeshond 0.011073537170886993
Eskimo dog 0.00920423399657011


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np


# as we could see in this simple example the embeddings
# found in the latent space are very similar to each other
# when ever the objects are involved in the sentence

# I chose the MiniLM model because it is the smallest model
# which can perform this task accurately without introducing
# overhead

sentences = ["the woman with the red dress is eating a pizza"]
s = ["person", 
             "dog",
             "ball",
             "ice cream"]

def euclidean_distance(a, b):
    return np.linalg.norm(a-b)

def coseine_similarity(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

import time
start = time.time()
embeddings = model.encode(sentences)
print('time required:',time.time()-start)

print(coseine_similarity(embeddings[0], embeddings[0]))
print(coseine_similarity(embeddings[0], embeddings[1]))
print(coseine_similarity(embeddings[0], embeddings[2]))
print(coseine_similarity(embeddings[0], embeddings[3]))
print(coseine_similarity(embeddings[0], embeddings[4]))

print(sentences)



time required: 0.013755083084106445
1.0000001
0.03727182
-0.0235776
-0.018512128
0.04793311
['the woman with the red dress is eating a pizza', 'person', 'dog', 'ball', 'ice cream']


In [None]:
file_name = 'yolov5l6+clip/1_dictionary_full_train.p'

with open(file_name, 'rb') as f:
    dictionary = pickle.load(f)

print('maximum number of boxes found: ',max([len(dictionary[sample]['image_emb']) for sample in dictionary.keys()]))
print('mean number of boxes found: ',np.array([len(dictionary[sample]['image_emb']) for sample in dictionary.keys()]).mean())
print('std number of boxes found: ',np.array([len(dictionary[sample]['image_emb']) for sample in dictionary.keys()]).std())
print('count max number of boxes found: ',[len(dictionary[sample]['image_emb']) for sample in dictionary.keys()].count(32))
print('count of number of boxes found:')
print([[len(dictionary[sample]['image_emb']) for sample in dictionary.keys()].count(xx+1) for xx in range(32)])

maximum number of boxes found:  32
mean number of boxes found:  5.56635722066973
std number of boxes found:  2.7148657554979914
count max number of boxes found:  3
count of number of boxes found:
[248, 1577, 6883, 8627, 7880, 5624, 3697, 2583, 1698, 1039, 776, 498, 341, 283, 144, 98, 56, 50, 36, 28, 17, 14, 9, 6, 2, 9, 0, 0, 0, 0, 0, 3]
