In [2]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting lightning-utilities>=0.7.0
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.8.0 pytorch-lightning-2.0.2 torchmetrics-0.11.4


In [142]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import pytorch_lightning as pl

In [69]:

############################ 1. MLP

class MLP(nn.Module):
    def __init__(self, out_dim=512):
        super().__init__()
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, out_dim)
        self.gelu = nn.GELU()
        
    def forward(self, x):
        x = self.gelu(self.l1(x))
        x = self.gelu(self.l2(x))
        return self.l3(x)


############################ 2. BoxEncoder

class BoxEncoder(pl.LightningModule):
    def __init__(self, out_dim_box=512, latent_dim=512):
        super().__init__()
        
        self.mlp = MLP(out_dim_box)
        
        self.linear_1 = nn.Linear(3, 3)
        self.linear_2 = nn.Linear(3, 3)
        
        self.weighted_sum = nn.Linear(3, 1)
        
        self.gelu = nn.GELU()
        
    def forward(self, x, box, text_emb):
        
        # x: (batch_size, 512, n_boxes)
        box = self.mlp(box).permute(0, 2, 1).unsqueeze(1)
        
        # concat x and box
        # x: (batch_size, 3, 512, n_boxes)
        x = torch.cat([x, box], dim=1)
        
        # free box
        del box
        
        # cross_weighted_sum_block
        # x: (batch_size, n_boxes, 512, 3)
        x = self.linear_1(x.permute(0, 3, 2, 1))
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.gelu(x)
        x = self.weighted_sum(x)
        
        print(x.shape)
    
        return box
    
    def training_step(self, batch, batch_idx):
        cat_emb_text, box, t_emb, y = batch
        # print(cat_emb_text.shape, box.shape, y.shape)
        y_hat = self(cat_emb_text, box, t_emb)
        loss = F.cross_entropy(y_hat, y)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
        return [optimizer], [scheduler]
        




In [145]:
torch.manual_seed(42)

text_encoding = torch.zeros(400, 512, 1)
box_encoding = torch.randn(400, 512, 10)



cat_encoding_text = torch.cat([
                        box_encoding.unsqueeze(1), 
                        torch.stack([
                            text_encoding.squeeze(2) for _ in range(10)], dim=1).permute(0, 2, 1).unsqueeze(1)
                            ], dim=1)

text_encoding = torch.stack([text_encoding.squeeze(2) for _ in range(10)], dim=1).permute(0, 2, 1).unsqueeze(1).permute(0, 3, 2, 1)

box_coords = torch.randn(400, 10, 4)
box_coords_target = box_coords[:, 0, :]

print(cat_encoding_text.shape, box_coords.shape, box_coords_target.shape, text_encoding.shape)

# dataloader

from torch.utils.data import DataLoader, TensorDataset

# get dataset
dataset = TensorDataset(cat_encoding_text, box_coords, text_encoding,box_coords_target)

# get dataloader
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# split with stratified kfold
# from sklearn.model_selection import StratifiedKFold

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# kfold = []

# for fold, (train_idx, val_idx) in enumerate(skf.split(cat_encoding_text, torch.zeros(4000), torch.zeros(4000))):
    
#     print(f"Fold: {fold}")
    
#     # get dataloader
#     train_loader = DataLoader(dataset[train_idx], batch_size=64, shuffle=True)
#     val_loader = DataLoader(dataset[val_idx], batch_size=64, shuffle=False)
    
#     # print(train_idx, val_idx)
    
#     # append the fold
#     kfold.append((train_loader, val_loader))


torch.Size([400, 2, 512, 10]) torch.Size([400, 10, 4]) torch.Size([400, 4]) torch.Size([400, 10, 512, 1])


In [186]:

# multiply the text encoding in batch
[[text_encoding[i][j].T @ text_encoding[i][j] for j in range(10)] for i in range(400)]


[[tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]])],
 [tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]])],
 [tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]])],
 [tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]])],
 [tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]])],
 [tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  tensor([[0.]]),
  ten

In [70]:
# train the model
model = BoxEncoder()

trainer = pl.Trainer(max_epochs=10, accelerator='auto')

trainer.fit(model, dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type   | Params
----------------------------------------
0 | mlp          | MLP    | 165 K 
1 | linear_1     | Linear | 12    
2 | linear_2     | Linear | 12    
3 | weighted_sum | Linear | 4     
4 | gelu         | GELU   | 0     
----------------------------------------
165 K     Trainable params
0         Non-trainable params
165 K     Total params
0.661     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

torch.Size([64, 2, 512, 10]) torch.Size([64, 10, 4]) torch.Size([64, 4])
torch.Size([64, 10, 512, 1])


UnboundLocalError: local variable 'box' referenced before assignment

In [71]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.

### Experiments with resnet

The aim of this task is to determine the categories of objects present in the image. Without being so specific...

In [99]:
import torch
from PIL import Image
from torchvision import transforms

import urllib
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)


model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)
model.eval()



input_image = Image.open(filename)

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)


with torch.no_grad():
    output = model(input_batch)
    
    
output

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /Users/riccardotedoldi/.cache/torch/hub/v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /Users/riccardotedoldi/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

tensor([[-9.4537e-02, -1.4596e+00, -1.1330e+00, -1.6520e+00, -2.8367e+00,
          2.5034e-01, -1.0562e+00,  3.0290e+00,  4.8499e+00,  1.4322e-01,
         -3.0074e+00, -1.2253e+00, -1.7812e+00, -2.6095e+00, -2.7430e+00,
         -1.6470e+00, -6.3207e-01,  1.2096e+00, -1.2136e-01, -1.0899e+00,
         -2.0130e+00, -6.3550e-01, -5.5107e-01,  1.1292e+00, -1.7935e+00,
         -4.5523e-01, -6.1320e-01, -6.3687e-01, -1.0281e+00,  1.1140e+00,
         -1.1821e+00, -9.3301e-01, -6.6413e-01, -2.8717e+00, -2.2735e+00,
         -1.1740e+00, -1.2014e+00, -1.5959e+00, -2.8250e+00, -5.8934e-01,
         -3.9307e-01, -2.6001e+00, -1.5767e+00, -2.5568e+00,  5.8025e-01,
         -2.1373e+00, -5.2948e-01, -1.5152e+00, -2.4959e+00, -1.1388e+00,
          1.6644e-01, -4.5050e-01,  1.6418e-01,  2.7721e-01, -1.1867e+00,
         -9.4941e-01, -1.2710e+00, -9.4497e-01, -5.2795e-01, -5.3808e-01,
          1.5079e+00, -1.5966e+00, -2.4035e+00, -1.5511e+00, -3.0469e+00,
         -2.0768e+00, -9.0381e-01, -9.

In [102]:
probabilities = torch.nn.functional.softmax(output[0], dim=0)
# print(probabilities)

# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())

Samoyed 0.8732963800430298
Pomeranian 0.030270840972661972
white wolf 0.019671104848384857
keeshond 0.011073537170886993
Eskimo dog 0.00920423399657011


In [109]:
from sentence_transformers import SentenceTransformer
import numpy as np


# as we could see in this simple example the embeddings
# found in the latent space are very similar to each other
# when ever the objects are involved in the sentence

# I chose the MiniLM model because it is the smallest model
# which can perform this task accurately without introducing
# overhead

sentences = ["the woman with the red dress is eating a pizza"]
s = ["person", 
             "dog",
             "ball",
             "ice cream"]

def euclidean_distance(a, b):
    return np.linalg.norm(a-b)

def coseine_similarity(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

import time
start = time.time()
embeddings = model.encode(sentences)
print('time required:',time.time()-start)

print(coseine_similarity(embeddings[0], embeddings[0]))
print(coseine_similarity(embeddings[0], embeddings[1]))
print(coseine_similarity(embeddings[0], embeddings[2]))
print(coseine_similarity(embeddings[0], embeddings[3]))
print(coseine_similarity(embeddings[0], embeddings[4]))

print(sentences)



time required: 0.013755083084106445
1.0000001
0.03727182
-0.0235776
-0.018512128
0.04793311
['the woman with the red dress is eating a pizza', 'person', 'dog', 'ball', 'ice cream']


In [141]:
file_name = 'yolov5l6+clip/1_dictionary_full_train.p'

with open(file_name, 'rb') as f:
    dictionary = pickle.load(f)

print('maximum number of boxes found: ',max([len(dictionary[sample]['image_emb']) for sample in dictionary.keys()]))
print('mean number of boxes found: ',np.array([len(dictionary[sample]['image_emb']) for sample in dictionary.keys()]).mean())
print('std number of boxes found: ',np.array([len(dictionary[sample]['image_emb']) for sample in dictionary.keys()]).std())
print('count max number of boxes found: ',[len(dictionary[sample]['image_emb']) for sample in dictionary.keys()].count(32))
print('count of number of boxes found:')
print([[len(dictionary[sample]['image_emb']) for sample in dictionary.keys()].count(xx+1) for xx in range(32)])

maximum number of boxes found:  32
mean number of boxes found:  5.56635722066973
std number of boxes found:  2.7148657554979914
count max number of boxes found:  3
count of number of boxes found:
[248, 1577, 6883, 8627, 7880, 5624, 3697, 2583, 1698, 1039, 776, 498, 341, 283, 144, 98, 56, 50, 36, 28, 17, 14, 9, 6, 2, 9, 0, 0, 0, 0, 0, 3]
