In [2]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')
# Project folder on the mounted drive.
DIRECTORY = "/content/drive/My Drive/cs231n/final project"
# Make the project folder the working directory, so that relative paths such as
# './embeddings.json' resolve against it.
%cd $DIRECTORY

Mounted at /content/drive
/content/drive/My Drive/cs231n/final project


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import json

import numpy as np

# Runtime configuration shared by the rest of the notebook.
USE_GPU = True
dtype = torch.float32  # floating-point type used throughout the notebook

# Use CUDA only when it was requested AND a CUDA device is actually present;
# otherwise fall back to the CPU.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Intended print frequency for the train loss.
print_every = 100
print('using device:', device)

using device: cpu


In [11]:
# Input feature sizes handed to RetrieverNN for the image branch and the
# query-text branch respectively.
imageEmbeddingSize, queryTextEmbeddingSize = 1000, 384

In [4]:
from ast import literal_eval

# Load in data and unwrap it
# Rebuild the original (tuple) keys from their stringified form.
def unwrap_keys(mapping):
    """Return a new dict with every key of ``mapping`` parsed by ``literal_eval``.

    Inputs:
    - mapping: dict whose keys are strings holding Python literals
      (e.g. ``"('a', 1)"``).

    Returns: a dict with the parsed keys and the original values, in the same
    iteration order. If two strings parse to the same key, the later entry wins.
    """
    restored = {}
    for raw_key, value in mapping.items():
        restored[literal_eval(raw_key)] = value
    return restored

# Read the embeddings file; the path is relative to the current working
# directory.
with open('./embeddings.json', 'r') as json_file:
    data_from_json = json.load(json_file)

# JSON object keys are always strings, so parse them back into the Python
# literals they were written from (tuples, per unwrap_keys' purpose).
unwrapped_data = unwrap_keys(data_from_json)

In [8]:
# Field names read from every record of `unwrapped_data`.
text_key = "text_embedding"
image_key = "image_embeddings_all"
score_key = "scores"
image_embeddings_top_5 = "image_embeddings_top5_idx"
# Training lists: one (image embedding, text embedding, score) triple per kept record.
image_embeddings = []
text_embeddings = []
y_output = []
# Evaluation lists, one entry per kept record: all candidate image embeddings,
# the text embedding repeated once per candidate, the top-5 index list, and the
# record's key.
X_image_eval = []
X_text_embed_eval = []
y_eval = []
prompts = []
for key, sub_dataset in unwrapped_data.items():
  text_embedding = sub_dataset[text_key]
  image_embedding = sub_dataset[image_key]
  scores = sub_dataset[score_key]
  top5 = sub_dataset[image_embeddings_top_5]
  # NOTE(review): a guard that skipped records where
  # len(image_embedding) != len(scores) used to sit here and is disabled.
  # range(1): only the first entry of top5 contributes a training sample.
  for i in range(1):
    idx = top5[i]
    if idx >= len(image_embedding):
      # The index points past the available embeddings: report the key and
      # skip the training sample. This `continue` only affects the inner loop,
      # so the evaluation appends below still run for this record.
      print(key)
      continue
    image_embeddings.append(image_embedding[idx])
    text_embeddings.append(text_embedding)
    # NOTE(review): the label is scores[i] (position within top5), not
    # scores[idx] -- confirm that `scores` is ordered like top5.
    y_output.append(scores[i])
  # Records whose first top-5 index is 0 are left out of the evaluation lists.
  # NOTE(review): the reason is not visible in this notebook -- confirm intent.
  if top5[0] == 0:
    continue
  X_image_eval.append(image_embedding)
  # Repeat the query embedding so it pairs row-by-row with every candidate image.
  X_text_embed_eval.append([text_embedding] * len(image_embedding))
  y_eval.append(top5)
  prompts.append(key)

# Number of training samples (the evaluation cell later rebinds N).
N = len(image_embeddings)
print(len(image_embeddings), len(text_embeddings), len(y_output))


224 224 224


In [9]:
# Turn the three training lists into tensors: image embeddings, query
# embeddings and target scores.
X_images, X_queries, y = (
    torch.tensor(values)
    for values in (image_embeddings, text_embeddings, y_output)
)
print(X_images.shape, X_queries.shape, y.shape)

# Shuffled mini-batches of (image embedding, query embedding, score).
batch_size = 32
dataset = TensorDataset(X_images, X_queries, y)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)


torch.Size([224, 1000]) torch.Size([224, 384]) torch.Size([224])


In [12]:
class EmbeddingProjectionNN(nn.Module):
    """MLP that maps an embedding of size ``embeddingSize`` to a 256-d projection.

    Layer order: Linear -> ReLU -> Dropout -> Linear -> LayerNorm -> ReLU ->
    Dropout -> Linear -> Tanh. Because of the final Tanh every output
    component lies in [-1, 1].
    """

    def __init__(self, embeddingSize):
        super().__init__()
        hidden, projected = 512, 256
        # Layers are created in forward order so parameter initialisation and
        # the Sequential indices (0, 3, 4, 7 hold parameters) stay the same.
        layers = [
            nn.Linear(embeddingSize, hidden),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden, projected),
            nn.Tanh(),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Project ``x`` of shape (N, embeddingSize) to shape (N, 256)."""
        return self.linear_relu_stack(x)

class RetrieverNN(nn.Module):
    """Scores image/query pairs by the cosine similarity of learned projections.

    Inputs to the constructor:
    - imageEmbedding: input feature size of the image projection network.
    - queryEmbedding: input feature size of the query projection network.
    """

    def __init__(self, imageEmbedding, queryEmbedding):
        super().__init__()
        self.imageProj = EmbeddingProjectionNN(imageEmbedding)
        self.queryProj = EmbeddingProjectionNN(queryEmbedding)
        self.similarity = nn.CosineSimilarity(dim=1)

    def forward(self, images, query):
        """Return one score in [0, 1] per (image, query) row pair.

        Inputs:
        - images: batch of image embeddings, one row per pair.
        - query: batch of query embeddings, row-aligned with ``images``.

        Returns: 1-D tensor of scores, 0.5 * cosine_similarity + 0.5.
        """
        imageProj = self.imageProj(images)
        queryProj = self.queryProj(query)
        similarities = self.similarity(imageProj, queryProj)
        # Map similarities from [-1, 1] to probability-like scores in [0, 1].
        scores = similarities * 0.5 + 0.5
        # Floating-point rounding can push a cosine similarity marginally
        # outside [-1, 1]; nn.BCELoss rejects inputs outside [0, 1], so clamp.
        return scores.clamp(0.0, 1.0)



In [13]:
# Binary cross-entropy between the predicted scores and the target scores.
loss = nn.BCELoss()
def train_part34(model, optimizer, epochs=10, scheduler = None):
    """
    Train `model` on the notebook-level `dataloader`, minimising the
    notebook-level `loss`.

    Inputs:
    - model: A PyTorch Module giving the model to train. It is called as
      model(imageEmbeddings, queryEmbedding) and its output is passed to `loss`
      together with the batch targets.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    - scheduler: (Optional) A learning-rate scheduler. Its step() is called
      right after every optimizer step, i.e. once per batch (not once per epoch).

    Returns: Nothing, but prints the training loss of every batch.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        # Nothing inside the batch loop leaves training mode, so setting it
        # once per epoch is enough.
        model.train()
        for t, (imageEmbeddings, queryEmbedding, targets) in enumerate(dataloader):
            # Batch layout: (B, E1), (B, E2), (B,)
            imageEmbeddings = imageEmbeddings.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            queryEmbedding = queryEmbedding.to(device=device, dtype=dtype)
            targets = targets.to(device=device, dtype=dtype)

            scores = model(imageEmbeddings, queryEmbedding)
            output = loss(scores, targets)

            # Zero out all of the gradients for the variables which the optimizer
            # will update.
            optimizer.zero_grad()

            # This is the backwards pass: compute the gradient of the loss with
            # respect to each parameter of the model.
            output.backward()

            # Actually update the parameters of the model using the gradients
            # computed by the backwards pass.
            optimizer.step()
            # Explicit None check: do not rely on the scheduler's truthiness.
            if scheduler is not None:
                scheduler.step()
            # The first number is the epoch index, the second the batch index
            # within that epoch (the old message labelled the epoch "Iteration").
            print('Epoch %d, iteration %d, loss = %.4f' % (e, t, output.item()))




In [14]:

# Build the retriever for the configured embedding sizes and train it with
# Adam (learning rate 0.01) for 20 epochs.
model = RetrieverNN(imageEmbeddingSize, queryTextEmbeddingSize)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_part34(model, optimizer, epochs = 20)


Iteration 0, loss = 0.6924
Iteration 0, loss = 0.7081
Iteration 0, loss = 0.8779
Iteration 0, loss = 1.0768
Iteration 0, loss = 0.6839
Iteration 0, loss = 0.7129
Iteration 0, loss = 0.6876
Iteration 1, loss = 0.6651
Iteration 1, loss = 0.7220
Iteration 1, loss = 0.6787
Iteration 1, loss = 0.6998
Iteration 1, loss = 0.6953
Iteration 1, loss = 0.6884
Iteration 1, loss = 0.6964
Iteration 2, loss = 0.6853
Iteration 2, loss = 0.6773
Iteration 2, loss = 0.6773
Iteration 2, loss = 0.6798
Iteration 2, loss = 0.6646
Iteration 2, loss = 0.6637
Iteration 2, loss = 0.6764
Iteration 3, loss = 0.6923
Iteration 3, loss = 0.6573
Iteration 3, loss = 0.6540
Iteration 3, loss = 0.6637
Iteration 3, loss = 0.6488
Iteration 3, loss = 0.6541
Iteration 3, loss = 0.6153
Iteration 4, loss = 0.6426
Iteration 4, loss = 0.6075
Iteration 4, loss = 0.6134
Iteration 4, loss = 0.6127
Iteration 4, loss = 0.6394
Iteration 4, loss = 0.6108
Iteration 4, loss = 0.6207
Iteration 5, loss = 0.6216
Iteration 5, loss = 0.5922
I

In [19]:
# Top-1 retrieval evaluation over the records collected in X_image_eval.
#   eval_tight: the best-scoring candidate is exactly the first top-5 index.
#   eval_loose: the best-scoring candidate is anywhere in the top-5 list.
model.eval()  # set model to evaluation mode
eval_loose = 0
eval_tight = 0
N = len(X_image_eval)
with torch.no_grad():
    for i in range(N):
        # The training function moves the model to `device`, so the inputs
        # must live there too; otherwise a CUDA run fails with a device mismatch.
        X_image = torch.tensor(X_image_eval[i]).to(device=device, dtype=dtype)
        X_query_eval = torch.tensor(X_text_embed_eval[i]).to(device=device, dtype=dtype)
        top5 = y_eval[i]
        probs = model(X_image, X_query_eval)
        # .cpu() is required before .numpy() when the scores live on the GPU.
        top_pred = probs.detach().cpu().numpy().squeeze()
        print(top5[0], top_pred)
        pred_idx = np.argmax(top_pred)
        if pred_idx in top5:
            eval_loose += 1
        if pred_idx == top5[0]:
            eval_tight += 1
# Report (tight accuracy, loose accuracy); avoid dividing by zero when no
# record made it into the evaluation lists.
if N > 0:
    print(float(eval_tight)/ N, float(eval_loose)/N)
else:
    print('no evaluation samples')



7 [0.7594562  0.96876264 0.9687709  0.9561261  0.49438563 0.67538357
 0.96875215 0.3707183  0.9687599  0.47646883 0.67671984 0.42568076
 0.33498877 0.3527168  0.6985896  0.44032323 0.4178658  0.9350209
 0.7201774  0.8628553 ]
1 [0.7594563  0.96876264 0.96877086 0.95612615 0.49438563 0.67538357
 0.96875226 0.3707183  0.96875995 0.4764688  0.67671984 0.42568076
 0.3349887  0.35271674 0.6985896  0.44032323 0.4178658  0.9350209
 0.7201775  0.8628553 ]
7 [0.7594563  0.96876264 0.96877086 0.95612615 0.49438563 0.67538357
 0.96875226 0.3707183  0.96875995 0.4764688  0.67671984 0.42568076
 0.3349887  0.35271674 0.6985896  0.44032323 0.4178658  0.9350209
 0.7201775  0.8628553 ]
2 [0.7594562  0.96876264 0.96877086 0.95612615 0.49438563 0.67538357
 0.96875215 0.3707183  0.9687599  0.47646883 0.6767198  0.42568076
 0.33498874 0.3527168  0.69858956 0.44032323 0.41786584 0.9350209
 0.7201774  0.8628553 ]
7 [0.7594562  0.96876264 0.96877086 0.95612615 0.49438563 0.67538357
 0.96875215 0.3707183  0.96