In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
DIRECTORY = "/content/drive/My Drive/cs231n/final project"
%cd $DIRECTORY

Mounted at /content/drive
/content/drive/My Drive/cs231n/final project


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import json

import numpy as np

USE_GPU = True
dtype = torch.float32 # We will be using float throughout this tutorial.

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss.
print_every = 100
print('using device:', device)

using device: cpu


In [5]:
imageEmbeddingSize, queryTextEmbeddingSize = 1000, 384

In [6]:
from ast import literal_eval

# Load in data and unwrap it
# Function to convert stringified tuple keys back to tuples
def unwrap_keys(mapping):
    return {literal_eval(k): v for k, v in mapping.items()}

# Load the JSON file
with open('./embeddings.json', 'r') as json_file:
    data_from_json = json.load(json_file)

# print(data_from_json)
# Unwrap the keys to their original tuple format
unwrapped_data = unwrap_keys(data_from_json)

In [8]:
text_key = "text_embedding"
image_key = "image_embeddings_all"
score_key = "scores"
image_embeddings_top_5 = "image_embeddings_top5_idx"
image_embeddings = []
text_embeddings = []
y_output = []
X_image_eval = []
X_text_embed_eval = []
y_eval = []
prompts = []
for key, sub_dataset in unwrapped_data.items():
  # pprint.pp(sub_dataset)
  text_embedding = sub_dataset[text_key]
  image_embedding = sub_dataset[image_key]
  # print(len(image_embedding))
  scores = sub_dataset[score_key]
  top5 = sub_dataset[image_embeddings_top_5]
  # print(len(image_embedding), len(text_embedding), len(scores))
  # if len(image_embedding) != len(scores):
  #   continue
  # print(top5, len(image_embedding))
  # print(key)
  for i in range(1):
    idx = top5[i]
    # print(top5, len(image_embedding))
    if idx >= len(image_embedding):
      # print(top5, len(image_embedding))
      print(key)
      continue
    image_embeddings.append(image_embedding[idx])
    text_embeddings.append(text_embedding)
    y_output.append(scores[i])
  if top5[0] == 0:
    continue
  X_image_eval.append(image_embedding)
  X_text_embed_eval.append([text_embedding] * len(image_embedding))
  y_eval.append(top5)
  prompts.append(key)

# print(y_output)
N = len(image_embeddings)
print(len(image_embeddings), len(text_embeddings), len(y_output))
# print(y_output)


224 224 224


In [9]:
X_images = torch.tensor(image_embeddings)
X_queries = torch.tensor(text_embeddings)
y = torch.tensor(y_output)
print(X_images.shape, X_queries.shape, y.shape)
dataset = TensorDataset(X_images, X_queries, y)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# print(X_images.shape)


torch.Size([224, 1000]) torch.Size([224, 384]) torch.Size([224])


In [42]:
class EmbeddingProjectionNN(nn.Module):
    def __init__(self, embeddingSize):
      super().__init__()
      self.linear_relu_stack = nn.Sequential(
          nn.Linear(embeddingSize, 512),
          nn.ReLU(),
          nn.Dropout(),
          nn.Linear(512, 512),
          nn.LayerNorm(512),
          nn.ReLU(),
          nn.Dropout(),
          nn.Linear(512, 256),
          nn.ReLU()
      )

    def forward(self, x):
      # input (N, E)
      projection = self.linear_relu_stack(x)
      return projection

class RetrieverNN(nn.Module):
  def __init__(self, imageEmbedding, queryEmbedding):
    super().__init__()
    self.imageProj = EmbeddingProjectionNN(imageEmbedding)
    self.queryProj = EmbeddingProjectionNN(queryEmbedding)
    self.similarity = nn.CosineSimilarity(dim=1)


  def forward(self, images, query):
    imageProj = self.imageProj(images)
    queryProj = self.queryProj(query)
    similarities = self.similarity(imageProj, queryProj)
    # make similarities a prob scores between 0 and 1
    scores = similarities * 0.5 + 0.5
    return scores



In [13]:
loss = nn.BCELoss()
def train_part34(model, optimizer, epochs=10, scheduler = None):
    """
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        for imageEmbeddings, queryEmbedding, y in dataloader:
          # (B, E1), (B, E2), (B,)
          model.train()  # put model to training mode
          imageEmbeddings = imageEmbeddings.to(device=device, dtype=dtype)  # move to device, e.g. GPU
          queryEmbedding = queryEmbedding.to(device=device, dtype=dtype)
          y = y.to(device=device, dtype=dtype)

          scores = model(imageEmbeddings, queryEmbedding)
          # print(scores)
          # print(y)
          output = loss(scores, y)
          # print(scores, y)

          # Zero out all of the gradients for the variables which the optimizer
          # will update.
          optimizer.zero_grad()

          # This is the backwards pass: compute the gradient of the loss with
          # respect to each  parameter of the model.
          output.backward()

          # Actually update the parameters of the model using the gradients
          # computed by the backwards pass.
          optimizer.step()
          if scheduler:
            scheduler.step()
          # print('Iteration %d, loss = %.4f' % (e, output.item()))




In [None]:

model = RetrieverNN(imageEmbeddingSize, queryTextEmbeddingSize)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_part34(model, optimizer, epochs = 20)


In [53]:
def eval_model():
#Evaluate model
  model.eval()  # set model to evaluation mode
  eval_loose = 0
  eval_tight = 0
  N = len(X_image_eval)
  with torch.no_grad():
    for i in range(N):
      X_image = torch.tensor(X_image_eval[i])
      X_query_eval = torch.tensor(X_text_embed_eval[i])
      top5 = y_eval[i]
      probs = model(X_image, X_query_eval)
      top_pred = probs.detach().numpy().squeeze()
      # print(prompts[i])
      # print(top5[0], top_pred)
      pred_idx = np.argmax(top_pred)
      if pred_idx in top5:
        eval_loose += 1
      if pred_idx == top5[0]:
        eval_tight += 1
  return (float(eval_tight)/ N, float(eval_loose)/N)



Training with contrastive examples

In [28]:
text_key = "text_embedding"
image_key = "image_embeddings_all"
score_key = "scores"
image_embeddings_top_5 = "image_embeddings_top5_idx"
image_embeddings = []
text_embeddings = []
y_output = []
X_image_eval = []
X_text_embed_eval = []
y_eval = []
prompts = []
indeces = []
for key, sub_dataset in unwrapped_data.items():
  # print(key)
  if key[1] != 1:
    continue
  text_embedding = sub_dataset[text_key]
  image_embedding = sub_dataset[image_key]
  scores = sub_dataset[score_key]
  top5 = sub_dataset[image_embeddings_top_5]
  for i in range(1):
    idx = top5[i]
    # print(top5, len(image_embedding))
    if idx >= len(image_embedding):
      # print(top5, len(image_embedding))
      print(key)
      continue
    indeces.append(idx)
    image_embeddings.append(image_embedding[idx])
    text_embeddings.append(text_embedding)
    y_output.append(scores[i])
  # if top5[0] == 0:
  #   continue
  X_image_eval.append(image_embedding)
  X_text_embed_eval.append([text_embedding] * len(image_embedding))
  y_eval.append(top5)
  # prompts.append(key)

# print(y_output)
N = len(image_embeddings)
print(len(image_embeddings), len(text_embeddings), len(y_output))
# print(y_output)


112 112 112


In [29]:
X_images = torch.tensor(image_embeddings)
X_queries = torch.tensor(text_embeddings)
y = torch.tensor(y_output)

In [57]:
loss = nn.BCELoss()
def train_part_contrastive(model, optimizer, epochs=10, scheduler = None):
    """
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
      interval = 32
      for i in range(0, N, interval):
          # Get the current batch and train that batch
          image_batch = X_images[i:i + interval].to(device=device, dtype=dtype)
          query_batch = X_queries[i:i + interval].to(device=device, dtype=dtype)
          y_batch = y[i:i + interval].to(device=device, dtype=dtype)
          index_batch = indeces[i:i + interval]
          model.train()  # put model to training mode
          scores = model(image_batch, query_batch)
          output = loss(scores, y_batch)
          optimizer.zero_grad()
          # output.backward()


          # find the best contrastive examples for each text prompt
          model.eval()
          eval_images = X_image_eval[i:i + interval]
          query_eval = X_text_embed_eval[i:i + interval]

          contrastive_images = torch.zeros(len(eval_images), imageEmbeddingSize)
          for j in range(len(eval_images)):
            images_i = torch.tensor(eval_images[j]).to(device=device, dtype=dtype)
            queries_i = torch.tensor(query_eval[j]).to(device=device, dtype=dtype)
            probs = model(images_i, queries_i).detach().numpy().squeeze()
            sorted = np.argsort(probs)[::-1]
            # print(index_batch, j)
            correct = index_batch[j]

            if sorted[0] != correct:
              contrastive_images[j] = images_i[sorted[0]]
            else:
              contrastive_images[j] = images_i[sorted[1]]

          # print(contrastive_images)
          # image_batch = torch.tensor(contrastive_images).to(device=device, dtype=dtype)
          y_batch = torch.zeros(len(eval_images))

          model.train()  # put model to training mode
          scores = model(contrastive_images, query_batch)
          output += loss(scores, y_batch)
          output.backward()

          optimizer.step()
          if scheduler:
            scheduler.step()
      eval_acc = eval_model()
      print('Iteration %d, loss = %.4f \n' % (e, output.item()))
      print('Iteration %d, 1_accuracy = %.4f, 5_accuracy = %.4f \n' % (e, eval_acc[0], eval_acc[1]))




In [59]:
model = RetrieverNN(imageEmbeddingSize, queryTextEmbeddingSize)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_part_contrastive(model, optimizer, epochs = 20)

Iteration 0, loss = 1.3949 

Iteration 0, 1_accuracy = 0.1696, 5_accuracy = 0.7857 

Iteration 1, loss = 1.3855 

Iteration 1, 1_accuracy = 0.1161, 5_accuracy = 0.4107 

Iteration 2, loss = 1.3830 

Iteration 2, 1_accuracy = 0.1786, 5_accuracy = 0.7500 

Iteration 3, loss = 1.3876 

Iteration 3, 1_accuracy = 0.1518, 5_accuracy = 0.4732 

Iteration 4, loss = 1.3912 

Iteration 4, 1_accuracy = 0.1339, 5_accuracy = 0.5179 

Iteration 5, loss = 1.3855 

Iteration 5, 1_accuracy = 0.2054, 5_accuracy = 0.7411 

Iteration 6, loss = 1.3863 

Iteration 6, 1_accuracy = 0.1071, 5_accuracy = 0.3214 

Iteration 7, loss = 1.3865 

Iteration 7, 1_accuracy = 0.0893, 5_accuracy = 0.2946 

Iteration 8, loss = 1.3878 

Iteration 8, 1_accuracy = 0.0804, 5_accuracy = 0.2143 

Iteration 9, loss = 1.3867 

Iteration 9, 1_accuracy = 0.0982, 5_accuracy = 0.5357 

Iteration 10, loss = 1.3835 

Iteration 10, 1_accuracy = 0.2411, 5_accuracy = 0.6429 

Iteration 11, loss = 1.3893 

Iteration 11, 1_accuracy = 0.1964