In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Project data folder on the mounted drive; every relative path used later in
# the notebook (e.g. './clip_embeddings.json') is resolved against it.
DIRECTORY = "/content/drive/My Drive/CS231n-Final-Project/small-dataset"
# IPython expands $DIRECTORY before running the %cd magic, making the data
# folder the current working directory.
%cd $DIRECTORY

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/CS231n-Final-Project/small-dataset


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import json

import numpy as np

# Master switch: set to False to force CPU even when CUDA is present.
USE_GPU = True
# dtype that training batches are cast to when they are moved to the device.
dtype = torch.float32

# Use CUDA only when it is both requested and actually available.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

# Constant to control how frequently we print train loss.
print_every = 100
print('using device:', device)

using device: cuda


In [3]:
# Widths of the precomputed image and query-text embedding vectors. They are
# passed to RetrieverNN as the input sizes of its two projection networks.
imageEmbeddingSize, queryTextEmbeddingSize = 512, 384

In [4]:
from ast import literal_eval

# Load in data and unwrap it
# Rebuild the original tuple keys from their stringified JSON form.
def unwrap_keys(mapping):
    """Return a copy of `mapping` whose string keys are parsed as Python literals.

    Each key is passed through `ast.literal_eval`, so a key such as
    "('some text', 1)" becomes the tuple ('some text', 1). Values are carried
    over unchanged.
    """
    restored = {}
    for raw_key, value in mapping.items():
        restored[literal_eval(raw_key)] = value
    return restored

# Load the precomputed embeddings. The path is relative, so it resolves against
# the working directory selected by the %cd magic in the first cell.
with open('./clip_embeddings.json', 'r') as json_file:
    data_from_json = json.load(json_file)

# JSON object keys are always strings; turn them back into the tuple keys the
# rest of the notebook indexes with (e.g. `unwrapped_data[prompt]`).
unwrapped_data = unwrap_keys(data_from_json)

In [5]:
# Field names read from every entry of `unwrapped_data`.
text_key = "text_embedding"
image_key = "image_embeddings_all"
score_key = "scores"
image_embeddings_top_5 = "image_embeddings_top5_idx"
# Training samples: one (image embedding, text embedding, score) triple per query.
image_embeddings = []
text_embeddings = []
y_output = []
# Evaluation samples: for each kept query, all of its candidate image
# embeddings, the text embedding repeated once per candidate, the top-5 index
# list, and the query key itself.
X_image_eval = []
X_text_embed_eval = []
y_eval = []
prompts = []
for key, sub_dataset in unwrapped_data.items():
  text_embedding = sub_dataset[text_key]
  image_embedding = sub_dataset[image_key]
  scores = sub_dataset[score_key]
  top5 = sub_dataset[image_embeddings_top_5]
  # range(1): only the first entry of the top-5 list becomes a training sample.
  for i in range(1):
    idx = top5[i]
    if idx >= len(image_embedding):
      # Index points past the available images: report the query and skip the
      # training sample. This `continue` only leaves the inner loop, so the
      # entry can still be added to the evaluation lists below.
      print(key)
      continue
    image_embeddings.append(image_embedding[idx])
    text_embeddings.append(text_embedding)
    # NOTE(review): the label is scores[i] (position i) while the image is
    # image_embedding[top5[i]]. That pairing is only right if `scores` is
    # ordered like the top-5 list rather than indexed per image -- confirm
    # against the code that produced the JSON.
    y_output.append(scores[i])
  # Queries whose best image is index 0 are left out of the evaluation set.
  # NOTE(review): the reason is not evident from this cell -- confirm intent.
  if top5[0] == 0:
    continue
  X_image_eval.append(image_embedding)
  X_text_embed_eval.append([text_embedding] * len(image_embedding))
  y_eval.append(top5)
  prompts.append(key)

# Number of training samples. `N` is reassigned later by the evaluation cell.
# NOTE(review): the evaluation lists are built from the same queries as the
# training lists; no held-out split is made in this cell.
N = len(image_embeddings)
print(len(image_embeddings), len(text_embeddings), len(y_output))


224 224 224


In [6]:
# Stack the per-query training samples into tensors.
# Every stored image embedding carries a leading singleton axis, so the stacked
# tensor is (num_samples, 1, embedding_dim). Squeeze only that axis: a bare
# .squeeze() would also drop the sample axis whenever there is a single sample.
X_images = torch.tensor(image_embeddings).squeeze(1)
X_queries = torch.tensor(text_embeddings)
y = torch.tensor(y_output)
print(X_images.shape, X_queries.shape, y.shape)

# Shuffled mini-batches of (image embedding, query embedding, target score).
dataset = TensorDataset(X_images, X_queries, y)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


torch.Size([224, 512]) torch.Size([224, 384]) torch.Size([224])


In [7]:
class EmbeddingProjectionNN(nn.Module):
    """Three-layer MLP projecting an embedding into a 256-dimensional space.

    The final Tanh keeps every output component inside [-1, 1].
    """

    def __init__(self, embeddingSize):
        """Build the projection stack for inputs of width `embeddingSize`."""
        super().__init__()
        hidden_width, projection_width = 512, 256
        layers = [
            nn.Linear(embeddingSize, hidden_width),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_width, hidden_width),
            nn.LayerNorm(hidden_width),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_width, projection_width),
            nn.Tanh(),
        ]
        # Attribute name and layer order are unchanged, so state_dict keys
        # (linear_relu_stack.<index>.*) stay the same.
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of embeddings (N, embeddingSize) to projections (N, 256)."""
        return self.linear_relu_stack(x)

class RetrieverNN(nn.Module):
    """Two-tower retriever scoring image/query pairs by cosine similarity.

    Images and queries each go through their own EmbeddingProjectionNN; the
    row-wise cosine similarity of the two projections is rescaled to [0, 1].
    """

    def __init__(self, imageEmbedding, queryEmbedding):
        """Create one projection tower per modality.

        `imageEmbedding` and `queryEmbedding` are the input widths of the
        image and query embeddings respectively.
        """
        super().__init__()
        self.imageProj = EmbeddingProjectionNN(imageEmbedding)
        self.queryProj = EmbeddingProjectionNN(queryEmbedding)
        self.similarity = nn.CosineSimilarity(dim=1)

    def forward(self, images, query):
        """Return one score in [0, 1] per (image, query) row pair."""
        image_vectors = self.imageProj(images)
        query_vectors = self.queryProj(query)
        cosine = self.similarity(image_vectors, query_vectors)
        # Cosine similarity lies in [-1, 1]; shift and scale it onto [0, 1] so
        # it can be consumed as a probability-like score.
        return cosine * 0.5 + 0.5



In [8]:
# Binary cross-entropy between the model's [0, 1] scores and the target scores.
loss = nn.BCELoss()
def train_part34(model, optimizer, epochs=10, scheduler = None):
    """
    Train `model` on the module-level `dataloader` using the module-level
    `loss`, `device` and `dtype`.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    - scheduler: (Optional) An object with a `step()` method; it is stepped
      once after every optimizer step (i.e. per batch, not per epoch).

    Returns: Nothing, but prints the batch loss after every iteration.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    # Nothing inside the loop switches the model out of training mode, so it is
    # enough to enable it once instead of once per batch.
    model.train()
    iteration = 0  # running batch counter across all epochs
    for e in range(epochs):
        for imageEmbeddings, queryEmbedding, targets in dataloader:
            # Batch shapes: (B, E1), (B, E2), (B,)
            imageEmbeddings = imageEmbeddings.to(device=device, dtype=dtype)
            queryEmbedding = queryEmbedding.to(device=device, dtype=dtype)
            targets = targets.to(device=device, dtype=dtype)

            scores = model(imageEmbeddings, queryEmbedding)
            output = loss(scores, targets)

            # Zero out all of the gradients for the variables which the
            # optimizer will update.
            optimizer.zero_grad()

            # Backward pass: compute the gradient of the loss with respect to
            # each parameter of the model.
            output.backward()

            # Update the parameters using the gradients computed above.
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # The previous message labelled the epoch index as "Iteration";
            # report both counters under their real names.
            print('Epoch %d, iteration %d, loss = %.4f' % (e, iteration, output.item()))
            iteration += 1




In [9]:

# Build the retriever with the image / query embedding widths defined above.
model = RetrieverNN(imageEmbeddingSize, queryTextEmbeddingSize)
# Adam over the parameters of both projection towers; no LR scheduler is used.
optimizer = optim.Adam(model.parameters(), lr=0.01)
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights, dropout masks and batch order (and therefore the losses printed
# below) presumably differ from run to run -- consider seeding torch.
train_part34(model, optimizer, epochs = 20)


Iteration 0, loss = 0.6921
Iteration 0, loss = 0.6902
Iteration 0, loss = 0.8683
Iteration 0, loss = 0.7305
Iteration 0, loss = 0.6703
Iteration 0, loss = 0.6261
Iteration 0, loss = 0.5144
Iteration 1, loss = 0.5991
Iteration 1, loss = 0.5107
Iteration 1, loss = 0.5712
Iteration 1, loss = 0.4067
Iteration 1, loss = 0.6026
Iteration 1, loss = 0.5231
Iteration 1, loss = 0.5346
Iteration 2, loss = 0.4555
Iteration 2, loss = 0.5263
Iteration 2, loss = 0.4441
Iteration 2, loss = 0.5994
Iteration 2, loss = 0.4538
Iteration 2, loss = 0.5415
Iteration 2, loss = 0.4855
Iteration 3, loss = 0.4552
Iteration 3, loss = 0.5680
Iteration 3, loss = 0.4143
Iteration 3, loss = 0.4618
Iteration 3, loss = 0.4579
Iteration 3, loss = 0.4654
Iteration 3, loss = 0.3881
Iteration 4, loss = 0.3814
Iteration 4, loss = 0.3554
Iteration 4, loss = 0.4721
Iteration 4, loss = 0.5502
Iteration 4, loss = 0.3796
Iteration 4, loss = 0.4187
Iteration 4, loss = 0.5285
Iteration 5, loss = 0.3585
Iteration 5, loss = 0.3889
I

In [10]:
# Evaluate top-1 retrieval on the per-query candidate sets built in the
# data-preparation cell.
# Reuse the `device` chosen in the setup cell so the USE_GPU switch is honoured
# here as well, instead of re-deriving it from CUDA availability alone.
print(device)
# Move the model to the appropriate device
model.to(device)
model.eval()  # set model to evaluation mode (disables dropout)
eval_loose = 0  # prediction is anywhere in the query's top-5 list
eval_tight = 0  # prediction is exactly the query's top-1 image
N = len(X_image_eval)
with torch.no_grad():
    for i in range(N):
        # Candidates are stored as (num_images, 1, embedding_dim). Squeeze only
        # the singleton axis; a bare .squeeze() would also drop the candidate
        # axis for a query with a single image and break the batched forward.
        X_image = torch.tensor(X_image_eval[i]).squeeze(1).to(device)
        X_query_eval = torch.tensor(X_text_embed_eval[i]).to(device)
        top5 = y_eval[i]
        probs = model(X_image, X_query_eval).cpu()
        top_pred = probs.numpy()
        pred_idx = np.argmax(top_pred)
        if pred_idx in top5:
            eval_loose += 1
        if pred_idx == top5[0]:
            eval_tight += 1

# Guard against an empty evaluation set instead of raising ZeroDivisionError.
tight_accuracy = float(eval_tight) / N if N else float('nan')
loose_accuracy = float(eval_loose) / N if N else float('nan')
print(tight_accuracy, loose_accuracy)



cuda
0.115 0.395


## Connect to Retriever

In [11]:
import json
from ast import literal_eval
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# SECURITY: a commented-out Hugging Face login that embedded a literal access
# token used to live here. It has been removed; revoke that token, and if the
# login is needed again read the token from the environment instead.

# Number of best-scoring images handed to the language model.
TOP_K = 5

# Query used for the retrieval demo (a key of `unwrapped_data`).
prompt = ('what is the Alex desk', 1)
entry = unwrapped_data[prompt]
candidate_embeddings = entry["image_embeddings_all"]

# Score this prompt's own candidate images. Previously the scores came from
# evaluation sample 0 while the embeddings were looked up under the hard-coded
# prompt, which only agree when that prompt happens to be sample 0.
# squeeze(1) removes just the per-image singleton axis, so a prompt with a
# single candidate image still yields a 2-D batch.
X_image = torch.tensor(candidate_embeddings).squeeze(1).to(device)
X_query_eval = torch.tensor([entry["text_embedding"]] * len(candidate_embeddings)).to(device)
model.eval()  # make sure dropout is off even if this cell is run right after training
with torch.no_grad():
    probs = model(X_image, X_query_eval).cpu()
top_pred = probs.numpy()
print(top_pred.shape)
# argsort is ascending: keep the last TOP_K entries and reverse for best-first.
top_5_indices = np.argsort(top_pred)[-TOP_K:][::-1]

print(top_5_indices)
# Retrieve the image embeddings corresponding to the top predictions
retrieved_image_embeddings = [candidate_embeddings[idx] for idx in top_5_indices]
print(([len(x) for x in retrieved_image_embeddings]))
print("Successfully retrieved image embeddings")


(20,)
[1 2 3 9 6]
[1, 1, 1, 1, 1]
Successfully retrieved image embeddings


In [53]:
def generate_and_tokenize_prompt(prompt, retrieved_image_embeddings):
    """Build, print and return the text prompt sent to the language model.

    - prompt: sequence whose first element is the question text; a "?" is
      appended to it.
    - retrieved_image_embeddings: sequence of image embeddings; only the first
      one is written into the prompt (as the `str()` of a one-element slice).

    Despite its name, this function does no tokenization: it returns the plain
    prompt string.
    """
    question = prompt[0] + "?"
    first_embedding = str(retrieved_image_embeddings[:1])
    sections = [
        "Given a target question and a set of image embeddings, representing relevant images related to the question, answer the question in as much detail as possible.",
        "### Target question:",
        question,
        "# Image embeddings:",
        first_embedding,
        "",  # trailing empty section -> the prompt ends with a newline
    ]
    full_prompt = "\n".join(sections)
    print(full_prompt)
    return full_prompt

In [16]:
%pip install scale-llm-engine


Collecting scale-llm-engine
  Downloading scale_llm_engine-0.0.0b33-py3-none-any.whl (26 kB)
Installing collected packages: scale-llm-engine
Successfully installed scale-llm-engine-0.0.0b33


In [22]:
import os
from getpass import getpass

# Never store the key in the notebook. Keep a key that is already present in
# the environment (the old cell overwrote it with an empty string) and
# otherwise ask for it interactively without echoing it.
if not os.environ.get("SCALE_API_KEY"):
    os.environ["SCALE_API_KEY"] = getpass("SCALE_API_KEY: ")



In [54]:
from llmengine import Completion
# Build the prompt from the demo query and the retrieved image embeddings.
# generate_and_tokenize_prompt also prints the prompt, which is why the full
# prompt text appears in this cell's output before the answer.
full_prompt = generate_and_tokenize_prompt(prompt, retrieved_image_embeddings)
# Ask the hosted model for a completion of that prompt.
# NOTE(review): the prompt embeds raw embedding floats as text -- confirm that
# a text-only model is expected to make use of them.
response = Completion.create(
    model="mixtral-8x7b",
    prompt=full_prompt,
    max_new_tokens=1000,
    temperature=0.2,
)
print("---------ANSWER---------")
print(response.output.text)

Given a target question and a set of image embeddings, representing relevant images related to the question, answer the question in as much detail as possible.
### Target question:
what is the Alex desk?
# Image embeddings:
[[[-0.026401400566101074, -0.005056341644376516, 0.02107362262904644, -0.02604062482714653, 0.022420773282647133, 0.009395401924848557, 0.02781788446009159, 0.03330061584711075, -0.03486843407154083, -0.027320774272084236, 0.0016439397586509585, -0.019156916067004204, 0.08204422891139984, 0.0014538150280714035, 0.029461843892931938, 0.0272693894803524, -0.04049578681588173, 0.03573795408010483, -0.07547422498464584, 0.00340963713824749, -0.0959882140159607, 0.003399668727070093, 0.05353952571749687, 0.01446089893579483, -0.06419672816991806, 0.012848060578107834, 0.006372035481035709, -0.01642022281885147, -0.01827085390686989, 0.04628729447722435, -0.011242721229791641, 0.036169618368148804, 0.02607705071568489, -0.018905164673924446, -0.017777783796191216, -0.0197

In [47]:
import json

# response.json() yields the response serialized as a JSON string; decode it
# into plain Python containers in one step so `res` only ever holds the parsed
# object.
res = json.loads(response.json())

In [48]:
# Show the generated text stored under output -> text of the parsed response.
res["output"]["text"]

''