In [2]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')
DIRECTORY = "/content/drive/My Drive/cs231n/final project"
%cd $DIRECTORY

Mounted at /content/drive
/content/drive/My Drive/cs231n/final project


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

import json

import numpy as np

USE_GPU = True
dtype = torch.float32 # We will be using float throughout this tutorial.

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss.
print_every = 100
print('using device:', device)

using device: cpu


In [84]:
imageEmbeddingSize, queryTextEmbeddingSize = 512, 384

In [85]:
from ast import literal_eval

# Load in data and unwrap it
# Function to convert stringified tuple keys back to tuples
def unwrap_keys(mapping):
    return {literal_eval(k): v for k, v in mapping.items()}

# Load the JSON file
with open('data_augmentation/embeddings/clip_embeddings.json', 'r') as json_file:
    data_from_json = json.load(json_file)

# print(data_from_json)
# Unwrap the keys to their original tuple format
unwrapped_data = unwrap_keys(data_from_json)

In [86]:
text_key = "text_embedding"
image_key = "image_embeddings_all"
score_key = "scores"
image_embeddings_top_5 = "image_embeddings_top5_idx"
image_embeddings = []
text_embeddings = []
y_output = []
X_image_eval = []
X_text_embed_eval = []
y_eval = []
prompts = []
for key, sub_dataset in unwrapped_data.items():
  # pprint.pp(sub_dataset)
  text_embedding = sub_dataset[text_key]
  image_embedding = sub_dataset[image_key]
  # print(len(image_embedding))
  scores = sub_dataset[score_key]
  top5 = sub_dataset[image_embeddings_top_5]
  # print(len(image_embedding), len(text_embedding), len(scores))
  # if len(image_embedding) != len(scores):
  #   continue
  # print(top5, len(image_embedding))
  # print(key)
  for i in range(1):
    idx = top5[i]
    # print(top5, len(image_embedding))
    if idx >= len(image_embedding):
      # print(top5, len(image_embedding))
      print(key)
      continue
    image_embeddings.append(image_embedding[idx])
    text_embeddings.append(text_embedding)
    y_output.append(scores[i])
  if top5[0] == 0:
    continue
  X_image_eval.append(image_embedding)
  X_text_embed_eval.append([text_embedding] * len(image_embedding))
  y_eval.append(top5)
  prompts.append(key)

# print(y_output)
N = len(image_embeddings)
print(len(image_embeddings), len(text_embeddings), len(y_output))
# print(y_output)


224 224 224


In [88]:
X_images = torch.tensor(image_embeddings).squeeze()
X_queries = torch.tensor(text_embeddings)
y = torch.tensor(y_output)
print(X_images.shape, X_queries.shape, y.shape)
dataset = TensorDataset(X_images, X_queries, y)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# print(X_images.shape)


torch.Size([224, 512]) torch.Size([224, 384]) torch.Size([224])


In [89]:
class EmbeddingProjectionNN(nn.Module):
    def __init__(self, embeddingSize):
      super().__init__()
      self.linear_relu_stack = nn.Sequential(
          nn.Linear(embeddingSize, 512),
          nn.ReLU(),
          nn.Dropout(),
          nn.Linear(512, 512),
          nn.LayerNorm(512),
          nn.ReLU(),
          nn.Dropout(),
          nn.Linear(512, 256),
          nn.Tanh()
      )

    def forward(self, x):
      # input (N, E)
      projection = self.linear_relu_stack(x)
      return projection

class RetrieverNN(nn.Module):
  def __init__(self, imageEmbedding, queryEmbedding):
    super().__init__()
    self.imageProj = EmbeddingProjectionNN(imageEmbedding)
    self.queryProj = EmbeddingProjectionNN(queryEmbedding)
    self.similarity = nn.CosineSimilarity(dim=1)


  def forward(self, images, query):
    imageProj = self.imageProj(images)
    queryProj = self.queryProj(query)
    similarities = self.similarity(imageProj, queryProj)
    # make similarities a prob scores between 0 and 1
    scores = similarities * 0.5 + 0.5
    return scores



In [90]:
loss = nn.BCELoss()
def train_part34(model, optimizer, epochs=10, scheduler = None):
    """
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for

    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        for imageEmbeddings, queryEmbedding, y in dataloader:
          # (B, E1), (B, E2), (B,)
          model.train()  # put model to training mode
          imageEmbeddings = imageEmbeddings.to(device=device, dtype=dtype)  # move to device, e.g. GPU
          queryEmbedding = queryEmbedding.to(device=device, dtype=dtype)
          y = y.to(device=device, dtype=dtype)

          scores = model(imageEmbeddings, queryEmbedding)
          # print(scores)
          # print(y)
          output = loss(scores, y)
          # print(scores, y)

          # Zero out all of the gradients for the variables which the optimizer
          # will update.
          optimizer.zero_grad()

          # This is the backwards pass: compute the gradient of the loss with
          # respect to each  parameter of the model.
          output.backward()

          # Actually update the parameters of the model using the gradients
          # computed by the backwards pass.
          optimizer.step()
          if scheduler:
            scheduler.step()
          print('Iteration %d, loss = %.4f' % (e, output.item()))




In [93]:

model = RetrieverNN(imageEmbeddingSize, queryTextEmbeddingSize)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_part34(model, optimizer, epochs = 20)


Iteration 0, loss = 0.7039
Iteration 0, loss = 0.7194
Iteration 0, loss = 0.8504
Iteration 0, loss = 0.6669
Iteration 0, loss = 0.9827
Iteration 0, loss = 0.6768
Iteration 0, loss = 0.7512
Iteration 1, loss = 0.6793
Iteration 1, loss = 0.6346
Iteration 1, loss = 0.6451
Iteration 1, loss = 0.6103
Iteration 1, loss = 0.5618
Iteration 1, loss = 0.5766
Iteration 1, loss = 0.5602
Iteration 2, loss = 0.4614
Iteration 2, loss = 0.5956
Iteration 2, loss = 0.5109
Iteration 2, loss = 0.5378
Iteration 2, loss = 0.5843
Iteration 2, loss = 0.5393
Iteration 2, loss = 0.5371
Iteration 3, loss = 0.5448
Iteration 3, loss = 0.5375
Iteration 3, loss = 0.5944
Iteration 3, loss = 0.5614
Iteration 3, loss = 0.4577
Iteration 3, loss = 0.4617
Iteration 3, loss = 0.4943
Iteration 4, loss = 0.5357
Iteration 4, loss = 0.4877
Iteration 4, loss = 0.5172
Iteration 4, loss = 0.4461
Iteration 4, loss = 0.5394
Iteration 4, loss = 0.5378
Iteration 4, loss = 0.5534
Iteration 5, loss = 0.5018
Iteration 5, loss = 0.5575
I

In [97]:
model.eval()  # set model to evaluation mode
eval_loose = 0
eval_tight = 0
N = len(X_image_eval)
with torch.no_grad():
  for i in range(N):
    X_image = torch.tensor(X_image_eval[i]).squeeze()
    # print(X_image.shape)
    X_query_eval = torch.tensor(X_text_embed_eval[i])
    # print(X_query_eval.shape)
    top5 = y_eval[i]
    probs = model(X_image, X_query_eval)
    top_pred = probs.detach().numpy().squeeze()
    # print(prompts[i])
    print(top_pred.shape)
    pred_idx = np.argmax(top_pred)
    if pred_idx in top5:
      eval_loose += 1
    if pred_idx == top5[0]:
      eval_tight += 1
print(float(eval_tight)/ N, float(eval_loose)/N)



(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(24,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(16,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,)
(12,

## Connect to Retriever 

In [98]:
import json
from ast import literal_eval
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# from huggingface_hub import login

# login("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")
# Simulated model output
# probs = model(X_image, X_query_eval)
# top_pred = probs.detach().numpy().squeeze()
# For the sake of this example, let's simulate top_pred
X_image = torch.tensor(X_image_eval[0]).squeeze()
X_query_eval = torch.tensor(X_text_embed_eval[0])
probs = model(X_image, X_query_eval)
top_pred = probs.detach().numpy().squeeze()
print(top_pred.shape)
top_5_indices = np.argsort(top_pred)[-5:][::-1]

print(top_5_indices)
prompt = ('what is the Alex desk', 1)
# Retrieve the image embeddings corresponding to the top predictions
retrieved_image_embeddings = [unwrapped_data[prompt]["image_embeddings_all"][idx] for idx in top_5_indices]
print(([len(x) for x in retrieved_image_embeddings]))
print("Successfully retrieved image embeddings")


(20,)
[1 2 6 3 8]
[1, 1, 1, 1, 1]
Successfully retrieved image embeddings


In [112]:
def generate_and_tokenize_prompt(prompt, retrieved_image_embeddings):
    full_prompt =f"""Given a target sentence and a set of image embeddings, representing relevant images related to the sentence, answer the question in the sentence.
### Target sentence:
{prompt}
"""
### Image embeddings:
# {str(retrieved_image_embeddings[:2])}
# """
    # print(full_prompt)
    return tokenizer(full_prompt, return_tensors="pt")

In [103]:
# Load the generator model (GPT-Neo in this example) and tokenizer

model_name = "microsoft/Phi-3-mini-128k-instruct"  # You can choose a different model from Hugging Face
generator_model = AutoModelForCausalLM.from_pretrained(model_name, token="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",resume_download=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, token="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",resume_download=True)
print("Loaded Generator and Tokenizer")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded Generator and Tokenizer


In [None]:
from transformers import pipeline
messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=generator_model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

You are not running the flash-attention implementation, expect numerical differences.


In [113]:
# Prepare the context input for the generator model
# Tokenize the input context
input_ids = generate_and_tokenize_prompt(prompt, retrieved_image_embeddings)
print(len(input_ids))
# Generate the output using the context
output = generator_model.generate(input_ids, max_length=50)

# Decode the generated output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


2


AttributeError: 