## Data Processing and Vector Generation using FiftyOne and Pinecone

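This notebook generates image embeddings for the COCO 2017 validation split using CLIP, stores them in a Pinecone index, and then queries that index with text and image embeddings. Both the dataset and the CLIP model are loaded from the FiftyOne zoo. Execute each cell in order, and create a config.py file in your working directory containing your PINECONE_KEY. Make sure your Pinecone account has room for a new index, because the notebook creates one.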



In [2]:
import fiftyone.zoo as foz
import pinecone
import numpy as np
from pkg_resources import packaging
import torch
from tqdm.autonotebook import tqdm


#from config import PINECONE_KEY

## Loading the data and the model

In [66]:
# COCO-2017 validation split from the FiftyOne dataset zoo
dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="validation",
)

# CLIP (ViT-B/32, torch) from the FiftyOne model zoo
model = foz.load_zoo_model(
    "clip-vit-base32-torch",
)

Downloading split 'validation' to 'C:\Users\Dell\fiftyone\coco-2017\validation' if necessary
Found annotations at 'C:\Users\Dell\fiftyone\coco-2017\raw\instances_val2017.json'
Images already downloaded
Existing download of split 'validation' is sufficient
Loading existing dataset 'coco-2017-validation'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use


## Generate embeddings

In [5]:
# Run on the GPU when torch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Token tensors use torch.long on torch < 1.8.0 and torch.int from 1.8.0 onwards
dtype = (
    torch.long
    if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0")
    else torch.int
)

In [6]:
# Generate one CLIP embedding per image and store it in the "embedding" field.
# This is the slow step of the notebook, so it is skipped when the field is
# already present (e.g. when a previously persisted dataset is loaded again).
# NOTE: delete the "embedding" field (or the dataset) first if you change the
# model, otherwise the stored embeddings are reused as-is.
if not dataset.has_sample_field("embedding"):
    dataset.compute_embeddings(
        model,
        embeddings_field="embedding",
    )

 100% |███████████████| 5000/5000 [15.4m elapsed, 0s remaining, 6.0 samples/s]      


In [7]:
# Mark the dataset as persistent so that it (including the "embedding" field
# computed above) is kept on this machine instead of being discarded.
dataset.persistent = True

## Initializing the pinecone index and upserting the vectors

In [8]:
# initialize pinecone client
# Older `pinecone.init` style call, kept for reference only; the client is
# created with `Pinecone(...)` in the next cell. The API key that used to be
# written here has been redacted - never commit real keys, and rotate any key
# that was already shared.
#pinecone.init(api_key="<PINECONE_API_KEY>", environment="us-east4-gcp")

In [13]:
import os

from pinecone import Pinecone, ServerlessSpec
from tqdm.autonotebook import tqdm

# SECURITY: the API key must never be hard-coded in the notebook. It is read
# from the PINECONE_API_KEY environment variable; if that is not set, it falls
# back to PINECONE_KEY from the local config.py described in the introduction.
# The key that was previously committed in this cell should be treated as
# leaked and rotated in the Pinecone console.
PINECONE_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_KEY:
    from config import PINECONE_KEY

pc = Pinecone(api_key=PINECONE_KEY)

In [30]:

# Name of the Pinecone index that stores the image embeddings
index_name = "clip-image-search"

# Create the index only if it does not exist yet, so that this cell can be
# re-run (Restart & Run All) without failing on an already existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=512,  # must match the length of the vectors upserted below
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# initialize index
index = pc.Index(index_name)

In [31]:
# Pinecone takes plain Python lists, so unpack every stored numpy embedding
embeddings = list(map(lambda vec: vec.tolist(), dataset.values("embedding")))

# Each vector id is the sample's local file path.
# Alternative (disabled): build the public COCO URL instead, i.e.
# "http://images.cocodataset.org/val2017/" + <file name of the sample>
ids = list(dataset.values("filepath"))

In [32]:
# pair every id with its embedding: one (id, embedding) tuple per sample
index_vectors = [
    (vector_id, vector_values)
    for vector_id, vector_values in zip(ids, embeddings)
]

# upsert vectors in batches (100 per request by default)
def upsert_vectors(index, vectors, batch_size=100):
    """Upsert ``vectors`` into ``index`` in consecutive batches.

    Args:
        index: object exposing an ``upsert(vectors)`` method (the Pinecone index).
        vectors: sequence of ``(id, embedding)`` tuples to upload.
        batch_size: maximum number of vectors sent per ``upsert`` call.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Slice the *argument*, not a notebook-level variable, so the function
    # uploads exactly what the caller passed in.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])

# upload every (id, embedding) pair built above to the Pinecone index
upsert_vectors(index, index_vectors)

## Testing the image search

In [33]:
def get_text_embedding(prompt, clip_model):
    """Encode a text prompt with CLIP and return the result as nested lists.

    Relies on the notebook-level ``dtype`` and ``device`` variables for the
    token tensor.

    Args:
        prompt: text to embed.
        clip_model: object exposing ``_tokenizer``, ``config.context_length``
            and ``_model.encode_text`` (the FiftyOne zoo CLIP model is passed
            in this notebook).

    Returns:
        The output of ``encode_text`` converted with ``.tolist()``; a single
        prompt is encoded, so the outer list holds one embedding.

    Raises:
        RuntimeError: if the tokenized prompt (including the start/end tokens)
            is longer than the model's context length.
    """
    tokenizer = clip_model._tokenizer
    context_length = clip_model.config.context_length

    # wrap the prompt tokens in the standard start-of-text / end-of-text tokens
    sot_token = tokenizer.encoder["<|startoftext|>"]
    eot_token = tokenizer.encoder["<|endoftext|>"]
    tokens = [sot_token] + tokenizer.encode(prompt) + [eot_token]

    # fail with a clear message instead of an opaque tensor shape error
    if len(tokens) > context_length:
        raise RuntimeError(
            f"Prompt is too long: {len(tokens)} tokens, "
            f"but the context length is {context_length}"
        )

    # zero-padded token tensor with one row for the single prompt
    text_features = torch.zeros(
        1,
        context_length,
        dtype=dtype,
        device=device,
    )
    text_features[0, : len(tokens)] = torch.tensor(tokens, dtype=dtype, device=device)

    # inference only: no autograd graph is needed for the embedding
    with torch.no_grad():
        embedding = clip_model._model.encode_text(text_features)

    # convert to list for Pinecone
    return embedding.tolist()

In [54]:
# NOTE(review): debugging leftover. `query_vector` is first assigned in the
# next cell, so on a fresh top-to-bottom run this cell cannot succeed;
# consider removing it.
query_vector

AttributeError: 'list' object has no attribute 'shape'

In [34]:
# Text-to-image search: embed the prompt and fetch the 10 closest image vectors
prompt = "a smile"
query_vector = get_text_embedding(prompt, model)

# only ids and scores are needed, so the stored vector values are not returned
top_k_samples = index.query(vector=query_vector, top_k=10, include_values=False)

top_k_samples

{'matches': [{'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000019924.jpg',
              'score': 0.257860392,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000167067.jpg',
              'score': 0.240282223,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000139260.jpg',
              'score': 0.237064257,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000474854.jpg',
              'score': 0.235772446,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000022705.jpg',
              'score': 0.235523269,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000402096.jpg',
              'score': 0.235292241,
              'values': []},
             {'id': 'C:\\Users\\De

In [35]:
import os
import faiss
import torch
import skimage
import requests
import pinecone
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
import IPython.display
import matplotlib.pyplot as plt
from datasets import load_dataset
from collections import OrderedDict
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

In [36]:
def get_image(image_URL, timeout=30):
    """Download an image and return it as an RGB PIL image.

    Args:
        image_URL: URL of the image to fetch.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(image_URL, timeout=timeout)
    # fail here on 4xx/5xx instead of trying to decode an error page as an image
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")

In [38]:
def get_model_info(model_ID, device):
    """Load the CLIP model, processor and tokenizer for ``model_ID``.

    Args:
        model_ID: identifier passed to each ``from_pretrained`` call.
        device: device the model is moved to.

    Returns:
        A ``(model, processor, tokenizer)`` tuple.
    """
    return (
        # model, placed on the requested device
        CLIPModel.from_pretrained(model_ID).to(device),
        # matching processor
        CLIPProcessor.from_pretrained(model_ID),
        # matching tokenizer
        CLIPTokenizer.from_pretrained(model_ID),
    )

In [39]:
# Hugging Face checkpoint to load
model_ID = "openai/clip-vit-base-patch32"

# Use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# FiftyOne zoo model; cells that pass `model` to get_text_embedding will get
# the Hugging Face model instead if they are re-run after this point.
model, processor, tokenizer = get_model_info(model_ID, device)



In [40]:
# Inspect the loaded objects (the model repr below is long)
model, processor, tokenizer

(CLIPModel(
   (text_model): CLIPTextTransformer(
     (embeddings): CLIPTextEmbeddings(
       (token_embedding): Embedding(49408, 512)
       (position_embedding): Embedding(77, 512)
     )
     (encoder): CLIPEncoder(
       (layers): ModuleList(
         (0-11): 12 x CLIPEncoderLayer(
           (self_attn): CLIPAttention(
             (k_proj): Linear(in_features=512, out_features=512, bias=True)
             (v_proj): Linear(in_features=512, out_features=512, bias=True)
             (q_proj): Linear(in_features=512, out_features=512, bias=True)
             (out_proj): Linear(in_features=512, out_features=512, bias=True)
           )
           (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
           (mlp): CLIPMLP(
             (activation_fn): QuickGELUActivation()
             (fc1): Linear(in_features=512, out_features=2048, bias=True)
             (fc2): Linear(in_features=2048, out_features=512, bias=True)
           )
           (layer_norm2): LayerN

In [42]:
# NOTE(review): get_image is already defined in an earlier cell of this
# notebook; this cell re-defines it, so one of the two cells can be removed.
def get_image(image_URL, timeout=30):
    """Download an image and return it as an RGB PIL image.

    Args:
        image_URL: URL of the image to fetch.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(image_URL, timeout=timeout)
    # fail here on 4xx/5xx instead of trying to decode an error page as an image
    response.raise_for_status()
    return Image.open(BytesIO(response.content)).convert("RGB")

In [58]:
# Query image for the image-to-image search, fetched from Wikimedia Commons
image_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/640px-Cat03.jpg'
image_test = get_image(image_URL=image_url)

In [59]:
def get_single_image_embedding(my_image, processor, model, device):
    """Embed a single image with CLIP and return the result as a numpy array.

    Args:
        my_image: image handed to ``processor`` as ``images``.
        processor: callable returning a mapping with a ``"pixel_values"`` tensor.
        model: object exposing ``get_image_features``.
        device: device the pixel tensor is moved to before encoding.

    Returns:
        The output of ``model.get_image_features`` as a numpy array on the CPU.
    """
    pixel_values = processor(
        text=None,
        images=my_image,
        return_tensors="pt",
    )["pixel_values"].to(device)

    # inference only: skip building the autograd graph
    with torch.no_grad():
        embedding = model.get_image_features(pixel_values)

    # convert the embeddings to numpy array
    return embedding.cpu().detach().numpy()

In [60]:
# CLIP embedding of the query image (numpy array)
image_vector_embeddings = get_single_image_embedding(
    my_image=image_test,
    processor=processor,
    model=model,
    device=device,
)

In [61]:
# Convert to plain lists for Pinecone. Going through np.asarray keeps this
# cell idempotent: running it a second time (when the variable is already a
# list) no longer fails with "'list' object has no attribute 'tolist'".
image_vector_embeddings = np.asarray(image_vector_embeddings).tolist()


In [62]:
# Image-to-image search: fetch the 10 stored vectors closest to the query image;
# only ids and scores are needed, so the vector values are not returned
top_k_samples = index.query(vector=image_vector_embeddings, top_k=10, include_values=False)
print(top_k_samples)

{'matches': [{'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000209747.jpg',
              'score': 0.796100795,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000213445.jpg',
              'score': 0.786400259,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000491757.jpg',
              'score': 0.777820408,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000524280.jpg',
              'score': 0.775030613,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000574810.jpg',
              'score': 0.772797406,
              'values': []},
             {'id': 'C:\\Users\\Dell\\fiftyone\\coco-2017\\validation\\data\\000000291490.jpg',
              'score': 0.770050347,
              'values': []},
             {'id': 'C:\\Users\\De