In [2]:
import torch 
import torchvision.transforms as T
from transformers import AutoFeatureExtractor, AutoModel
from PIL import Image
import numpy as np

In [10]:
model_ckpt = "google/vit-base-patch16-224-in21k"
model = AutoModel.from_pretrained(model_ckpt)
hidden_dim = model.config.hidden_size

In [32]:
#prepare the image
transformation_chain = T.Compose(
    [
        # resize and tensorize
        T.Resize((224,224)),
        T.ToTensor()
    ]
)

In [33]:
def extract_embeddings(model: torch.nn.Module, images):
    device = model.device


    # `transformation_chain` is a compostion of preprocessing
    # transformations we apply to the input images to prepare them
    # for the model. For more details, check out the accompanying Colab Notebook.
    image_batch_transformed = torch.stack(
        [transformation_chain(image) for image in images]
    )
    new_batch = image_batch_transformed.to(device)
    with torch.no_grad():
        embeddings = model(new_batch).last_hidden_state[:, 0].cpu()
    return embeddings


In [24]:
images = [Image.open("data/" + str(k) + ".jpg") for k in range(1,5)]
image1 = [Image.open("data/doritos.png").convert("RGB")]

In [64]:
mdev = model.to("cuda")
embeddings2 = extract_embeddings(mdev, images)
embeddings1 = extract_embeddings(mdev, image1)
embeddings = np.array([np.array(i) for i in list(embeddings2)])

In [71]:
import faiss
vector_dimension = embeddings.shape[1]
print(vector_dimension)
index = faiss.IndexFlatL2(vector_dimension)
faiss.normalize_L2(embeddings)
index.add(embeddings)

768


In [76]:
k = 2
_vector = np.array(embeddings1)
faiss.normalize_L2(_vector)
distances, ann = index.search(_vector, k=k)
print(distances)
print(ann)

[[0.84335566 0.9449022 ]]
[[0 2]]


In [78]:
def find_efficient_cosine_similarity(source_representation, test_representation):
    a = np.dot(source_representation, test_representation)
    b = np.sqrt(source_representation.dot(source_representation))
    c = np.sqrt(test_representation.dot(test_representation))
    return 1 - (a / (b*c))
            

In [79]:
categories = ["Doritos", "Lays", "Cheetos", "Sun chips"]

csim = [find_efficient_cosine_similarity(i, embeddings1[0]) for i in embeddings]
print(csim)
print(categories[csim.index(min(csim))])

[tensor(0.4217), tensor(0.5853), tensor(0.4725), tensor(0.6342)]
Doritos
