In [1]:
from PIL import Image
import requests
import os
import json
import matplotlib.pyplot as plt
import torch
from tqdm.notebook import tqdm
import numpy as np

from transformers import CLIPProcessor, CLIPModel

In [2]:
data_dir = "data"
dataset_id = "20220329222400"

dataset_dir = os.path.join(data_dir, dataset_id)

In [3]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [4]:
def get_embeddings(text, images, batch_size=100):
    all_text_embeds = []
    all_image_embeds = []
    input_ids_list = []
    attention_mask_list = []
    pixel_values_list = []
    for i in tqdm(range(0, max(len(images), len(text)), batch_size)):
        text_batch = text[i:i+batch_size] if i+batch_size < len(text) else text[i:]
        image_batch = images[i:i+batch_size] if i+batch_size < len(images) else images[i:]

        inputs = processor(text=text_batch, images=image_batch, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"][:, :77]
        attention_mask = inputs["attention_mask"][:, :77]
        pixel_values = inputs["pixel_values"]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)
        # outputs = model(**inputs)
        text_embeds = outputs.text_embeds
        image_embeds = outputs.image_embeds
        # get as numpy array
        text_embeds = text_embeds.detach().cpu().numpy()
        image_embeds = image_embeds.detach().cpu().numpy()

        all_text_embeds.append(text_embeds)
        all_image_embeds.append(image_embeds)

        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        pixel_values_list.append(pixel_values)

    all_text_embeds = np.concatenate(all_text_embeds, axis=0)
    all_image_embeds = np.concatenate(all_image_embeds, axis=0)

    input_ids_list = np.concatenate(input_ids_list, axis=0)
    attention_mask_list = np.concatenate(attention_mask_list, axis=0)
    pixel_values_list = np.concatenate(pixel_values_list, axis=0)

    return all_text_embeds, all_image_embeds, (input_ids_list, attention_mask_list, pixel_values_list)

In [5]:
search_file = os.path.join(dataset_dir, "detailed.json")

# read search_file
with open(search_file, "r") as f:
    search = json.load(f)

texts = []
images = []
prices = []
ids = []

for post_id, post in tqdm(search.items()):
    title = post["name"]
    description = post["description"]
    text = f"{title}\n{description}"
    texts.append(text)

    image_file = os.path.join(dataset_dir, "images", post_id + ".jpg")
    image = Image.open(image_file)
    images.append(image)

    price = post["price"]
    prices.append(price)

    ids.append(post_id)

prices = np.array(prices)
ids = np.array(ids)

  0%|          | 0/995 [00:00<?, ?it/s]

In [6]:


text_embeds, image_embeds, inputs = get_embeddings(texts, images)
# save embeddings as text_embeddings.npy and image_embeddings.npy and prices and ids
np.save(os.path.join(dataset_dir, "text_embeds.npy"), text_embeds)
np.save(os.path.join(dataset_dir, "image_embeds.npy"), image_embeds)
np.save(os.path.join(dataset_dir, "prices.npy"), prices)
np.save(os.path.join(dataset_dir, "ids.npy"), ids)

input_ids, attention_mask, pixel_values = inputs
np.save(os.path.join(dataset_dir, "input_ids.npy"), input_ids)
np.save(os.path.join(dataset_dir, "attention_mask.npy"), attention_mask)
np.save(os.path.join(dataset_dir, "pixel_values.npy"), pixel_values)



  0%|          | 0/10 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (128 > 77). Running this sequence through the model will result in indexing errors
