In [1]:
from fashion_clip.fashion_clip import FashionCLIP
from PIL import Image
import json
import os
from tqdm import tqdm
import pickle
import torch
torch.backends.cudnn.benchmark = True
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


### Import data

In [2]:
# Image.open('polyvore/images/214181831/2.jpg')

In [3]:
# with open("polyvore/train_no_dup.json", "r") as f:
#     all_sets = json.load(f)

# # Filter and collect valid (image_path, description) pairs
# valid_items = []
# for outfit in all_sets:
#     set_id = outfit.get("set_id")
#     for item in outfit.get("items", []):
#         name = item.get("name", "").strip().lower()
#         if not name or name == 'polyvore':
#             continue
#         index = item["index"]
#         local_path = f"polyvore/images/{set_id}/{index}.jpg"
#         if os.path.exists(local_path):  # only if image exists locally
#             valid_items.append((local_path, item["name"]))

In [4]:
# len(valid_items)

In [5]:
# # store valid_items because building it took a long time to run
# with open("valid_items.pkl", "wb") as f:
#     pickle.dump(valid_items, f)

In [6]:
# Restore the cached list of (image_path, description) pairs instead of
# rebuilding it from the dataset. Pickle can execute arbitrary code on load,
# so this file should only ever be one written by this notebook.
with open("valid_items.pkl", mode="rb") as cache_file:
    valid_items = pickle.load(cache_file)

In [7]:
def load_image(path):
    """Open the image at ``path`` and return it converted to RGB.

    This is a best-effort loader: any ordinary error raised while opening or
    converting the file yields ``None`` instead of an exception, so callers
    can filter out unreadable images.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.

    Returns:
        The RGB-converted image, or ``None`` if loading failed.
    """
    try:
        # The context manager releases the underlying file as soon as the
        # converted image has been produced.
        with Image.open(path) as img:
            return img.convert("RGB")
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running job can be stopped.
        return None

In [8]:
# Encode every (image_path, description) pair in `valid_items` with
# FashionCLIP, one batch at a time. Per-batch results are collected in
# `image_embeddings` / `text_embeddings`; `embedded_items` records, in the
# same order, the pairs that were actually encoded.
batch_size = 1024
image_embeddings = []
text_embeddings = []
# Items whose image could not be loaded are skipped below, so the embedding
# rows do not line up with `valid_items` by position. Keep the surviving
# pairs so every row can be traced back to its source item.
embedded_items = []

fclip = FashionCLIP("fashion-clip")
# NOTE(review): this only assigns an attribute. Whether FashionCLIP moves the
# model/inputs based on it, and how it behaves on a machine without a GPU,
# is not visible here -- confirm against the library.
fclip.device = "cuda"

# One thread pool for the whole run instead of creating and tearing down a
# new pool for every batch.
with ThreadPoolExecutor(max_workers=8) as executor:
    for i in tqdm(range(0, len(valid_items), batch_size)):
        batch = valid_items[i:i + batch_size]
        image_paths = [img_path for img_path, _ in batch]

        # Load images in parallel; failed loads come back as None.
        images = list(executor.map(load_image, image_paths))

        # Drop failed loads while keeping each image paired with its item.
        filtered_batch = [
            (img, item) for img, item in zip(images, batch) if img is not None
        ]
        if not filtered_batch:
            continue

        images, kept_items = zip(*filtered_batch)
        descriptions = [desc for _, desc in kept_items]

        # Encode with FashionCLIP. The whole filtered batch is passed as a
        # single inner batch for both modalities.
        img_emb = fclip.encode_images(list(images), batch_size=len(images))
        txt_emb = fclip.encode_text(descriptions, batch_size=len(images))

        image_embeddings.append(img_emb)
        text_embeddings.append(txt_emb)
        embedded_items.extend(kept_items)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
100%|██████████| 1/1 [00:08<00:00,  8.37s/it]
Map: 100%|██████████| 1024/1024 [00:00<00:00, 21380.02 examples/s]
100%|██████████| 1/1 [00:02<00:00,  2.55s/it]
100%|██████████| 1/1 [00:07<00:00,  7.29s/it]/it]
Map: 100%|██████████| 1024/1024 [00:00<00:00, 24433.63 examples/s]
100%|██████████| 1/1 [00:02<00:00,  2.41s/it]
100%|██████████| 1/1 [00:08<00:00,  8.37s/it]/it]
Map: 100%|██████████| 1024/1024 [00:00<00:00, 21542.37 examples/s]
100%|██████████| 1/1 [00:00<00:00, -2.30it/s]
100%|██████████| 1/1 [00:04<00:00,  4.34s/it]/it]
Map: 100%|██████████| 1024/1024 [00:00<00:00, 23769.02 examples/s]
100%|██████████| 1/1 [00:02<00:00,  2.51s/it]
100%|██████████| 1/1 [00:06<

In [15]:
# Sanity check: number of collected batches and the shape of the first batch,
# image embeddings first, then text embeddings.
for emb_batches in (image_embeddings, text_embeddings):
    print(len(emb_batches), emb_batches[0].shape)

111 (1024, 512)
111 (1024, 512)


In [20]:
# Merge the per-batch NumPy arrays into one tensor per modality and save them.
#
# This cell rebinds `image_embeddings` / `text_embeddings` from lists of
# arrays to single tensors. The `torch.is_tensor` guards make a second run of
# the cell a no-op for the conversion step; without them `torch.from_numpy`
# would be handed tensor rows and fail.
if not torch.is_tensor(image_embeddings):
    image_emb_torch = [torch.from_numpy(emb) for emb in image_embeddings]
    image_embeddings = torch.cat(image_emb_torch, dim=0)

if not torch.is_tensor(text_embeddings):
    text_emb_torch = [torch.from_numpy(emb) for emb in text_embeddings]
    text_embeddings = torch.cat(text_emb_torch, dim=0)

# Row k of each tensor is meant to describe the same item; refuse to write
# files that are already out of step.
assert image_embeddings.shape[0] == text_embeddings.shape[0], (
    "image and text embeddings have different numbers of rows"
)

torch.save(image_embeddings, "fashionclip_polyvore_image_embeddings.pt")
torch.save(text_embeddings, "fashionclip_polyvore_text_embeddings.pt")