In [1]:
import os
import random
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from io import BytesIO

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [9]:
# Names of the pretrained checkpoints used for the text and image encoders.
SENT_MODEL = "all-MiniLM-L6-v2"
CLIP_MODEL = "openai/clip-vit-base-patch32"

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [10]:
# Text encoder: SentenceTransformer is told the target device at construction.
sent_model = SentenceTransformer(SENT_MODEL, device=DEVICE)

# Image encoder: load the CLIP weights first, then move the module to DEVICE.
clip_model = CLIPModel.from_pretrained(CLIP_MODEL)
clip_model = clip_model.to(DEVICE)

# Preprocessor that turns PIL images into the tensors CLIP consumes.
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [11]:
import requests
def load_image_safe(url):
    """Download ``url`` and return it as an RGB PIL image, never raising on bad input.

    Args:
        url: Address of the image to fetch with ``requests.get`` (8 second timeout).

    Returns:
        The decoded image converted to RGB. If the request fails, the server
        answers with an HTTP error status, or the payload cannot be decoded,
        a blank white 224x224 RGB image is returned instead so the caller
        always gets one image per URL.
    """
    try:
        resp = requests.get(url, timeout=8)
        # Treat 4xx/5xx answers as failures explicitly instead of handing an
        # error page to the image decoder.
        resp.raise_for_status()
        img = Image.open(BytesIO(resp.content)).convert("RGB")
        return img
    except Exception:
        # `Exception` (not a bare `except:`) so Ctrl-C / kernel interrupt still
        # stops a long download loop instead of yielding a placeholder.
        return Image.new("RGB", (224, 224), color=(255, 255, 255))

In [12]:
def process_batch(batch_path, batch_id, text_batch_size=128, image_batch_size=32,
                  out_dir="embeddings"):
    """Embed one CSV batch with the text and image encoders and save both arrays.

    Reads ``batch_path`` with pandas, encodes the ``catalog_content`` column
    with ``sent_model`` and the images behind the ``image_link`` column with
    ``clip_model``, then writes ``text_batch{batch_id}.npy`` and
    ``img_batch{batch_id}.npy`` into ``out_dir``.

    Args:
        batch_path: Path of the CSV file to process.
        batch_id: Identifier used in the output file names and progress label.
        text_batch_size: Batch size passed to ``sent_model.encode``.
        image_batch_size: Number of images downloaded and sent through CLIP
            per step.
        out_dir: Directory the ``.npy`` files are written to; created if it
            does not exist.

    Raises:
        ValueError: If the CSV contains no rows. Raised before anything is
            written, so no half-finished batch is left on disk.
    """
    df = pd.read_csv(batch_path)
    print(f"\nProcessing {batch_path} ... ({len(df)} samples)")

    # An empty batch would otherwise fail later inside np.vstack with an
    # unrelated-looking message, after the text file had already been saved.
    if len(df) == 0:
        raise ValueError(f"{batch_path} contains no rows; nothing to embed")

    # Do not rely on another cell having created the output folder.
    os.makedirs(out_dir, exist_ok=True)

    # --- Text embeddings ---
    # astype(str) means missing values are embedded as their string form.
    texts = df['catalog_content'].astype(str).tolist()
    text_embs = sent_model.encode(
        texts,
        batch_size=text_batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    np.save(os.path.join(out_dir, f"text_batch{batch_id}.npy"), text_embs)

    # --- Image embeddings ---
    image_links = df['image_link']
    all_img_embs = []
    for i in tqdm(range(0, len(df), image_batch_size), desc=f"Batch {batch_id} CLIP"):
        # load_image_safe returns a placeholder on failure, so every row
        # contributes exactly one image and row order is preserved.
        imgs = [load_image_safe(u) for u in image_links.iloc[i:i + image_batch_size]]
        inputs = clip_processor(images=imgs, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            feats = clip_model.get_image_features(**inputs).cpu().numpy()
        # L2-normalize each row; the epsilon guards against division by zero.
        # Only the image embeddings are normalized here; the text embeddings
        # are saved exactly as returned by encode().
        feats = feats / (np.linalg.norm(feats, axis=1, keepdims=True) + 1e-8)
        all_img_embs.append(feats)
    img_embs = np.vstack(all_img_embs)
    np.save(os.path.join(out_dir, f"img_batch{batch_id}.npy"), img_embs)
    print(f"✅ Saved embeddings for batch {batch_id}")

In [13]:
# Create the folder the batch CSVs are read from (no-op if it already exists).
# Pure Python instead of a shell escape so the cell does not depend on a
# POSIX `mkdir` being available to the kernel.
os.makedirs("dataset", exist_ok=True)

In [14]:
# Create the folder the .npy embedding files are written to (no-op if it
# already exists). Pure Python instead of a shell escape so the cell does not
# depend on a POSIX `mkdir` being available to the kernel.
os.makedirs("embeddings", exist_ok=True)

In [None]:
# Embed all 13 batches. The recorded run shows roughly 12 minutes of image
# work per batch, so a batch is skipped when both of its output files are
# already on disk; this lets the cell be re-run after an interrupted session
# without redoing finished batches. Delete a batch's .npy files to force it
# to be recomputed.
for i in range(1, 14):  # 13 batches
    text_path = f"embeddings/text_batch{i}.npy"
    img_path = f"embeddings/img_batch{i}.npy"
    # The image file is written last, so both files existing means the batch
    # ran to completion.
    if os.path.exists(text_path) and os.path.exists(img_path):
        print(f"Skipping batch {i}: embeddings already saved")
        continue
    process_batch(f"dataset/batch_{i}.csv", i)


Processing dataset/batch_1.csv ... (6000 samples)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batch 1 CLIP: 100%|██████████| 188/188 [11:55<00:00,  3.81s/it]


✅ Saved embeddings for batch 1

Processing dataset/batch_2.csv ... (6000 samples)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batch 2 CLIP: 100%|██████████| 188/188 [12:20<00:00,  3.94s/it]


✅ Saved embeddings for batch 2

Processing dataset/batch_3.csv ... (6000 samples)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batch 3 CLIP: 100%|██████████| 188/188 [12:05<00:00,  3.86s/it]


✅ Saved embeddings for batch 3

Processing dataset/batch_4.csv ... (6000 samples)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batch 4 CLIP: 100%|██████████| 188/188 [11:55<00:00,  3.81s/it]

✅ Saved embeddings for batch 4



