In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/GenAI final project/cleaned_product_data.csv")

In [2]:
df.head()

Unnamed: 0,Image,full_description
0,https://images-na.ssl-images-amazon.com/images...,"Title: DB Longboards CoreFlex Crossbow 41"" Bam..."
1,https://images-na.ssl-images-amazon.com/images...,Title: Electronic Snap Circuits Mini Kits Clas...
2,https://images-na.ssl-images-amazon.com/images...,Title: 3Doodler Create Flexy 3D Printing Filam...
3,https://images-na.ssl-images-amazon.com/images...,Title: Guillow Airplane Design Studio with Tra...
4,https://images-na.ssl-images-amazon.com/images...,Title: Woodstock- Collage 500 pc Puzzle\nBrand...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Image             10001 non-null  object
 1   full_description  10001 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


## Finetune CLIP to improve Recall

In [4]:
!pip install -q open-clip-torch timm torchmetrics

In [5]:
import torch, torch.nn as nn, torch.nn.functional as F
import open_clip, random, numpy as np, pandas as pd, requests, re
from PIL import Image
from io import BytesIO
from torch.utils.data import Dataset, DataLoader
from torchmetrics.functional import retrieval_recall
from functools import lru_cache


device = "cuda" if torch.cuda.is_available() else "cpu"

model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-L-14-336', pretrained='openai', device=device)

tokenizer = open_clip.get_tokenizer('ViT-L-14-336')


for p in model.parameters():
    p.requires_grad = False

# Unfreeze the final transformer block of the image encoder block
for p in model.visual.transformer.resblocks[-1].parameters():
    p.requires_grad = True

# Unfreeze the final transformer block of the text encoder block
for p in model.transformer.resblocks[-1].parameters():
    p.requires_grad = True

# Unfreeze the `logit_scale` parameter.
model.logit_scale.requires_grad = True

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Custom text cleaning/summarizing for CLIP
def count_clip_tokens(text: str) -> int:
    return len(tokenizer.encode(text)) + 2

def clean_text(x: str) -> str:
    x = str(x or "")
    x = re.sub(r"\s+", " ", x)
    x = x.replace("|", ", ")
    return x.strip()

# Strip boilerplate / disclaimers / marketing fluff
_BOILER = [
    r"make sure this fits.*?$",
    r"warning:.*?$",
    r"view shipping rates.*?$",
    r"not real food.*?$",
    r"for ages\s*\d+\+?",
    r"choking hazard.*?$",]

_boiler_re = re.compile("|".join(_BOILER), flags=re.I)

def strip_boilerplate(s: str) -> str:
    return _boiler_re.sub("", s)

# Parse labeled sections like "Title:", "Category:", etc.
_SECTION_KEYS = ["Title","Brand","Price","Category","About","Description","Specs","Technical"]
_SECTION_RE = re.compile(
    r"(Title|Brand|Price|Category|About|Description|Specs|Technical)\s*:\s*(.*?)\s*(?=(Title|Brand|Price|Category|About|Description|Specs|Technical)\s*:|$)",
    flags=re.I|re.DOTALL)

def extract_sections(text: str) -> dict:
    text = clean_text(text)
    sec = {}
    for k, v, _ in _SECTION_RE.findall(text):
        sec[k.capitalize()] = clean_text(v)
    if not sec:
        sec["Description"] = text
    return sec

def split_phrases(s: str):
    # break on pipes, bullets, semicolons, or sentence ends
    parts = re.split(r"[|•·;]+|\s*(?<=\.)\s+|\n+", s)
    return [p.strip(" ,.-") for p in parts if p and len(p.strip()) > 2]

def score_phrase(p: str) -> float:
    # prefer phrases with numbers and moderate length
    has_num = 1.5 if re.search(r"\d", p) else 0.0
    length = min(len(p), 80) / 80.0
    return has_num + length

def pick_phrases(phrases, max_n=15):
    # dedupe (case-insensitive) then rank
    seen = set(); uniq = []
    for p in phrases:
        q = p.lower()
        if q not in seen:
            seen.add(q); uniq.append(p)
    uniq.sort(key=score_phrase, reverse=True)
    return uniq[:max_n]

def shorten_category(cat: str) -> str:
    if not cat: return ""
    parts = [c.strip() for c in re.split(r"[|/>]", cat) if c.strip()]
    return ", ".join(parts[:2])

def compress_specs(specs: str) -> str:
    if not specs: return ""
    parts = [p.strip() for p in re.split(r"[|,;/]", specs)]
    parts = [p for p in parts if re.search(r"\d", p)]  # keep numeric bits
    return ", ".join(parts[:6])

def pack_for_clip(full_text: str) -> str:
    sec = extract_sections(full_text)
    title = sec.get("Title","")
    cat   = shorten_category(sec.get("Category",""))
    about = " ".join([sec.get("About",""), sec.get("Description",""), sec.get("Technical","")])
    about = strip_boilerplate(about)
    phrases = pick_phrases(split_phrases(about), max_n=18)
    specs = compress_specs(sec.get("Specs",""))

    pieces = []
    if title: pieces.append(f"Title: {title}")
    if cat:   pieces.append(f"Category: {cat}")

    # add phrases until we hit the token ceiling
    for ph in phrases:
        candidate = " | ".join(pieces + [ph])
        if count_clip_tokens(candidate) <= 77:
            pieces.append(ph)
        else:
            break

    if specs:
        candidate = " | ".join(pieces + [f"Specs: {specs}"])
        if count_clip_tokens(candidate) <= 77:
            pieces.append(f"Specs: {specs}")

    packed = " | ".join(pieces)

    # final tiny trim if needed
    while count_clip_tokens(packed) > 77 and " " in packed:
        packed = packed.rsplit(" ", 1)[0]
    return packed

In [7]:
session = requests.Session()
session.headers.update({"User-Agent":"Mozilla/5.0"})

# PyTorch Dataset for loading (image, text) pairs from a DataFrame
class ProductPairs(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    @lru_cache(maxsize=4096)
    def _load_img(self, url):
        r = session.get(url, timeout=10); r.raise_for_status()
        return Image.open(BytesIO(r.content)).convert("RGB")

    def __len__(self): return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        img = self._load_img(row['Image'])
        text = pack_for_clip(row['full_description'])
        return preprocess(img), text

# Collate function for PyTorch DataLoader
def collate_fn(batch):
    imgs, texts = zip(*batch)
    toks = tokenizer(list(texts))
    return torch.stack(imgs), toks

In [8]:
# Build CLIP embeddings
_session = requests.Session()
_session.headers.update({"User-Agent": "Mozilla/5.0 (open-clip)"})

def embed_text(text: str):
    s = (text or "")[:480]
    toks = tokenizer([s]).to(device)            # <-- open_clip tokenizer
    with torch.no_grad():
        vec = model.encode_text(toks)
        vec = F.normalize(vec, dim=-1)
    return vec.squeeze(0).detach().cpu().numpy().astype("float32")

def embed_image(url_or_pil):
    try:
        if isinstance(url_or_pil, str):
            r = _session.get(url_or_pil, timeout=12)
            r.raise_for_status()
            img = Image.open(BytesIO(r.content)).convert("RGB")
        else:
            img = url_or_pil.convert("RGB")

        t = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            vec = model.encode_image(t)
            vec = F.normalize(vec, dim=-1)
        return vec.squeeze(0).detach().cpu().numpy().astype("float32")
    except Exception:
        return None

# sanity check: should be 768 for ViT-L/14-336
with torch.no_grad():
    d_t = model.encode_text(tokenizer(["test"]).to(device)).shape[-1]
    print("Embed dim:", d_t)

Embed dim: 768


In [13]:
from tqdm import tqdm

text_vecs, image_vecs, kept_idx, packed_texts = [], [], [], []

for i, row in tqdm(df.iterrows(), total=len(df)):
    packed = pack_for_clip(row["full_description"])
    tvec = embed_text(packed)                   # <-- uses open_clip
    ivec = embed_image(row["Image"])            # <-- uses open_clip
    if ivec is None:
        continue
    text_vecs.append(tvec)
    image_vecs.append(ivec)
    packed_texts.append(packed)
    kept_idx.append(i)

100%|██████████| 10001/10001 [09:19<00:00, 17.88it/s]


In [14]:
# Embedd
text_mat  = np.vstack(text_vecs).astype("float32")
image_mat = np.vstack(image_vecs).astype("float32")
meta = df.iloc[kept_idx].copy()
meta["clip_packed"] = packed_texts

text_mat.shape, image_mat.shape, meta.shape

((9974, 768), (9974, 768), (9974, 3))

In [15]:
# Split train/val (e.g., 95/5)
df_train = meta[['Image','full_description']].sample(frac=0.95, random_state=42)
df_val   = meta.drop(df_train.index)

train_ds = ProductPairs(df_train)
val_ds   = ProductPairs(df_val)

BATCH = 64
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=4, collate_fn=collate_fn, pin_memory=True)
val_dl   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False, num_workers=4, collate_fn=collate_fn, pin_memory=True)

In [16]:
def clip_contrastive_loss(im_emb, tx_emb, logit_scale):
    # normalize
    im = F.normalize(im_emb, dim=-1)
    tx = F.normalize(tx_emb, dim=-1)
    logits_per_image  = logit_scale * im @ tx.t()
    logits_per_text   = logit_scale * tx @ im.t()
    targets = torch.arange(im.size(0), device=im.device)
    loss_i = F.cross_entropy(logits_per_image, targets)
    loss_t = F.cross_entropy(logits_per_text,  targets)
    return (loss_i + loss_t)/2

# only train params with grad
opt = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=1e-5, weight_decay=0.2)
scaler = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))

  scaler = torch.cuda.amp.GradScaler(enabled=(device=="cuda"))


In [17]:
from time import time

EPOCHS = 20
model.train()
for epoch in range(1, EPOCHS+1):
    t0 = time(); total = 0; n = 0
    for imgs, toks in train_dl:
        imgs = imgs.to(device, non_blocking=True)
        toks = toks.to(device, non_blocking=True)

        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(device=="cuda")):
            im_emb = model.encode_image(imgs)
            tx_emb = model.encode_text(toks)
            loss = clip_contrastive_loss(im_emb, tx_emb, model.logit_scale.exp())
        scaler.scale(loss).backward()
        scaler.step(opt); scaler.update()

        total += loss.item()*imgs.size(0); n += imgs.size(0)

    print(f"Epoch {epoch} | train loss {total/n:.4f} | time {(time()-t0):.1f}s")

  with torch.cuda.amp.autocast(enabled=(device=="cuda")):


Epoch 1 | train loss 0.1894 | time 43.1s
Epoch 2 | train loss 0.1062 | time 45.8s
Epoch 3 | train loss 0.0675 | time 42.9s
Epoch 4 | train loss 0.0479 | time 44.5s
Epoch 5 | train loss 0.0396 | time 56.6s
Epoch 6 | train loss 0.0328 | time 43.0s
Epoch 7 | train loss 0.0283 | time 77.8s
Epoch 8 | train loss 0.0276 | time 85.5s
Epoch 9 | train loss 0.0211 | time 55.2s
Epoch 10 | train loss 0.0207 | time 44.1s
Epoch 11 | train loss 0.0212 | time 92.8s
Epoch 12 | train loss 0.0177 | time 43.8s
Epoch 13 | train loss 0.0185 | time 44.0s
Epoch 14 | train loss 0.0147 | time 53.1s
Epoch 15 | train loss 0.0163 | time 41.9s
Epoch 16 | train loss 0.0122 | time 45.4s
Epoch 17 | train loss 0.0143 | time 42.9s
Epoch 18 | train loss 0.0133 | time 45.1s
Epoch 19 | train loss 0.0105 | time 118.2s
Epoch 20 | train loss 0.0098 | time 82.1s


In [18]:
model.eval()
# Re-embed IMAGES
img_vecs = []
with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device=="cuda")):
    for i in range(0, len(meta), 64):
        batch = []
        for url in meta['Image'].iloc[i:i+64]:
            r = session.get(url, timeout=10); r.raise_for_status()
            im = Image.open(BytesIO(r.content)).convert("RGB")
            batch.append(preprocess(im))
        batch = torch.stack(batch).to(device)
        v = model.encode_image(batch)
        v = F.normalize(v, dim=-1)
        img_vecs.append(v.cpu())
image_mat = torch.cat(img_vecs, dim=0).numpy().astype("float32")

# Re-embed TEXTS (use your packed texts)
texts = [pack_for_clip(t) for t in meta['full_description'].tolist()]
text_vecs = []
with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device=="cuda")):
    for i in range(0, len(texts), 256):
        toks = tokenizer(texts[i:i+256]).to(device)
        t = model.encode_text(toks)
        t = F.normalize(t, dim=-1)
        text_vecs.append(t.cpu())
text_mat = torch.cat(text_vecs, dim=0).numpy().astype("float32")

  with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device=="cuda")):
  with torch.no_grad(), torch.cuda.amp.autocast(enabled=(device=="cuda")):


In [27]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [19]:
import faiss, numpy as np
dim = image_mat.shape[1]
index = faiss.IndexFlatIP(dim); index.add(image_mat)

def recall_at_k(text_mat, image_mat, K=1):
    idx = faiss.IndexFlatIP(image_mat.shape[1]); idx.add(image_mat)
    correct=0
    for i in range(text_mat.shape[0]):
        _, I = idx.search(text_mat[i:i+1], K)
        if i in I[0]: correct+=1
    return correct/len(text_mat)

print("Recall@1:", recall_at_k(text_mat, image_mat, 1))
print("Recall@5:", recall_at_k(text_mat, image_mat, 5))
print("Recall@10:", recall_at_k(text_mat, image_mat, 10))

Recall@1: 0.7141568077000201
Recall@5: 0.9719270102265891
Recall@10: 0.9919791457790255


In [20]:
# export embeddings
import numpy as np, pandas as pd, hashlib, json, os

N = text_mat.shape[0]
assert image_mat.shape[0] == N

def stable_id(url):
    return hashlib.md5(url.encode('utf-8')).hexdigest()

meta = meta.reset_index(drop=True).copy()
meta["id"] = meta["Image"].apply(stable_id)


payload = pd.DataFrame({
    "id": meta["id"],
    "image_url": meta["Image"],
    "text_full": meta["full_description"],
    "text_packed": meta["clip_packed"],})

payload.head()

Unnamed: 0,id,image_url,text_full,text_packed
0,744f4c8558198f665155aea00db17784,https://images-na.ssl-images-amazon.com/images...,"Title: DB Longboards CoreFlex Crossbow 41"" Bam...","Title: DB Longboards CoreFlex Crossbow 41"" Bam..."
1,71bfbce557deb92100d84384b5d563fb,https://images-na.ssl-images-amazon.com/images...,Title: Electronic Snap Circuits Mini Kits Clas...,Title: Electronic Snap Circuits Mini Kits Clas...
2,827ecfdaed78a718b51208db2d5eb30e,https://images-na.ssl-images-amazon.com/images...,Title: 3Doodler Create Flexy 3D Printing Filam...,Title: 3Doodler Create Flexy 3D Printing Filam...
3,5617bdbdd8f48bd1e40fcb07ddd9f14b,https://images-na.ssl-images-amazon.com/images...,Title: Guillow Airplane Design Studio with Tra...,Title: Guillow Airplane Design Studio with Tra...
4,0f0bb2f12ef4c4c107b5945eb3427c16,https://images-na.ssl-images-amazon.com/images...,Title: Woodstock- Collage 500 pc Puzzle\nBrand...,Title: Woodstock- Collage 500 pc Puzzle | Cate...


In [22]:
faiss.write_index(index, "clip_image.index") # nearest-neighbor lookups locally without a separate vector database
meta.to_parquet("clip_meta.parquet", index=False) # metadata

In [25]:
np.save("clip_image_embeddings.npy", image_mat) # embeddings for vector database

In [26]:
# save fine tune clip
torch.save(model.state_dict(), "ft_clip.pt")

In [27]:
# Tokenizer + preprocess transforms
import pickle
with open("clip_preprocess.pkl", "wb") as f:
    pickle.dump(preprocess, f)