In [None]:
import ast
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from sentence_transformers import SentenceTransformer
from torchvision import transforms
from torchvision.models import vit_b_16, ViT_B_16_Weights

# Input: path of the cleaned product table that this notebook reads.
FINAL_CSV = "../data/products_clean.csv"
# Output: path of the parquet file that receives product metadata plus embeddings.
EMBEDDINGS_FILE = "../data/product_embeddings.parquet"

In [None]:
# Load the cleaned product table; `df` is the working frame used by every later cell.
df = pd.read_csv(FINAL_CSV)
print("Clean dataset loaded:", df.shape)

Text embeddings

In [None]:
# Columns that are concatenated, in this order, into the text that gets embedded.
text_fields = ['title', 'description', 'categories', 'brand', 'manufacturer', 'material', 'color', 'country_of_origin']


def _categories_to_text(raw):
    """Turn the stored `categories` value into a space-separated string.

    The CSV stores categories as the text form of a Python list. It is parsed
    with `ast.literal_eval`, which only accepts literals, so content coming from
    the data file is never executed. Text that is not a valid literal is
    returned unchanged; non-string values are passed through.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return raw
    if isinstance(raw, (list, tuple)):
        return ' '.join(str(item) for item in raw)
    return raw


def combine_text(row):
    """Build the single text string that represents one product.

    Parameters
    ----------
    row : pandas.Series or mapping
        One product record; must provide every key listed in `text_fields`.

    Returns
    -------
    str
        The non-missing field values joined by single spaces, in the order of
        `text_fields`. Missing values (None / NaN) and blank strings are
        skipped so the literal text "nan" never reaches the encoder.
    """
    parts = []
    for field in text_fields:
        value = row[field]
        if field == 'categories':
            value = _categories_to_text(value)
        if not isinstance(value, str):
            # Only scalars can be "missing"; pd.isnull on containers returns an array.
            if pd.api.types.is_scalar(value) and pd.isnull(value):
                continue
            value = str(value)
        value = value.strip()
        if value:
            parts.append(value)
    return ' '.join(parts)

# Build one combined text string per product (row-wise) as input for the text encoder.
df['text_for_embedding'] = df.apply(combine_text, axis=1)

In [None]:
# Load the pretrained sentence encoder
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Persist a copy of the encoder for main.py
text_model.save("../models/text_model")

# Encode the combined text of every product
texts_to_embed = df['text_for_embedding'].tolist()
text_embeddings = text_model.encode(texts_to_embed, show_progress_bar=True)
print("Text embeddings shape:", text_embeddings.shape)

Image embeddings

In [None]:
# Build the pretrained ViT-B/16 from its default weights and put it in inference mode
weights = ViT_B_16_Weights.DEFAULT
image_model = vit_b_16(weights=weights).eval()

# Store the weights on disk for main.py
torch.save(image_model.state_dict(), "../models/image_model.pth")

# Preprocessing pipeline that ships with the selected weights
transform = weights.transforms()

# Zero vector with exactly the shape and dtype that `image_model` produces, measured once
# with a blank probe image. np.stack (used on the results) requires every vector to have
# the same shape, so the fallback must not hard-code a length.
# NOTE(review): the unmodified torchvision vit_b_16 returns its classification-head output,
# not the 768-d hidden state the previous hard-coded fallback size suggested -- confirm
# which representation main.py expects before changing the model head.
with torch.no_grad():
    _probe = transform(Image.new('RGB', (224, 224))).unsqueeze(0)
    _FALLBACK_EMBEDDING = np.zeros_like(image_model(_probe).squeeze().numpy())


def get_image_embedding(url, timeout=10):
    """Download an image and run it through `image_model`.

    Parameters
    ----------
    url : str
        Address of the image. Non-string or blank values are treated as
        "no image" and yield the fallback vector without a network request.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10),
        so one unresponsive host cannot stall the whole run.

    Returns
    -------
    numpy.ndarray
        1-D model output for the image, or an all-zero vector of the same
        shape and dtype when the image cannot be downloaded, decoded or
        processed.
    """
    if not isinstance(url, str) or not url.strip():
        return _FALLBACK_EMBEDDING.copy()
    try:
        response = requests.get(url.strip(), timeout=timeout)
        response.raise_for_status()  # do not try to decode an HTTP error page as an image
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img_t = transform(img).unsqueeze(0)  # add batch dim
        with torch.no_grad():
            emb = image_model(img_t)
        return emb.squeeze().numpy()
    except Exception as exc:  # best effort: keep going, but leave KeyboardInterrupt alone
        print(f"Image embedding failed for {url!r}: {exc}")
        return _FALLBACK_EMBEDDING.copy()

def _first_image_url(raw):
    """Return the first URL from the stored `images` value, or "" when there is none.

    The CSV stores images as the text form of a Python list. It is parsed once
    with `ast.literal_eval`, which only accepts literals, so content coming from
    the data file is never executed. Text that is not a valid literal is used
    as-is (stripped); missing or empty values give "".
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return raw.strip()
    if isinstance(raw, str):
        return raw.strip()
    if isinstance(raw, (list, tuple)) and len(raw) > 0:
        return raw[0]
    return ""


df['image_url'] = df['images'].apply(_first_image_url)
image_embeddings = np.stack(df['image_url'].apply(get_image_embedding).to_list())
print("Image embeddings shape:", image_embeddings.shape)

Save embeddings with metadata

In [None]:
# Metadata columns copied unchanged from the product table
metadata_columns = [
    'uniq_id', 'title', 'brand', 'price', 'color', 'material',
    'country_of_origin', 'package_dimensions', 'image_url',
]

# One entry per output column: metadata first, then one vector per row for each embedding
embedding_columns = {column: df[column] for column in metadata_columns}
embedding_columns['text_embedding'] = list(text_embeddings)
embedding_columns['image_embedding'] = list(image_embeddings)
embedding_df = pd.DataFrame(embedding_columns)

# Write the result without the index
embedding_df.to_parquet(EMBEDDINGS_FILE, index=False)
print("Embeddings saved to:", EMBEDDINGS_FILE)

In [None]:
# Preview the first two rows of the frame that was written to disk.
embedding_df.head(2)