In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

import requests
from PIL import Image
from io import BytesIO

from transformers import CLIPProcessor, CLIPModel # CLIP
#import longclip # LONG CLIP
import torch

import chromadb
import os

# Reload data
# The CSV directory defaults to the original author's checkout but can be
# redirected by setting the AMAZON_CHATBOT_DATA_DIR environment variable, so
# the notebook is runnable on other machines without editing this cell.
DATA_DIR = os.environ.get(
    "AMAZON_CHATBOT_DATA_DIR",
    "/Users/brunamedeiros/Documents/GitHub/Amazon-Multimodal-Chatbot",
)
df = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
print(f"df has {len(df)} products")
exploded_df = pd.read_csv(os.path.join(DATA_DIR, "exploded_df.csv"))
print(f"exploded_df has {len(exploded_df)} products")

# Reconnect to vector store

# OLD
#client = chromadb.PersistentClient(path="./my_vectorstore")
#collection = client.get_or_create_collection(name="amazon_products")

# client = chromadb.PersistentClient(path="./my_vectorstore_exploded")
# collection = client.get_or_create_collection(name="amazon_products_exploded") 

# Current store: the "exploded_v2" collection persisted next to the notebook.
client = chromadb.PersistentClient(path="./my_vectorstore_exploded_v2")
collection = client.get_or_create_collection(name="amazon_products_exploded_v2") 
print(f"Vector store was reconnected! total embeddings: {collection.count()}\n")

# Reload CLIP model

# CLIP
# The same checkpoint name is used for the model and its processor so that
# preprocessing matches the weights.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

df has 10002 products
exploded_df has 43870 products
Vector store was reconnected! total embeddings: 0



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


# DON'T RUN - DELETE EMBEDDINGS

### Image

In [2]:
# # Delete code inside vector store
# all_data = collection.get()
# if all_data['ids']:
#     collection.delete(ids=all_data['ids'])
#     print(f"Deleted {len(all_data['ids'])} embeddings")
# else:
#     print("No embeddings to delete")

### Text

In [3]:
# # Get all existing text embeddings and delete them
# all_data = collection.get()
# text_ids = [id for id in all_data['ids'] if id.startswith('text_')]

# if text_ids:
#     collection.delete(ids=text_ids)
#     print(f"Deleted {len(text_ids)} existing text embeddings")
# else:
#     print("No existing text embeddings found")

# EDA

- `Image` and `Variants` column, if my understanding is correct, have the identical image. The `Image` column has an actual image while the `Variants` has the link to the Amazon site for that specific product. A lot of those products are not on Amazon anymore so the link leads to an error.

In [4]:

# Quick structural overview of `df`: a sample row, shape, column names/dtypes,
# NaN counts, and the split between all-NaN columns and usable columns.
_RULE = "=" * 60


def _section(title, spaced=True):
    """Print a banner for one report section, optionally preceded by a blank gap."""
    if spaced:
        print("\n")
    print(_RULE)
    print(title)
    print(_RULE)


_section("Entire dataset:", spaced=False)
display(df.head(1))

_section("Dataset shape:", spaced=False)
print(df.shape)

_section("Columns in the dataset:")
print(df.columns.tolist())

_section("Column types:")
print(df.dtypes)

_section("NaN counts:")
print(df.isna().sum())

# A column is "empty" when its NaN count equals the number of rows.
_n_rows = df.shape[0]
_empty_cols = [name for name in df.columns if df[name].isna().sum() == _n_rows]
valid_cols = [name for name in df.columns if df[name].isna().sum() != _n_rows]

_section("Columns where all values are NaN:")
print("The columns that should be removed because all values are NaN are:")
for _name in _empty_cols:
    print(f"  {_name}")

_section("Columns we can work with")
for _name in valid_cols:
    print(f"  {_name}")

_section("Final Dataset")
display(df[valid_cols].head(1))


Entire dataset:


Unnamed: 0,Uniq Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Product Url,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",,,Sports & Outdoors | Outdoor Recreation | Skate...,,,$237.68,,,...,https://www.amazon.com/DB-Longboards-CoreFlex-...,,,,,,,Y,,


Dataset shape:
(10002, 28)


Columns in the dataset:
['Uniq Id', 'Product Name', 'Brand Name', 'Asin', 'Category', 'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity', 'Model Number', 'About Product', 'Product Specification', 'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image', 'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details', 'Dimensions', 'Color', 'Ingredients', 'Direction To Use', 'Is Amazon Seller', 'Size Quantity Variant', 'Product Description']


Column types:
Uniq Id                   object
Product Name              object
Brand Name               float64
Asin                     float64
Category                  object
Upc Ean Code              object
List Price               float64
Selling Price             object
Quantity                 float64
Model Number              object
About Product             object
Product Specification     object
Technical Details         object
Shipping Weight           object
Product Dimensions        objec

Unnamed: 0,Uniq Id,Product Name,Category,Upc Ean Code,Selling Price,Model Number,About Product,Product Specification,Technical Details,Shipping Weight,Product Dimensions,Image,Variants,Product Url,Is Amazon Seller
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,,$237.68,,Make sure this fits by entering your model num...,Shipping Weight: 10.7 pounds (View shipping ra...,,10.7 pounds,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y


# Data Cleaning
- Strip `Uniq id` column
    - we will use that as naming for the images and metadata for embedding. we are using .strip() to ensure no errors arise later
- Clean URLs: ended up not using this one because checking every URL will take a long time

In [5]:
# Normalise the product key: cast to str and trim surrounding whitespace,
# then report a sample plus uniqueness statistics.
print("Cleaning Uniq Id column (.strip())...")
stripped_ids = df['Uniq Id'].astype(str).str.strip()
df['Uniq Id'] = stripped_ids
print()
print(stripped_ids.head())

# Check for any issues
unique_count = stripped_ids.nunique()
duplicate_count = stripped_ids.duplicated().sum()
print(f"\nUnique IDs: {unique_count}")
print(f"Any duplicates: {duplicate_count}")

Cleaning Uniq Id column (.strip())...

0    4c69b61db1fc16e7013b43fc926e502d
1    66d49bbed043f5be260fa9f7fbff5957
2    2c55cae269aebf53838484b0d7dd931a
3    18018b6bc416dab347b1b7db79994afa
4    e04b990e95bf73bbe6a3fa09785d7cd0
Name: Uniq Id, dtype: object

Unique IDs: 10002
Any duplicates: 0


# Image Embedding

Each individual row under the `Image` column has more than one https link in it.

For instance, row 1: `https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg|https://images-na.ssl-images-amazon.com/images/I/31hKM3cSoSL.jpg|https://images-na.ssl-images-amazon.com/images/I/51WlHdwghfL.jpg|https://images-na.ssl-images-amazon.com/images/I/51FsyLRBzwL.jpg|https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg`

Each product has more than 1 image (showing different perspectives of product). Instead of putting them in a different column, they concatenated all URLs in the same one, dividing them by the |

---

The `download_first_image` does the following (**WE DIDN'T USE THIS ONE**):
- separate the many https through the |
- skip the transparent pixel
    - the transparent pixel is a 1x1 invisible image that Amazon uses as a placeholder/tracking pixel. It looks like this `https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg`. It's literally a transparent image Amazon uses for things such as web tracking, analytics, layout spacing... Therefore, we need to skip it, or else our CLIP model will try to embed an empty image.
- Saves one image per product
    - Majority of products (cell in `Image` column) have more than 1 image per product. We could either save 1 image per product or save all images per product. This function only saves one image per product as it is more feasible with the project deadline. It would be much more complex to handle CLIP if we were to have more than 1 image
- Name images as the `Uniq Id`: easy to look up
    - When we connect to Chroma and create embeddings, we can store `Uniq Id` as metadata

`download_first_image` was the first attempt. now we changed to `get_image_embedding_from_url`. It does the same as the previous one but:
- doesn't save images locally. It processes image and automatically embed them into chroma.

In [6]:
# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# def get_image_embedding(image):
#     inputs = processor(images=[image], return_tensors="pt", padding=True)
#     with torch.no_grad():
#         embedding = model.get_image_features(**inputs)
#     return embedding.numpy()[0]


# # LIGHT METADATA
# def get_image_embedding_from_url(url_string, uniq_id):
#     """Download image from URL and get CLIP embedding + metadata - no saving on local"""
#     try:
#         # split by | and get all urls
#         urls = url_string.split('|')

#         # find first URL that is not a transparent pixel
#         for url in urls:
#             url = url.strip()
#             if 'transparent-pixel.jpg' not in url:
#                 print(f"Trying: {url}")

#                 # download to memory (not disk)
#                 response = requests.get(url)
#                 if response.status_code == 200:
#                     # get image
#                     image = Image.open(BytesIO(response.content))

#                     # get CLIP embedding
#                     embedding = get_image_embedding(image) # This just returns a numpy array - no metadata! Therefore, need to add metadata

#                     # Metadata
#                     metadata = {
#                         "uniq_id": str(uniq_id),
#                         "type": "image"
#                     }

#                     #print(f"Got embedding for {uniq_id}")
#                     return embedding, metadata
        
#         # If no URLs worked
#         print(f"No valid URLs found for {uniq_id}")
#         return None, None

#     except Exception as e:
#         print(f"ERROR: {e}")
#         return None, None


# # Create/connect to vector store
# client = chromadb.PersistentClient(path="./my_vectorstore")
# collection = client.get_or_create_collection(name="amazon_products")

# # BATCH PROCESSING SETUP
# BATCH_SIZE = 100  # Process 100 images at a time
# embeddings_to_store = []
# metadatas_to_store = []
# ids_to_store = []

    
# # # Check/delete existing data
# # print(f"Current embeddings in store: {collection.count()}\n")
# # if collection.count() > 0:
# #     response = input("Delete existing data? (y/n): ")
# #     if response.lower() == 'y':
# #         all_data = collection.get()
# #         collection.delete(ids=all_data['ids'])
# #         print("Deleted existing data\n")


# # EMBED 
# for i in range(len(df)):
#     if i < len(df) and pd.notna(df['Image'].iloc[i]):
#         uniq_id = df['Uniq Id'].iloc[i]  # Just get uniq_id

#         # Skip if image was already embedded
#         try:
#             existing = collection.get(ids=[f"img_{uniq_id}"])
#             if existing['ids']:  # If it exists, skip
#                 print(f"Skipping {uniq_id} - already exists")
#                 continue
#         except:
#             pass  # Doesn't exist, continue processing

#         url_string = df['Image'].iloc[i]

#         embedding, metadata = get_image_embedding_from_url(url_string, uniq_id)  # Pass uniq_id
#         if embedding is not None:
#             embeddings_to_store.append(embedding.tolist())  # Convert to list
#             metadatas_to_store.append(metadata)
#             ids_to_store.append(f"img_{uniq_id}")  # Unique ID for Chroma
            
#             #print(f"Embedding shape: {embedding.shape}")
#             #print("Metadata:", metadata)
#             #print("-" * 50)
        
#         # Store every BATCH_SIZE embeddings
#         if len(embeddings_to_store) >= BATCH_SIZE:
#             collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
#             print(f"Stored batch of {len(embeddings_to_store)} embeddings")
#             # Clear lists for next batch
#             embeddings_to_store = []
#             metadatas_to_store = []
#             ids_to_store = []

# # # Store all embeddings at once
# # if embeddings_to_store:
# #     collection.add(
# #         embeddings=embeddings_to_store,
# #         metadatas=metadatas_to_store,
# #         ids=ids_to_store
# #     )
# #     print(f"Stored {len(embeddings_to_store)} embeddings in vector store!")

# # Store final batch (if any remaining)
# if embeddings_to_store:
#     collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
#     print(f"Stored final batch of {len(embeddings_to_store)} embeddings")


### CLIP with exploded_df

In [7]:
def get_image_embedding(image):
    """Return the CLIP image-feature vector for a single image as a numpy array.

    The image is wrapped in a one-element batch, run through the notebook-level
    `processor` and `model` without gradient tracking, and the first (only)
    row of the resulting feature matrix is returned.
    """
    batch = processor(images=[image], return_tensors="pt", padding=True)
    with torch.no_grad():
        features = model.get_image_features(**batch)
    feature_matrix = features.numpy()
    return feature_matrix[0]


def get_image_embedding_from_single_url(url_string, uniq_id, timeout=10):
    """Download one image into memory and return its CLIP embedding plus metadata.

    Nothing is written to disk. Amazon's ``transparent-pixel.jpg`` placeholder
    is skipped rather than embedded.

    Args:
        url_string: A single image URL (surrounding whitespace is stripped).
        uniq_id: Product identifier stored (as a string) in the metadata.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            single stalled connection cannot hang a long embedding loop.

    Returns:
        ``(embedding, metadata)`` on success, where ``metadata`` is
        ``{"uniq_id": str(uniq_id), "type": "image"}``; ``(None, None)`` when
        the URL is the placeholder pixel, the response status is not 200, or
        any error occurs (the error is printed, not raised).
    """
    try:
        # Inside the try so that a non-string input (e.g. NaN/None) is reported
        # like every other failure instead of raising AttributeError.
        url = url_string.strip()

        # Skip the transparent placeholder pixel and fetch everything else.
        if 'transparent-pixel.jpg' not in url:
            print(f"Trying: {url}")

            # download to memory (not disk)
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                image = Image.open(BytesIO(response.content))

                # get_image_embedding returns only the vector, so the metadata
                # that links it back to the product is attached here.
                embedding = get_image_embedding(image)
                metadata = {
                    "uniq_id": str(uniq_id),
                    "type": "image"
                }
                return embedding, metadata

        # Placeholder pixel or non-200 response
        print(f"No valid URLs found for {uniq_id}")
        return None, None

    except Exception as e:
        print(f"ERROR: {e}")
        return None, None

# BATCH PROCESSING SETUP
# Embeddings are buffered in three parallel lists and flushed to the
# collection once BATCH_SIZE items have accumulated.
BATCH_SIZE = 100  # Process 100 images at a time
embeddings_to_store, metadatas_to_store, ids_to_store = [], [], []

In [8]:
print("="*60)
print("TESTING IMAGE EMBEDDING")
print("="*60)


print("-"*60)
print("Selecting 4 images linked to same product")
print("-"*60)
# Testing with 4 rows (those 4 images are linked to the same product).
# Row *positions* in exploded_df are collected so that (a) the loop below
# really embeds the selected rows, and (b) the Chroma IDs use the same
# position suffix as the full-dataset run, which lets that run skip them.
TEST_UNIQ_ID = '4c69b61db1fc16e7013b43fc926e502d'
test_positions = [
    pos for pos, uid in enumerate(exploded_df['Uniq Id']) if uid == TEST_UNIQ_ID
][:4]
test_df = exploded_df.iloc[test_positions]
print(test_df[['Uniq Id', 'Image']])


print("-"*60)
print("EMBEDDING")
print("-"*60)
for i in test_positions:

    row = exploded_df.iloc[i]
    if pd.notna(row['Image']):
        uniq_id = row['Uniq Id']
        single_url = row['Image']

        # create unique ChromaDB ID for each image
        unique_chroma_id = f"img_{uniq_id}_{i}"

        # Skip if image was already embedded. A failed lookup is reported and
        # treated as "not present" (best effort), but KeyboardInterrupt is no
        # longer swallowed as it was with a bare `except:`.
        try:
            existing = collection.get(ids=[unique_chroma_id])
        except Exception as exc:
            print(f"Could not check {unique_chroma_id} ({exc}); embedding it anyway")
            existing = {'ids': []}
        if existing['ids']:
            print(f"Skipping {unique_chroma_id} - already exists")
            continue

        embedding, metadata = get_image_embedding_from_single_url(single_url, uniq_id)
        if embedding is not None:
            embeddings_to_store.append(embedding.tolist())
            metadatas_to_store.append(metadata)
            ids_to_store.append(unique_chroma_id)

        # Store every BATCH_SIZE embeddings
        if len(embeddings_to_store) >= BATCH_SIZE:
            collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
            print(f"Stored batch of {len(embeddings_to_store)} embeddings")
            # Clear lists for next batch
            embeddings_to_store = []
            metadatas_to_store = []
            ids_to_store = []

# Store final batch (if any remaining)
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} embeddings")
    # Reset the buffers so a later cell does not try to re-add the same IDs.
    embeddings_to_store = []
    metadatas_to_store = []
    ids_to_store = []


print("-"*60)
print("Metadata Analysis")
print("-"*60)

# Showing metadata 
all_data = collection.get()
print(f"Total embeddings: {len(all_data['ids'])}")
print("\nMetadata for each embedding:")
for n, (chroma_id, metadata) in enumerate(zip(all_data['ids'], all_data['metadatas']), 1):
    print(f"{n}. ID: {chroma_id} | Metadata: {metadata}")

# Check if all uniq_ids are the same
uniq_ids = [metadata['uniq_id'] for metadata in all_data['metadatas']]
all_same = len(set(uniq_ids)) == 1
print(f"\nAll uniq_ids are the same: {all_same}")
if all_same:
    print(f"Uniq ID: {uniq_ids[0]}") 
    print(f"Ready to run on full dataset!")

TESTING IMAGE EMBEDDING
------------------------------------------------------------
Selecting 4 images linked to same product
------------------------------------------------------------
                            Uniq Id  \
0  4c69b61db1fc16e7013b43fc926e502d   
1  4c69b61db1fc16e7013b43fc926e502d   
2  4c69b61db1fc16e7013b43fc926e502d   
3  4c69b61db1fc16e7013b43fc926e502d   

                                               Image  
0  https://images-na.ssl-images-amazon.com/images...  
1  https://images-na.ssl-images-amazon.com/images...  
2  https://images-na.ssl-images-amazon.com/images...  
3  https://images-na.ssl-images-amazon.com/images...  
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Trying: https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/31hKM3cSoSL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/51WlHd

In [9]:
print("="*60)
print("FULL DATASET IMAGE EMBEDDING")
print("="*60)


print("-"*60)
print("EMBEDDING")
print("-"*60)
# One Chroma entry per exploded row; the row position is part of the ID so
# several images of the same product get distinct IDs.
for i in range(len(exploded_df)):
    row = exploded_df.iloc[i]
    if pd.notna(row['Image']):
        uniq_id = row['Uniq Id']
        single_url = row['Image']

        # create unique ChromaDB ID for each image
        unique_chroma_id = f"img_{uniq_id}_{i}"

        # Skip if image was already embedded (makes the cell resumable).
        # Only ordinary errors are tolerated here: a bare `except:` would also
        # swallow KeyboardInterrupt and make this long loop hard to stop.
        try:
            existing = collection.get(ids=[unique_chroma_id])
        except Exception as exc:
            print(f"Could not check {unique_chroma_id} ({exc}); embedding it anyway")
            existing = {'ids': []}
        if existing['ids']:
            print(f"Skipping {unique_chroma_id} - already exists")
            continue

        embedding, metadata = get_image_embedding_from_single_url(single_url, uniq_id)
        if embedding is not None:
            embeddings_to_store.append(embedding.tolist())
            metadatas_to_store.append(metadata)
            ids_to_store.append(unique_chroma_id)

        # Store every BATCH_SIZE embeddings
        if len(embeddings_to_store) >= BATCH_SIZE:
            collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
            print(f"Stored batch of {len(embeddings_to_store)} embeddings")
            # Clear lists for next batch
            embeddings_to_store = []
            metadatas_to_store = []
            ids_to_store = []

# Store final batch (if any remaining)
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} embeddings")
    # Reset the buffers so re-running a cell does not re-add the same IDs.
    embeddings_to_store = []
    metadatas_to_store = []
    ids_to_store = []


print("-"*60)
print("RESULTS")
print("-"*60)

all_data = collection.get()
print(f"Total embeddings: {len(all_data['ids'])}")

# Count unique uniq_ids
uniq_ids = [metadata['uniq_id'] for metadata in all_data['metadatas']]
unique_uniq_ids = set(uniq_ids)
print(f"Number of different uniq_ids: {len(unique_uniq_ids)}")
print(f"Uniq IDs found: {list(unique_uniq_ids)}")

FULL DATASET IMAGE EMBEDDING
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Skipping img_4c69b61db1fc16e7013b43fc926e502d_0 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_1 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_2 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_3 - already exists
No valid URLs found for 4c69b61db1fc16e7013b43fc926e502d
Trying: https://images-na.ssl-images-amazon.com/images/I/51M0KnJxjKL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/5166GD8OkXL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/61o5S1VnaNL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/61t4Q0rPYjL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/61NASUAyqcL.jpg
Trying: https://images-na.ssl-images-amazon.com/images/I/51OMrADdyJL.jpg
No valid URLs found for 66d49bbed043f5be260fa9f7fbff5957
Trying: https://images-na.s

In [10]:
# Summarise the collection: its name, how many embeddings it holds, and a
# three-item peek at the stored IDs and metadata.
print("="*60, "VECTOR STORE INFO:", "="*60, sep="\n")
print(f"Collection name: {collection.name}")
stored_count = collection.count()
print(f"Total embeddings: {stored_count}")

# Check what's inside
if stored_count <= 0:
    print("No embeddings stored yet")
else:
    sample = collection.peek(limit=3)
    print(f"\nSample IDs: {sample['ids']}")
    print(f"\nSample metadata: {sample['metadatas']}")

VECTOR STORE INFO:
Collection name: amazon_products_exploded_v2
Total embeddings: 33974

Sample IDs: ['img_4c69b61db1fc16e7013b43fc926e502d_0', 'img_4c69b61db1fc16e7013b43fc926e502d_1', 'img_4c69b61db1fc16e7013b43fc926e502d_2']

Sample metadata: [{'type': 'image', 'uniq_id': '4c69b61db1fc16e7013b43fc926e502d'}, {'uniq_id': '4c69b61db1fc16e7013b43fc926e502d', 'type': 'image'}, {'uniq_id': '4c69b61db1fc16e7013b43fc926e502d', 'type': 'image'}]


`Sample IDs` = Chroma's internal IDs (Chroma's way of finding an embedding)

`Uniq ID` = our product ID in metadata

---

# Text Embedding

For text embedding, there were 2 options:
- CLIP 
    - PROBLEM: 77 token limit
- Another LM model like all-mini
    - Won't align with CLIP image embeddings

We opted to go with CLIP embeddings. We opted to do chunking instead of cutting down the text size to not lose valuable information.

In [11]:
# Text concatenation
def create_product_text(row):
    """Concatenate a product's text columns into one description string.

    Each non-NaN field contributes one sentence; sentences are joined with
    ". " and terminated with a final ".".

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'Product Name', 'Category', 'Selling Price', 'Model Number',
            'About Product', 'Technical Details', 'Shipping Weight',
            'Product Dimensions' and 'Is Amazon Seller'.

    Returns:
        The combined description, or an empty string when every field is
        missing (so no lone "." is produced for embedding).
    """
    # (column, sentence template) pairs, in output order.
    templates = [
        ('Product Name', "This product is {}"),
        ('Category', "It falls under the category of {}"),
        ('Selling Price', "The price is {}"),
        ('Model Number', "The model number is {}"),
        ('About Product', "Product description: {}"),
        ('Technical Details', "Technical specifications: {}"),
        ('Shipping Weight', "Shipping weight is {}"),
        ('Product Dimensions', "Product dimensions are {}"),
    ]

    text_parts = []
    for column, template in templates:
        value = row[column]
        if pd.notna(value):
            text_parts.append(template.format(value))

    # seller info: the dataset encodes this flag as 'Y'/'N', so accept the
    # common affirmative spellings rather than only the literal 'true'.
    seller_flag = row['Is Amazon Seller']
    if pd.notna(seller_flag):
        is_amazon = str(seller_flag).strip().lower() in {'y', 'yes', 'true', '1'}
        text_parts.append(
            "This item is sold by Amazon" if is_amazon
            else "This item is not sold by Amazon"
        )

    if not text_parts:
        return ""

    # combine into one text
    return ". ".join(text_parts) + "."

# Smoke-test the text builder on the first row and show the result.
sample_text = create_product_text(exploded_df.iloc[0])
print("="*60, "TESTING...", "="*60, sample_text, sep="\n")

# Build the description for every row and keep it as a new column.
print("\nCreating new column...")
text_column = exploded_df.apply(create_product_text, axis=1)
exploded_df['text_to_embed'] = text_column
print("...new column created!")


TESTING...
This product is DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete. It falls under the category of Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards. The price is $237.68. Product description: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to none. Pumping & carving have never been so satisfying! Flex 2 is recommended for people 120 to 170 pounds. | COREFLEX TECH: CoreFlex construction is water resistant, impact resistant, scratch resistant and has a flex like you won’t believe. These boards combine fiberglass, epoxy, HD plastic and bamboo to create a perfect blend of performance and strength. | INSPIRED BY THE NORTHWEST: Our founding ideal is chasing adventure & riding the best boards possible, inspired by the hills, wav

In [12]:
# FUNCTION SETUP
def get_text_embedding(text):
    """Return the CLIP text-feature vector for one string as a numpy array.

    The text is tokenized as a one-element batch, truncated to at most 77
    tokens, and passed through the notebook-level `model` without gradient
    tracking; the first (only) row of the feature matrix is returned.
    """
    tokenized = processor(
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )
    with torch.no_grad():
        features = model.get_text_features(**tokenized)
    feature_matrix = features.numpy()
    return feature_matrix[0]

def create_overlapping_chunks(text, chunk_size=77, overlap=15):
    """Split text into overlapping chunks of at most `chunk_size` tokens.

    The text is tokenized with the notebook-level `processor`; consecutive
    windows of token IDs are decoded back to strings, each window starting
    `chunk_size - overlap` tokens after the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of token IDs per chunk (must be > 0).
        overlap: Number of token IDs shared by neighbouring chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of decoded chunk strings. If tokenizing/decoding fails, the error
        is printed and a single-element list with the first 500 characters of
        `text` is returned as a fallback.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range. Previously
            `overlap >= chunk_size` never advanced the window and looped
            forever while the chunk list grew without bound.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    try:
        # Tokenize the full text (no truncation: the whole sequence is needed)
        inputs = processor(text=[text], return_tensors="pt", padding=True)
        tokens = inputs['input_ids'][0]  # Get token IDs
        total = len(tokens)

        # NOTE(review): get_text_embedding truncates to 77 tokens when a chunk
        # is re-tokenized; if the tokenizer adds special tokens there, a full
        # 77-token chunk may lose its tail (the overlap should cover it) -
        # confirm whether a smaller default chunk_size is wanted.
        stride = chunk_size - overlap
        chunks = []
        for start in range(0, total, stride):
            end = min(start + chunk_size, total)

            # Decode the window back to text
            chunk_text = processor.tokenizer.decode(tokens[start:end], skip_special_tokens=True)
            chunks.append(chunk_text)

            # The window reached the end of the sequence: nothing left to cover
            if end >= total:
                break

        return chunks
    except Exception as e:
        print(f"Error chunking text: {e}")
        return [text[:500]]  # Fallback to first 500 chars

def get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_number):
    """Embed one text chunk and build the metadata record stored with it.

    Returns ``(embedding, metadata)`` where metadata carries the product id
    (as a string), the type marker "text" and the chunk number. Blank chunks
    and any failure (which is printed) yield ``(None, None)``.
    """
    try:
        # Whitespace-only chunks carry no content worth embedding.
        if not text_chunk.strip():
            return None, None

        print(f"Processing text chunk {chunk_number} for {uniq_id}")
        vector = get_text_embedding(text_chunk)
        record = {"uniq_id": str(uniq_id), "type": "text", "chunk": chunk_number}
        return vector, record
    except Exception as e:
        print(f"ERROR: {e}")
        return None, None

# BATCH PROCESSING SETUP
# Fresh, empty buffers for the text-embedding pass.
BATCH_SIZE = 100
embeddings_to_store, metadatas_to_store, ids_to_store = [], [], []


# Get unique products only for text embedding: keep the first row per
# 'Uniq Id' so each product's text is embedded once.
print("-"*60, "Deleting repeated text columns (result of 'explosion')", "-"*60, sep="\n")
unique_df = exploded_df.drop_duplicates(subset=['Uniq Id'])
n_unique_products = len(unique_df)
print(f"Using {n_unique_products} unique products for text embedding")

------------------------------------------------------------
Deleting repeated text columns (result of 'explosion')
------------------------------------------------------------
Using 10002 unique products for text embedding


In [13]:
# Kept inside this cell so it stays self-contained; only used for the
# chunks-per-product statistics at the bottom.
from collections import Counter

print("="*60)
print("TESTING TEXT EMBEDDING")
print("="*60)

test_df = unique_df.head(3)

print("-"*60)
print("EMBEDDING")
print("-"*60)
for i in range(len(test_df)):
    row = test_df.iloc[i]
    
    if pd.notna(row['Uniq Id']) and pd.notna(row['text_to_embed']):
        uniq_id = row['Uniq Id']
        full_text = str(row['text_to_embed'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk (chunk numbers start at 1)
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{uniq_id}_{chunk_num}"

            # Skip if chunk was already embedded. A failed lookup is reported
            # and treated as "not present"; unlike a bare `except:`, this does
            # not swallow KeyboardInterrupt.
            try:
                existing = collection.get(ids=[unique_chroma_id])
            except Exception as exc:
                print(f"Could not check {unique_chroma_id} ({exc}); embedding it anyway")
                existing = {'ids': []}
            if existing['ids']:
                print(f"Skipping {unique_chroma_id} - already exists")
                continue
            
            # Get embedding and metadata; (None, None) means "nothing to store"
            embedding, metadata = get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_num)
            if embedding is not None:
                embeddings_to_store.append(embedding.tolist())
                metadatas_to_store.append(metadata)
                ids_to_store.append(unique_chroma_id)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")
    # Reset the buffers so re-running a cell does not re-add the same IDs.
    embeddings_to_store = []
    metadatas_to_store = []
    ids_to_store = []

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text vs image embeddings (distinguished by the ID prefix)
text_embeddings = [chroma_id for chroma_id in all_data['ids'] if chroma_id.startswith('text_')]
image_embeddings = [chroma_id for chroma_id in all_data['ids'] if chroma_id.startswith('img_')]
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Image embeddings: {len(image_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_uniq_ids = [meta['uniq_id'] for meta in text_metadata]
unique_text_uniq_ids = set(text_uniq_ids)
print(f"Products with text embeddings: {len(unique_text_uniq_ids)}")

# Show chunks per product (guarded: with no text embeddings the average would
# divide by zero and max() would fail on an empty sequence)
chunks_per_product = Counter(text_uniq_ids)
if chunks_per_product:
    print(f"Average chunks per product: {len(text_uniq_ids) / len(unique_text_uniq_ids):.1f}")
    print(f"Max chunks for one product: {max(chunks_per_product.values())}")
else:
    print("No text embeddings stored yet - chunk statistics unavailable")

Token indices sequence length is longer than the specified maximum sequence length for this model (340 > 77). Running this sequence through the model will result in indexing errors


TESTING TEXT EMBEDDING
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Processing text chunk 1 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 2 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 3 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 4 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 5 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 6 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 1 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 2 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 3 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 4 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 5 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 6 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 7 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 8 for 66d49bbed043f5be260fa9f7

In [14]:
# FINAL VERIFICATION CHECK
# Summarises what is in the vector store (counts per embedding type, product
# coverage, sample IDs) and runs a small search to confirm that both text and
# image embeddings can actually be retrieved.
print("="*60)
print("FINAL VECTOR STORE VERIFICATION")
print("="*60)

# Check collection details
print(f"Collection name: {collection.name}")
print(f"Total embeddings: {collection.count()}")

# Get all data
all_data = collection.get()

# Count by type (the ID prefix encodes the embedding type)
text_embeddings = [chroma_id for chroma_id in all_data['ids'] if chroma_id.startswith('text_')]
image_embeddings = [chroma_id for chroma_id in all_data['ids'] if chroma_id.startswith('img_')]

print(f"\nImage embeddings: {len(image_embeddings)}")
print(f"Text embeddings: {len(text_embeddings)}")

# Check they have the same uniq_ids (products)
text_uniq_ids = {meta['uniq_id'] for meta in all_data['metadatas'] if meta['type'] == 'text'}
image_uniq_ids = {meta['uniq_id'] for meta in all_data['metadatas'] if meta['type'] == 'image'}

print(f"\nProducts with text embeddings: {len(text_uniq_ids)}")
print(f"Products with image embeddings: {len(image_uniq_ids)}")
print(f"Products with BOTH text and images: {len(text_uniq_ids & image_uniq_ids)}")

# Sample IDs to verify format
print(f"\nSample image IDs: {image_embeddings[:3] if image_embeddings else 'None'}")
print(f"Sample text IDs: {text_embeddings[:3] if text_embeddings else 'None'}")

# Quick search test to make sure both work
try:
    query_text = "longboard skateboard"
    query_embedding = get_text_embedding(query_text)  # 512 dims
    query_vector = query_embedding.tolist()

    # Unfiltered search: shows which types a normal top-5 query returns.
    results = collection.query(
        query_embeddings=[query_vector],  # 512 dims
        n_results=5
    )

    result_types = [meta['type'] for meta in results['metadatas'][0]]
    print(f"\nSearch test - found types: {set(result_types)}")

    # The unfiltered top-5 can consist of a single type, so it cannot prove
    # that both types are retrievable. Query each type separately using a
    # metadata filter and report what was really found.
    searchable_by_type = {}
    for embedding_type in ('text', 'image'):
        filtered = collection.query(
            query_embeddings=[query_vector],
            n_results=1,
            where={"type": embedding_type}
        )
        searchable_by_type[embedding_type] = bool(filtered['ids'][0])

    if all(searchable_by_type.values()):
        print("Both text and image embeddings are searchable!")
    else:
        missing_types = [name for name, found in searchable_by_type.items() if not found]
        print(f"WARNING: search returned no results for embedding type(s): {missing_types}")
except Exception as e:
    print(f"Search test failed: {e}")

print("\n" + "="*60)

FINAL VECTOR STORE VERIFICATION
Collection name: amazon_products_exploded_v2
Total embeddings: 33996

Image embeddings: 33974
Text embeddings: 22

Products with text embeddings: 3
Products with image embeddings: 9980
Products with BOTH text and images: 3

Sample image IDs: ['img_4c69b61db1fc16e7013b43fc926e502d_0', 'img_4c69b61db1fc16e7013b43fc926e502d_1', 'img_4c69b61db1fc16e7013b43fc926e502d_2']
Sample text IDs: ['text_4c69b61db1fc16e7013b43fc926e502d_1', 'text_4c69b61db1fc16e7013b43fc926e502d_2', 'text_4c69b61db1fc16e7013b43fc926e502d_3']

Search test - found types: {'text'}
Both text and image embeddings are searchable!



In [None]:
# Embed product text in overlapping chunks and store the chunk embeddings in
# the vector store, then print a summary of what the collection contains.
# Relies on names defined in earlier cells: unique_df, collection, BATCH_SIZE,
# create_overlapping_chunks and get_text_embedding_from_chunk.
from collections import Counter

print("="*60)
print("FULL DATASET TEXT EMBEDDING")
print("="*60)

# NOTE(review): despite the banner above, only the first PRODUCT_LIMIT products
# are processed. Set PRODUCT_LIMIT = None to embed every row of unique_df.
PRODUCT_LIMIT = 5
test_df = unique_df if PRODUCT_LIMIT is None else unique_df.head(PRODUCT_LIMIT)
print(f"Embedding text for {len(test_df)} of {len(unique_df)} products")

# Start from empty buffers so this cell does not depend on, or re-submit,
# whatever an earlier cell left behind in these variables.
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []

print("-"*60)
print("EMBEDDING")
print("-"*60)
for _, row in test_df.iterrows():

    # Rows without an ID or without text cannot be embedded.
    if pd.isna(row['Uniq Id']) or pd.isna(row['text_to_embed']):
        continue

    uniq_id = row['Uniq Id']
    full_text = str(row['text_to_embed'])

    # Get chunks for this product
    chunks = list(create_overlapping_chunks(full_text))
    if not chunks:
        continue

    # Look up all of this product's chunk IDs in one call instead of one call
    # per chunk; IDs that come back are already embedded.
    chunk_ids = [f"text_{uniq_id}_{chunk_num}" for chunk_num in range(1, len(chunks) + 1)]
    try:
        already_stored = set(collection.get(ids=chunk_ids)['ids'])
    except Exception as exc:
        # Best effort: if the lookup fails, report it and embed every chunk
        # rather than silently hiding the error.
        print(f"Could not check existing chunks for {uniq_id}: {exc}")
        already_stored = set()

    # Process each chunk
    for chunk_num, text_chunk in enumerate(chunks, 1):

        # Unique ChromaDB ID for this chunk
        unique_chroma_id = f"text_{uniq_id}_{chunk_num}"

        # Skip if chunk was already embedded
        if unique_chroma_id in already_stored:
            print(f"Skipping {unique_chroma_id} - already exists")
            continue

        # Get embedding and metadata
        result = get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_num)
        if result is not None:
            embedding, metadata = result
            if embedding is not None:
                embeddings_to_store.append(embedding.tolist())
                metadatas_to_store.append(metadata)
                ids_to_store.append(unique_chroma_id)

        # Store every BATCH_SIZE embeddings
        if len(embeddings_to_store) >= BATCH_SIZE:
            collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
            print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
            # Clear lists for next batch
            embeddings_to_store = []
            metadatas_to_store = []
            ids_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")
    # Leave the buffers empty so a re-run or a later cell cannot add them again.
    embeddings_to_store = []
    metadatas_to_store = []
    ids_to_store = []

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text vs image embeddings (the ID prefix encodes the embedding type)
text_embeddings = [chroma_id for chroma_id in all_data['ids'] if chroma_id.startswith('text_')]
image_embeddings = [chroma_id for chroma_id in all_data['ids'] if chroma_id.startswith('img_')]
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Image embeddings: {len(image_embeddings)}")

# Check text chunks per product.
# `(meta or {})` tolerates records that were stored without metadata.
text_metadata = [meta for meta in all_data['metadatas'] if (meta or {}).get('type') == 'text']
text_uniq_ids = [meta['uniq_id'] for meta in text_metadata]
unique_text_uniq_ids = set(text_uniq_ids)
print(f"Products with text embeddings: {len(unique_text_uniq_ids)}")

# Show chunks per product
chunks_per_product = Counter(text_uniq_ids)
if text_uniq_ids:
    print(f"Average chunks per product: {len(text_uniq_ids) / len(unique_text_uniq_ids):.1f}")
    print(f"Max chunks for one product: {max(chunks_per_product.values())}")
else:
    # Without this guard the average divides by zero and max() fails on an
    # empty sequence when the collection holds no text embeddings yet.
    print("No text embeddings found - skipping per-product chunk statistics")

# OLD CODE

In [15]:
# HEAVIER METADATA
# def get_image_embedding_from_url(url_string, product_row):
#     """Download image from URL and get CLIP embedding + metadata - no saving on local"""
#     try:
#         uniq_id = product_row['Uniq Id']

#         # split by | and get all urls
#         urls = url_string.split('|')

#         # find first URL that is not a transparent pixel
#         for url in urls:
#             url = url.strip()
#             if 'transparent-pixel.jpg' not in url:
#                 print(f"Trying: {url}")

#                 # download to memory (not disk)
#                 response = requests.get(url)
#                 if response.status_code == 200:
#                     # get image
#                     image = Image.open(BytesIO(response.content))

#                     # get CLIP embedding
#                     embedding = get_image_embedding(image) # This just returns a numpy array - no metadata! I NEED TO ADD METADATA

#                     # Create Metadata

#                     # # very heavy metadata option
#                     # metadata = {
#                     #     "uniq_id": str(uniq_id),
#                     #     "product_name": product_row['Product Name'],
#                     #     "category": product_row['Category'],
#                     #     "price": str(product_row['Selling Price']),
#                     #     "type": "image", # Important: identifies this as image embedding. It is "text" for text
#                     #     "source_url": url
#                     # }

#                     print(f"Got embedding for {uniq_id}")
#                     return embedding, metadata

#     except Exception as e:
#         print(f"ERROR: {e}")
#         return None, None