In [46]:
import pandas as pd
import numpy as np
from IPython.display import display

import requests
from PIL import Image
from io import BytesIO

from transformers import CLIPProcessor, CLIPModel # CLIP
#import longclip # LONG CLIP
import torch

import chromadb
import os

# Reload data
df = pd.read_csv('/Users/brunamedeiros/Documents/GitHub/Amazon-Multimodal-Chatbot/data.csv')
print(f"df has {len(df)} products")
exploded_df = pd.read_csv('/Users/brunamedeiros/Documents/GitHub/Amazon-Multimodal-Chatbot/exploded_df.csv')
print(f"exploded_df has {len(exploded_df)} products")

# Reconnect to vector store

# OLD
#client = chromadb.PersistentClient(path="./my_vectorstore")
#collection = client.get_or_create_collection(name="amazon_products")

# client = chromadb.PersistentClient(path="./my_vectorstore_exploded")
# collection = client.get_or_create_collection(name="amazon_products_exploded") 

# client = chromadb.PersistentClient(path="./my_vectorstore_exploded_v2")
# collection = client.get_or_create_collection(name="amazon_products_exploded_v2") 

client = chromadb.PersistentClient(path="./my_vectorstore_exploded_v3")
collection = client.get_or_create_collection(name="amazon_products_exploded_v3") 
print(f"Vector store was reconnected! total embeddings: {collection.count()}\n")

# CLIP
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

df has 10002 products
exploded_df has 43870 products
Vector store was reconnected! total embeddings: 86643



# DON'T RUN - DELETE EMBEDDINGS

### Image

In [2]:
# # Delete code inside vector store
# all_data = collection.get()
# if all_data['ids']:
#     collection.delete(ids=all_data['ids'])
#     print(f"Deleted {len(all_data['ids'])} embeddings")
# else:
#     print("No embeddings to delete")

<!-- ### Text -->

In [3]:
# # Get all existing text embeddings and delete them
# all_data = collection.get()
# text_ids = [id for id in all_data['ids'] if id.startswith('text_')]

# if text_ids:
#     collection.delete(ids=text_ids)
#     print(f"Deleted {len(text_ids)} existing text embeddings")
# else:
#     print("No existing text embeddings found")

# EDA

- `Image` and `Variants` column, if my understanding is correct, have the identical image. The `Image` column has an actual image while the `Variants` has the link to the Amazon site for that specific product. A lot of those products are not on Amazon anymore so the link leads to an error.

In [4]:

print("="*60)
print("Entire dataset:")
print("="*60)
display(df.head(1))

print("="*60)
print("Dataset shape:")
print("="*60)
print(df.shape)


print("\n")
print("="*60)
print("Columns in the dataset:")
print("="*60)
print(df.columns.tolist())

print("\n")
print("="*60)
print("Column types:")
print("="*60)
print(df.dtypes)

print("\n")
print("="*60)
print("NaN counts:")
print("="*60)
print(df.isna().sum())

print("\n")
print("="*60)
print("Columns where all values are NaN:")
print("="*60)
print("The columns that should be removed because all values are NaN are:")
for col in df.columns:
    if df[col].isna().sum() == df.shape[0]:
        print(f"  {col}")


print("\n")
print("="*60)
print("Columns we can work with")
print("="*60)

for col in df.columns:
    if df[col].isna().sum() != df.shape[0]:
        print(f"  {col}")
valid_cols = [col for col in df.columns if df[col].isna().sum() != df.shape[0]]


print("\n")
print("="*60)
print("Final Dataset")
print("="*60)
display(df[valid_cols].head(1))


Entire dataset:


Unnamed: 0,Uniq Id,Product Name,Brand Name,Asin,Category,Upc Ean Code,List Price,Selling Price,Quantity,Model Number,...,Product Url,Stock,Product Details,Dimensions,Color,Ingredients,Direction To Use,Is Amazon Seller,Size Quantity Variant,Product Description
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",,,Sports & Outdoors | Outdoor Recreation | Skate...,,,$237.68,,,...,https://www.amazon.com/DB-Longboards-CoreFlex-...,,,,,,,Y,,


Dataset shape:
(10002, 28)


Columns in the dataset:
['Uniq Id', 'Product Name', 'Brand Name', 'Asin', 'Category', 'Upc Ean Code', 'List Price', 'Selling Price', 'Quantity', 'Model Number', 'About Product', 'Product Specification', 'Technical Details', 'Shipping Weight', 'Product Dimensions', 'Image', 'Variants', 'Sku', 'Product Url', 'Stock', 'Product Details', 'Dimensions', 'Color', 'Ingredients', 'Direction To Use', 'Is Amazon Seller', 'Size Quantity Variant', 'Product Description']


Column types:
Uniq Id                   object
Product Name              object
Brand Name               float64
Asin                     float64
Category                  object
Upc Ean Code              object
List Price               float64
Selling Price             object
Quantity                 float64
Model Number              object
About Product             object
Product Specification     object
Technical Details         object
Shipping Weight           object
Product Dimensions        objec

Unnamed: 0,Uniq Id,Product Name,Category,Upc Ean Code,Selling Price,Model Number,About Product,Product Specification,Technical Details,Shipping Weight,Product Dimensions,Image,Variants,Product Url,Is Amazon Seller
0,4c69b61db1fc16e7013b43fc926e502d,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib...",Sports & Outdoors | Outdoor Recreation | Skate...,,$237.68,,Make sure this fits by entering your model num...,Shipping Weight: 10.7 pounds (View shipping ra...,,10.7 pounds,,https://images-na.ssl-images-amazon.com/images...,https://www.amazon.com/DB-Longboards-CoreFlex-...,https://www.amazon.com/DB-Longboards-CoreFlex-...,Y


# Data Cleaning
- Strip `Uniq id` column
    - we will use that as naming for the images and metadata for embedding. we are using .strip() to ensure no errors arise later
- Clean URLs: ended up not using this one because checking every URL will take a long time

In [5]:
print("Cleaning Uniq Id column (.strip())...")
df['Uniq Id'] = df['Uniq Id'].astype(str).str.strip()
print()
print(df['Uniq Id'].head())

# Check for any issues
print(f"\nUnique IDs: {df['Uniq Id'].nunique()}")
print(f"Any duplicates: {df['Uniq Id'].duplicated().sum()}")

Cleaning Uniq Id column (.strip())...

0    4c69b61db1fc16e7013b43fc926e502d
1    66d49bbed043f5be260fa9f7fbff5957
2    2c55cae269aebf53838484b0d7dd931a
3    18018b6bc416dab347b1b7db79994afa
4    e04b990e95bf73bbe6a3fa09785d7cd0
Name: Uniq Id, dtype: object

Unique IDs: 10002
Any duplicates: 0


# Image Embedding

Each inidivual row under `Image` column has more than one https link in it.

For instance, row 1: `https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg|https://images-na.ssl-images-amazon.com/images/I/31hKM3cSoSL.jpg|https://images-na.ssl-images-amazon.com/images/I/51WlHdwghfL.jpg|https://images-na.ssl-images-amazon.com/images/I/51FsyLRBzwL.jpg|https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg`

Each product has more than 1 image (showing different perspectives of product). Instead of putting them in a different column, they concatenated all URLs in the same one, dividing them by the |

---

The `get_image_embedding_from_url` does the following:
- skip the transparent pixel
    - the transparent pixel is a 1x1 pixel invisible image that Amazon uses a placeholder/tracking pixel.It looks like this `https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg`. 
- Name images as the `Uniq Id`: easy to look up
    - When we connect to Chroma and create embeddings, we can store `Uniq Id` as metadata

### CLIP with exploded_df

In [6]:
def get_image_embedding(image):
    inputs = processor(images=[image], return_tensors="pt", padding=True)
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
    return embedding.numpy()[0]


def get_image_embedding_from_single_url(url_string, uniq_id):
    """Download image from URL and get CLIP embedding + metadata - no saving on local"""
    url = url_string.strip()
    try:
        # find first URL that is not a transparent pixel
        if 'transparent-pixel.jpg' not in url:
            print(f"Trying: {url}")

            # download to memory (not disk)
            response = requests.get(url)
            if response.status_code == 200:
                
                # get image
                image = Image.open(BytesIO(response.content))

                # get CLIP embedding
                embedding = get_image_embedding(image) # This just returns a numpy array - no metadata! Therefore, need to add metadata

                # Metadata
                metadata = {
                    "uniq_id": str(uniq_id),
                    "type": "image"
                }
                return embedding, metadata
        
        # If no URLs worked
        print(f"No valid URLs found for {uniq_id}")
        return None, None

    except Exception as e:
        print(f"ERROR: {e}")
        return None, None

# BATCH PROCESSING SETUP
BATCH_SIZE = 100  # Process 100 images at a time
documents_to_store = []
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []

In [7]:
print("="*60)
print("TESTING IMAGE EMBEDDING")
print("="*60)


print("-"*60)
print("Selecting 4 images linked to same product")
print("-"*60)
# Testing with 4 rows (those 4 images are linked to the same product)
test_df = exploded_df[exploded_df['Uniq Id'] == '4c69b61db1fc16e7013b43fc926e502d'].head(4)
print(test_df[['Uniq Id', 'Image']])


print("-"*60)
print("EMBEDDING")
print("-"*60)
#for i in range(len(exploded_df)):
for i in range(len(test_df)):

    row = test_df.iloc[i]
    if pd.notna(row['Image']):
        uniq_id = row['Uniq Id']
        single_url = row['Image']

        # create unique ChromaDB ID for each image
        unique_chroma_id = f"img_{uniq_id}_{i}"


        # Skip if image was already embedded
        try:
            existing = collection.get(ids=[unique_chroma_id])
            if existing['ids']: 
                print(f"Skipping {unique_chroma_id} - already exists")
                continue
        except:
            pass

        embedding, metadata = get_image_embedding_from_single_url(single_url, uniq_id)  # Just like old code
        if embedding is not None:  # Just like old code
            embeddings_to_store.append(embedding.tolist())
            metadatas_to_store.append(metadata)
            ids_to_store.append(unique_chroma_id)
            documents_to_store.append(single_url)
            
            #print(f"Embedding shape: {embedding.shape}")
            #print("Metadata:", metadata)
            #print("-" * 50)
        
        # Store every BATCH_SIZE embeddings
        if len(embeddings_to_store) >= BATCH_SIZE:
            collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
            print(f"Stored batch of {len(embeddings_to_store)} embeddings")
            # Clear lists for next batch
            documents_to_store = []
            embeddings_to_store = []
            metadatas_to_store = []
            ids_to_store = []
            

# Store final batch (if any remaining)
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} embeddings")


print("-"*60)
print("Metadata Analysis")
print("-"*60)

# Showing metadata 
all_data = collection.get()
print(f"Total embeddings: {len(all_data['ids'])}")

print(f"\nDocuments stored: {all_data['documents'] is not None}")
if all_data['documents']:
    print(f"Sample document: {all_data['documents'][0][:200]}...")  # Show first 50 chars
    
print("\nMetadata for each embedding:")
for i, (id, metadata) in enumerate(zip(all_data['ids'], all_data['metadatas'])):
   print(f"{i+1}. ID: {id} | Metadata: {metadata}")

# Check if all uniq_ids are the same
uniq_ids = [metadata['uniq_id'] for metadata in all_data['metadatas']]
all_same = len(set(uniq_ids)) == 1
print(f"\nAll uniq_ids are the same: {all_same}")
if all_same:
   print(f"Uniq ID: {uniq_ids[0]}") 
   print(f"Ready to run on full dataset!")

TESTING IMAGE EMBEDDING
------------------------------------------------------------
Selecting 4 images linked to same product
------------------------------------------------------------
                            Uniq Id  \
0  4c69b61db1fc16e7013b43fc926e502d   
1  4c69b61db1fc16e7013b43fc926e502d   
2  4c69b61db1fc16e7013b43fc926e502d   
3  4c69b61db1fc16e7013b43fc926e502d   

                                               Image  
0  https://images-na.ssl-images-amazon.com/images...  
1  https://images-na.ssl-images-amazon.com/images...  
2  https://images-na.ssl-images-amazon.com/images...  
3  https://images-na.ssl-images-amazon.com/images...  
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Skipping img_4c69b61db1fc16e7013b43fc926e502d_0 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_1 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_2 - already exists
Skipping img_4

In [8]:
print("="*60)
print("FULL DATASET IMAGE EMBEDDING")
print("="*60)


print("-"*60)
print("EMBEDDING")
print("-"*60)
for i in range(len(exploded_df)):
    row = exploded_df.iloc[i]
    if pd.notna(row['Image']):
        uniq_id = row['Uniq Id']
        single_url = row['Image']

        # create unique ChromaDB ID for each image
        unique_chroma_id = f"img_{uniq_id}_{i}"


        # Skip if image was already embedded
        try:
            existing = collection.get(ids=[unique_chroma_id])
            if existing['ids']: 
                print(f"Skipping {unique_chroma_id} - already exists")
                continue
        except:
            pass

        embedding, metadata = get_image_embedding_from_single_url(single_url, uniq_id)  # Just like old code
        if embedding is not None:  # Just like old code
            embeddings_to_store.append(embedding.tolist())
            metadatas_to_store.append(metadata)
            ids_to_store.append(unique_chroma_id)
            documents_to_store.append(single_url)
            
            #print(f"Embedding shape: {embedding.shape}")
            #print("Metadata:", metadata)
            #print("-" * 50)
        
        # Store every BATCH_SIZE embeddings
        if len(embeddings_to_store) >= BATCH_SIZE:
            collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
            print(f"Stored batch of {len(embeddings_to_store)} embeddings")
            # Clear lists for next batch
            embeddings_to_store = []
            metadatas_to_store = []
            ids_to_store = []
            documents_to_store = []

# Store final batch (if any remaining)
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} embeddings")


print("-"*60)
print("RESULTS")
print("-"*60)

all_data = collection.get()
print(f"Total embeddings: {len(all_data['ids'])}")

# Count unique uniq_ids
uniq_ids = [metadata['uniq_id'] for metadata in all_data['metadatas']]
unique_uniq_ids = set(uniq_ids)
print(f"Number of different uniq_ids: {len(unique_uniq_ids)}")
#print(f"Uniq IDs found: {list(unique_uniq_ids)}")

FULL DATASET IMAGE EMBEDDING
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Skipping img_4c69b61db1fc16e7013b43fc926e502d_0 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_1 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_2 - already exists
Skipping img_4c69b61db1fc16e7013b43fc926e502d_3 - already exists
No valid URLs found for 4c69b61db1fc16e7013b43fc926e502d
Skipping img_66d49bbed043f5be260fa9f7fbff5957_5 - already exists
Skipping img_66d49bbed043f5be260fa9f7fbff5957_6 - already exists
Skipping img_66d49bbed043f5be260fa9f7fbff5957_7 - already exists
Skipping img_66d49bbed043f5be260fa9f7fbff5957_8 - already exists
Skipping img_66d49bbed043f5be260fa9f7fbff5957_9 - already exists
Skipping img_66d49bbed043f5be260fa9f7fbff5957_10 - already exists
No valid URLs found for 66d49bbed043f5be260fa9f7fbff5957
Skipping img_2c55cae269aebf53838484b0d7dd931a_12 - already exists
No valid

In [9]:
print("="*60)
print("VECTOR STORE INFO:")
print("="*60)
print(f"Collection name: {collection.name}")
print(f"Total embeddings: {collection.count()}")

# Check what's inside
if collection.count() > 0:
    peek = collection.peek(limit=3)
    print(f"\nSample IDs: {peek['ids']}")
    print(f"\nSample metadata: {peek['metadatas']}")
else:
    print("No embeddings stored yet")

VECTOR STORE INFO:
Collection name: amazon_products_exploded_v3
Total embeddings: 33975

Sample IDs: ['img_4c69b61db1fc16e7013b43fc926e502d_0', 'img_4c69b61db1fc16e7013b43fc926e502d_1', 'img_4c69b61db1fc16e7013b43fc926e502d_2']

Sample metadata: [{'uniq_id': '4c69b61db1fc16e7013b43fc926e502d', 'type': 'image'}, {'uniq_id': '4c69b61db1fc16e7013b43fc926e502d', 'type': 'image'}, {'uniq_id': '4c69b61db1fc16e7013b43fc926e502d', 'type': 'image'}]


`Sample IDs` = Chroma's interal IDs (chroma's way of finding embedding)

`Uniq ID` = our product ID in metadata

---

# Text Embedding

For text embedding, there were 2 options:
- CLIP 
    - PROBLEM: 77 token limit
- Another LM model like all-mini
    - Won't align with CLIP image embeddings

We opted to go with CLIP embeddings. We opted to do chunking instead of cutting down the text size to not lose valuable information.

In [10]:
# Text concatenation
def create_product_text(row):
    """Concatenate all text columns into one description"""
    text_parts = []
    
    # product name
    if pd.notna(row['Product Name']):
        text_parts.append(f"This product is {row['Product Name']}")
    
    # category
    if pd.notna(row['Category']):
        text_parts.append(f"It falls under the category of {row['Category']}")
    
    # price
    if pd.notna(row['Selling Price']):
        text_parts.append(f"The price is {row['Selling Price']}")
    
    # model number
    if pd.notna(row['Model Number']):
        text_parts.append(f"The model number is {row['Model Number']}")
    
    # main description
    if pd.notna(row['About Product']):
        text_parts.append(f"Product description: {row['About Product']}")
    
    # technical details
    if pd.notna(row['Technical Details']):
        text_parts.append(f"Technical specifications: {row['Technical Details']}")
    
    # shipping info
    if pd.notna(row['Shipping Weight']):
        text_parts.append(f"Shipping weight is {row['Shipping Weight']}")
    
    # dimensions
    if pd.notna(row['Product Dimensions']):
        text_parts.append(f"Product dimensions are {row['Product Dimensions']}")
    
    # seller info
    if pd.notna(row['Is Amazon Seller']):
        seller_text = "The product is sold by Amazon" if str(row['Is Amazon Seller']).lower() == 'true' else "The product is not sold by Amazon"
        text_parts.append(f"This item is {seller_text}")
    
    # combine into one text
    return ". ".join(text_parts) + "."

# Test it
sample_text = create_product_text(exploded_df.iloc[0])
print("="*60)
print("TESTING...")
print("="*60)
print(sample_text)

# create new column
print("\nCreating new column...")
exploded_df['text_to_embed'] = exploded_df.apply(create_product_text,axis=1)
print("...new column created!")


TESTING...
This product is DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete. It falls under the category of Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards. The price is $237.68. Product description: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to none. Pumping & carving have never been so satisfying! Flex 2 is recommended for people 120 to 170 pounds. | COREFLEX TECH: CoreFlex construction is water resistant, impact resistant, scratch resistant and has a flex like you won’t believe. These boards combine fiberglass, epoxy, HD plastic and bamboo to create a perfect blend of performance and strength. | INSPIRED BY THE NORTHWEST: Our founding ideal is chasing adventure & riding the best boards possible, inspired by the hills, wav

In [11]:
# FUNCTION SETUP
def get_text_embedding(text):
    inputs = processor(text=[text], return_tensors="pt", padding=True, truncation=True, max_length=77)
    with torch.no_grad():
        embedding = model.get_text_features(**inputs)
    return embedding.numpy()[0]

def create_overlapping_chunks(text, chunk_size=77, overlap=15):
    """Split text into overlapping chunks of specified token size"""
    try:
        # Tokenize the full text
        inputs = processor(text=[text], return_tensors="pt", padding=True)
        tokens = inputs['input_ids'][0]  # Get token IDs
        
        chunks = []
        start = 0
        
        while start < len(tokens):
            # Get chunk of tokens
            end = min(start + chunk_size, len(tokens))
            chunk_tokens = tokens[start:end]
            
            # Decode back to text
            chunk_text = processor.tokenizer.decode(chunk_tokens, skip_special_tokens=True)
            chunks.append(chunk_text)
            
            # Move start position with overlap
            if end >= len(tokens):
                break
            start = end - overlap
        
        return chunks
    except Exception as e:
        print(f"Error chunking text: {e}")
        return [text[:500]]  # Fallback to first 500 chars

def get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_number):
    """Get CLIP text embedding + metadata"""
    try:
        if text_chunk.strip():
            print(f"Processing text chunk {chunk_number} for {uniq_id}")
            embedding = get_text_embedding(text_chunk)
            
            metadata = {
                "uniq_id": str(uniq_id),
                "type": "text",
                "chunk": chunk_number
            }
            return embedding, metadata
        return None, None
    except Exception as e:
        print(f"ERROR: {e}")
        return None, None

# BATCH PROCESSING SETUP
BATCH_SIZE = 100
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []


# Get unique products only for text embedding
print("-"*60)
print("Deleting repeated text columns (result of 'explosion')")
print("-"*60)
unique_df = exploded_df.drop_duplicates(subset=['Uniq Id'])
print(f"Using {len(unique_df)} unique products for text embedding")

------------------------------------------------------------
Deleting repeated text columns (result of 'explosion')
------------------------------------------------------------
Using 10002 unique products for text embedding


In [13]:
print("="*60)
print("TESTING TEXT EMBEDDING")
print("="*60)

test_df = unique_df.head(3)

embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []


print("-"*60)
print("EMBEDDING")
print("-"*60)
for i in range(len(test_df)):
    row = test_df.iloc[i]
    
    if pd.notna(row['Uniq Id']) and pd.notna(row['text_to_embed']):
        uniq_id = row['Uniq Id']
        full_text = str(row['text_to_embed'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{uniq_id}_{chunk_num}"

            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing['ids']: 
                    print(f"Skipping {unique_chroma_id} - already exists")
                    continue
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding.tolist())
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text vs image embeddings
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
image_embeddings = [id for id in all_data['ids'] if id.startswith('img_')]
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Image embeddings: {len(image_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_uniq_ids = [meta['uniq_id'] for meta in text_metadata]
unique_text_uniq_ids = set(text_uniq_ids)
print(f"Products with text embeddings: {len(unique_text_uniq_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_uniq_ids)
print(f"Average chunks per product: {len(text_uniq_ids) / len(unique_text_uniq_ids):.1f}")
print(f"Max chunks for one product: {max(chunks_per_product.values())}")

TESTING TEXT EMBEDDING
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Processing text chunk 1 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 2 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 3 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 4 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 5 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 6 for 4c69b61db1fc16e7013b43fc926e502d
Processing text chunk 1 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 2 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 3 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 4 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 5 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 6 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 7 for 66d49bbed043f5be260fa9f7fbff5957
Processing text chunk 8 for 66d49bbed043f5be260fa9f7

In [15]:
# FINAL VERIFICATION CHECK
print("="*60)
print("FINAL VECTOR STORE VERIFICATION")
print("="*60)

# Check collection details
print(f"Collection name: {collection.name}")
print(f"Total embeddings: {collection.count()}")

# Get all data
all_data = collection.get()

print(f"Documents stored: {all_data['documents'] is not None}")
if all_data['documents']:
    print(f"Sample text document: {all_data['documents'][0][:100]}...")
    print(f"Sample image document: {[doc for doc in all_data['documents'] if doc and doc.startswith('http')][0][:50]}...")

# Count by type
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
image_embeddings = [id for id in all_data['ids'] if id.startswith('img_')]

print(f"\nImage embeddings: {len(image_embeddings)}")
print(f"Text embeddings: {len(text_embeddings)}")

# Check they have the same uniq_ids (products)
text_uniq_ids = set([meta['uniq_id'] for meta in all_data['metadatas'] if meta['type'] == 'text'])
image_uniq_ids = set([meta['uniq_id'] for meta in all_data['metadatas'] if meta['type'] == 'image'])

print(f"\nProducts with text embeddings: {len(text_uniq_ids)}")
print(f"Products with image embeddings: {len(image_uniq_ids)}")
print(f"Products with BOTH text and images: {len(text_uniq_ids & image_uniq_ids)}")

# Sample IDs to verify format
print(f"\nSample image IDs: {image_embeddings[:3] if image_embeddings else 'None'}")
print(f"Sample text IDs: {text_embeddings[:3] if text_embeddings else 'None'}")

# Quick search test to make sure both work
try:
    query_text = "longboard skateboard"
    query_embedding = get_text_embedding(query_text)  # 512 dims

    results = collection.query(
        query_embeddings=[query_embedding.tolist()],  # 512 dims
        n_results=5
    )

    result_types = [meta['type'] for meta in results['metadatas'][0]]
    print(f"\nSearch test - found types: {set(result_types)}")
    print("Both text and image embeddings are searchable!")
except Exception as e:
    print(f"Search test failed: {e}")

print("\n" + "="*60)

FINAL VECTOR STORE VERIFICATION
Collection name: amazon_products_exploded_v3
Total embeddings: 33997
Documents stored: True
Sample text document: https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg...
Sample image document: https://images-na.ssl-images-amazon.com/images/I/5...

Image embeddings: 33975
Text embeddings: 22

Products with text embeddings: 3
Products with image embeddings: 9980
Products with BOTH text and images: 3

Sample image IDs: ['img_4c69b61db1fc16e7013b43fc926e502d_0', 'img_4c69b61db1fc16e7013b43fc926e502d_1', 'img_4c69b61db1fc16e7013b43fc926e502d_2']
Sample text IDs: ['text_4c69b61db1fc16e7013b43fc926e502d_1', 'text_4c69b61db1fc16e7013b43fc926e502d_2', 'text_4c69b61db1fc16e7013b43fc926e502d_3']

Search test - found types: {'text'}
Both text and image embeddings are searchable!



In [16]:
print("="*60)
print("FULL DATASET TEXT EMBEDDING")
print("="*60)

BATCH_SIZE = 100
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []


print("-"*60)
print("EMBEDDING")
for i in range(len(unique_df)): 
    #row = test_df.iloc[i]
    row = unique_df.iloc[i]  
    
    if pd.notna(row['Uniq Id']) and pd.notna(row['text_to_embed']):
        uniq_id = row['Uniq Id']
        full_text = str(row['text_to_embed'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{uniq_id}_{chunk_num}"

            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing['ids']: 
                    print(f"Skipping {unique_chroma_id} - already exists")
                    continue
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding.tolist())
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text vs image embeddings
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
image_embeddings = [id for id in all_data['ids'] if id.startswith('img_')]
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Image embeddings: {len(image_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_uniq_ids = [meta['uniq_id'] for meta in text_metadata]
unique_text_uniq_ids = set(text_uniq_ids)
print(f"Products with text embeddings: {len(unique_text_uniq_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_uniq_ids)
print(f"Average chunks per product: {len(text_uniq_ids) / len(unique_text_uniq_ids):.1f}")
print(f"Max chunks for one product: {max(chunks_per_product.values())}")

FULL DATASET TEXT EMBEDDING
------------------------------------------------------------
EMBEDDING
Skipping text_4c69b61db1fc16e7013b43fc926e502d_1 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_2 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_3 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_4 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_5 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_6 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_1 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_2 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_3 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_4 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_5 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_6 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_7 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5

In [17]:
# Verify no duplicate chunk IDs
text_ids = [id for id in all_data['ids'] if id.startswith('text_')]
unique_text_ids = set(text_ids)
print(f"Text embeddings: {len(text_ids)}")
print(f"Unique text IDs: {len(unique_text_ids)}")
print(f"Duplicates: {len(text_ids) - len(unique_text_ids)}")

Text embeddings: 52668
Unique text IDs: 52668
Duplicates: 0


In [38]:
# Systematic RAG Debugging Experiments
# Run these in your code.ipynb to isolate the exact problem

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

print("=" * 80)
print("RAG SYSTEM DEBUGGING - SYSTEMATIC EXPERIMENTS")
print("=" * 80)

# EXPERIMENT 1: EMBEDDING QUALITY TEST
print("\n1. EMBEDDING QUALITY TEST")
print("-" * 50)

# Test if your embeddings can distinguish between similar concepts
test_texts = [
    "Samsung Galaxy smartphone",
    "iPhone smartphone", 
    "LEGO Minecraft building set",
    "Star Wars action figure",
    "kitchen mixer appliance"
]

embeddings = []
for text in test_texts:
    emb = get_text_embedding(text)
    embeddings.append(emb)
    print(f"Embedded: '{text}' -> shape {emb.shape}")

# Compute similarity matrix
embeddings_matrix = np.array(embeddings)
similarity_matrix = cosine_similarity(embeddings_matrix)

print("\nSimilarity Matrix (should show smartphones are similar, toys are similar):")
print("Texts:", [t[:20] for t in test_texts])
for i, row in enumerate(similarity_matrix):
    print(f"{test_texts[i][:20]:20}", [f"{val:.3f}" for val in row])

# Check if Samsung and iPhone are more similar than Samsung and LEGO
samsung_iphone_sim = similarity_matrix[0, 1]
samsung_lego_sim = similarity_matrix[0, 2]
print(f"\nSamsung-iPhone similarity: {samsung_iphone_sim:.3f}")
print(f"Samsung-LEGO similarity: {samsung_lego_sim:.3f}")
print(f"✅ Embeddings work correctly: {samsung_iphone_sim > samsung_lego_sim}")

# EXPERIMENT 2: CHUNKING STRATEGY ANALYSIS
print("\n2. CHUNKING STRATEGY ANALYSIS")
print("-" * 50)

# Test your chunking function with sample text
sample_product_text = """
This product is Samsung Galaxy S21 Ultra 5G smartphone. It falls under the category of Electronics | Cell Phones & Accessories | Cell Phones. The price is $999.99. 
Product description: The Samsung Galaxy S21 Ultra features a 6.8-inch Dynamic AMOLED display, quad camera setup with 108MP main sensor, 
S Pen support, 5000mAh battery, Snapdragon 888 processor, up to 512GB storage, 5G connectivity, and Android 11. 
Technical specifications: 6.8-inch screen, 108MP + 12MP + 10MP + 10MP cameras, 5000mAh battery, Snapdragon 888, Android 11.
This item is the product is not sold by amazon. Method ship it!
"""

# Test chunking
chunks = create_overlapping_chunks(sample_product_text)
print(f"Original text length: {len(sample_product_text)} chars")
print(f"Number of chunks created: {len(chunks)}")
print(f"Chunk size limit: 77 tokens")

for i, chunk in enumerate(chunks):
    # Count tokens (rough estimate)
    token_count = len(chunk.split())
    print(f"\nChunk {i+1} ({token_count} words): {chunk[:100]}...")
    
    # Check if chunk contains meaningful content
    meaningful_keywords = ['samsung', 'galaxy', 's21', 'camera', 'battery', 'processor']
    keywords_found = [kw for kw in meaningful_keywords if kw.lower() in chunk.lower()]
    print(f"  Keywords found: {keywords_found}")
    print(f"  Meaningful: {'Yes' if keywords_found else 'No'}")

# EXPERIMENT 3: RETRIEVAL PRECISION TEST
print("\n3. RETRIEVAL PRECISION TEST")
print("-" * 50)

# Test different query formulations for the same concept
query_variations = [
    "Samsung Galaxy S21",
    "Samsung Galaxy S21 smartphone", 
    "Samsung Galaxy S21 features",
    "Galaxy S21 phone",
    "Samsung S21 mobile device"
]

for query in query_variations:
    print(f"\nTesting query: '{query}'")
    query_embedding = get_text_embedding(query)
    
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=5,
        include=['metadatas', 'documents']
    )
    
    # Analyze results
    samsung_matches = 0
    galaxy_matches = 0
    meaningful_matches = 0
    
    for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
        if doc:
            doc_lower = doc.lower()
            if 'samsung' in doc_lower:
                samsung_matches += 1
            if 'galaxy' in doc_lower:
                galaxy_matches += 1
            if len(doc.strip()) > 50 and not doc_lower.startswith('this item is the product'):
                meaningful_matches += 1
    
    print(f"  Samsung mentions: {samsung_matches}/5")
    print(f"  Galaxy mentions: {galaxy_matches}/5") 
    print(f"  Meaningful content: {meaningful_matches}/5")
    print(f"  Top result: {results['documents'][0][0][:80] if results['documents'][0][0] else 'None'}...")

# EXPERIMENT 4: PROMPT EMBEDDING VS DOCUMENT EMBEDDING
print("\n4. PROMPT EMBEDDING VS DOCUMENT EMBEDDING")
print("-" * 50)

# Test if your query embeddings are in the same space as document embeddings
query = "Samsung Galaxy smartphone"
query_embedding = get_text_embedding(query)

# Get some document embeddings by searching for known good content
lego_query = "LEGO Minecraft"
lego_embedding = get_text_embedding(lego_query)
lego_results = collection.query(
    query_embeddings=[lego_embedding.tolist()],
    n_results=3,
    include=['metadatas', 'documents']
)

print(f"Query embedding shape: {query_embedding.shape}")
print(f"Query embedding stats: mean={np.mean(query_embedding):.4f}, std={np.std(query_embedding):.4f}")

# Test direct similarity to see if embeddings are in similar ranges
if lego_results['documents'][0]:
    # Re-embed a good document and compare
    good_doc = lego_results['documents'][0][0]
    if good_doc and len(good_doc) > 20:
        doc_reembedding = get_text_embedding(good_doc)
        
        # Compare query embedding vs document re-embedding
        similarity = cosine_similarity([query_embedding], [doc_reembedding])[0][0]
        print(f"Query-Document embedding similarity: {similarity:.4f}")
        print(f"Document re-embedded: {good_doc[:60]}...")

# EXPERIMENT 5: DATABASE CONTENT SPOT CHECK
print("\n5. DATABASE CONTENT SPOT CHECK")
print("-" * 50)

# Manually check if Samsung content exists by searching product IDs
print("Searching for products with 'samsung' in text...")

# Search through a sample of your data
sample_data = collection.get(limit=1000)  # Check first 1000 items
samsung_items = []

for i, (doc, meta) in enumerate(zip(sample_data['documents'], sample_data['metadatas'])):
    if doc and 'samsung' in doc.lower():
        samsung_items.append((doc, meta))

print(f"Samsung items found in first 1000: {len(samsung_items)}")

if samsung_items:
    print("Sample Samsung content found:")
    for i, (doc, meta) in enumerate(samsung_items[:3]):
        print(f"  {i+1}. Type: {meta.get('type')}, Content: {doc[:100]}...")
else:
    print("❌ No Samsung content found in sample")

# EXPERIMENT 6: END-TO-END RETRIEVAL TEST
print("\n6. END-TO-END RETRIEVAL TEST")
print("-" * 50)

# Test the complete retrieval pipeline like your Streamlit app does
test_queries = ["LEGO Minecraft", "Samsung Galaxy", "kitchen mixer"]

for query in test_queries:
    print(f"\n--- Testing: '{query}' ---")
    
    # Simulate your app's retrieval
    query_embedding = get_text_embedding(query)
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=15,  # Get more like we discussed
        include=['metadatas', 'documents']
    )
    
    # Apply your formatting logic
    text_snippets = []
    image_urls = []
    
    for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
        if meta.get("type") == "text" and doc:
            if len(doc.strip()) > 30:  # Basic quality filter
                text_snippets.append(doc)
        elif meta.get("type") == "image" and doc:
            image_urls.append(doc)
    
    final_context = "\n\n".join(text_snippets[:5])  # Limit context
    
    print(f"  Retrieved {len(results['documents'][0])} total items")
    print(f"  Quality text chunks: {len(text_snippets)}")
    print(f"  Images found: {len(image_urls)}")
    print(f"  Final context length: {len(final_context)} chars")
    print(f"  Context preview: {final_context[:150]}...")
    
    # Check if context would help LLM answer
    query_words = query.lower().split()
    context_lower = final_context.lower()
    matching_words = [word for word in query_words if word in context_lower]
    print(f"  Query words in context: {matching_words}")

print("\n" + "=" * 80)
print("DEBUGGING COMPLETE - ANALYZE RESULTS ABOVE")
print("=" * 80)

Token indices sequence length is longer than the specified maximum sequence length for this model (171 > 77). Running this sequence through the model will result in indexing errors


RAG SYSTEM DEBUGGING - SYSTEMATIC EXPERIMENTS

1. EMBEDDING QUALITY TEST
--------------------------------------------------
Embedded: 'Samsung Galaxy smartphone' -> shape (512,)
Embedded: 'iPhone smartphone' -> shape (512,)
Embedded: 'LEGO Minecraft building set' -> shape (512,)
Embedded: 'Star Wars action figure' -> shape (512,)
Embedded: 'kitchen mixer appliance' -> shape (512,)

Similarity Matrix (should show smartphones are similar, toys are similar):
Texts: ['Samsung Galaxy smart', 'iPhone smartphone', 'LEGO Minecraft build', 'Star Wars action fig', 'kitchen mixer applia']
Samsung Galaxy smart ['1.000', '0.887', '0.592', '0.731', '0.757']
iPhone smartphone    ['0.887', '1.000', '0.595', '0.738', '0.772']
LEGO Minecraft build ['0.592', '0.595', '1.000', '0.676', '0.600']
Star Wars action fig ['0.731', '0.738', '0.676', '1.000', '0.720']
kitchen mixer applia ['0.757', '0.772', '0.600', '0.720', '1.000']

Samsung-iPhone similarity: 0.887
Samsung-LEGO similarity: 0.592
✅ Embeddings wo

In [39]:
# Check if images exist for successful text queries
lego_results = collection.query(
    query_embeddings=[get_text_embedding("LEGO Minecraft").tolist()],
    n_results=20,  # Get more results
    include=['metadatas', 'documents']
)

# Count types
text_count = sum(1 for meta in lego_results['metadatas'][0] if meta.get('type') == 'text')
image_count = sum(1 for meta in lego_results['metadatas'][0] if meta.get('type') == 'image') 

print(f"LEGO query results: {text_count} text, {image_count} images")

# If image_count = 0, your text-image embeddings are completely separated

LEGO query results: 20 text, 0 images


In [40]:
# Search your actual database for the exact Samsung text you expect
expected_samsung_text = "Samsung Galaxy S21 comes with a 6.2-inch Dynamic AMOLED display"

# Search through your database more exhaustively
large_sample = collection.get(limit=5000)  # Check more items
samsung_found = []

for i, (doc, meta) in enumerate(zip(large_sample['documents'], large_sample['metadatas'])):
    if doc and ('galaxy s21' in doc.lower() or 'samsung galaxy' in doc.lower()):
        samsung_found.append((doc, meta))
        print(f"FOUND SAMSUNG: {doc[:100]}...")

print(f"Samsung items found in 5000 items: {len(samsung_found)}")

Samsung items found in 5000 items: 0


In [41]:
# Test if you can force image retrieval using different approaches
query = "LEGO Minecraft"
query_embedding = get_text_embedding(query)

# Method 1: Search specifically for images
image_only_results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=20,
    include=['metadatas', 'documents'],
    where={"type": "image"}  # Force image search
)

print(f"Image-only search results: {len(image_only_results['documents'][0])}")

# Method 2: Get much larger result set
large_results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=100,  # Much larger
    include=['metadatas', 'documents']
)

image_count = sum(1 for meta in large_results['metadatas'][0] if meta.get('type') == 'image')
print(f"Images in top 100 results: {image_count}")

Image-only search results: 20
Images in top 100 results: 0


In [42]:
# Go back to your original data and check what should have been embedded
print("=== CHECKING ORIGINAL DATA vs EMBEDDED DATA ===")

# Check your unique_df for Samsung content
samsung_in_original = unique_df[unique_df['text_to_embed'].str.contains('samsung.*galaxy|galaxy.*s21', case=False, na=False, regex=True)]
print(f"Samsung products in original unique_df: {len(samsung_in_original)}")

if len(samsung_in_original) > 0:
    print("Sample Samsung from original data:")
    sample_samsung = samsung_in_original.iloc[0]
    print(f"Uniq ID: {sample_samsung['Uniq Id']}")
    print(f"Original text: {sample_samsung['text_to_embed'][:200]}...")
    
    # NOW CHECK: Did this specific product get embedded?
    sample_id = sample_samsung['Uniq Id']
    
    # Search your collection for this specific product
    embedded_items = collection.get(where={"uniq_id": sample_id})
    print(f"\nItems in v3 database for this Samsung product: {len(embedded_items['ids'])}")
    
    if embedded_items['ids']:
        print("What got embedded for this Samsung product:")
        for i, (id, meta, doc) in enumerate(zip(embedded_items['ids'], embedded_items['metadatas'], embedded_items['documents'])):
            print(f"  {i+1}. ID: {id}")
            print(f"     Type: {meta.get('type')}")
            print(f"     Content: {doc[:100] if doc else 'None'}...")
    else:
        print("❌ This Samsung product was NEVER embedded into v3!")

=== CHECKING ORIGINAL DATA vs EMBEDDED DATA ===
Samsung products in original unique_df: 1
Sample Samsung from original data:
Uniq ID: ab8173b65f1c2a497b1bca49ddb6dc1f
Original text: This product is Ultimaxx Phone Monitor Sun Shade Cover Tablets Pad Hood for DJI Phantom 4/3, Mavic Pro, Inspire, OSMO, M600 Monitor Remote Controller (Small Tablet 8 Inch). It falls under the category...

Items in v3 database for this Samsung product: 11
What got embedded for this Samsung product:
  1. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7689
     Type: image
     Content: https://images-na.ssl-images-amazon.com/images/I/4105KghjJ-L.jpg...
  2. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7690
     Type: image
     Content: https://images-na.ssl-images-amazon.com/images/I/31IupTXN5UL.jpg...
  3. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7691
     Type: image
     Content: https://images-na.ssl-images-amazon.com/images/I/411FPLaE9JL.jpg...
  4. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7692
     Type: image
 

In [43]:
# Go back to your original data and check what should have been embedded
print("=== CHECKING ORIGINAL DATA vs EMBEDDED DATA ===")

# Check your unique_df for Samsung content
samsung_in_original = unique_df[unique_df['text_to_embed'].str.contains('samsung.*galaxy|galaxy.*s21', case=False, na=False, regex=True)]
print(f"Samsung products in original unique_df: {len(samsung_in_original)}")

if len(samsung_in_original) > 0:
    print("Sample Samsung from original data:")
    sample_samsung = samsung_in_original.iloc[0]
    print(f"Uniq ID: {sample_samsung['Uniq Id']}")
    print(f"Original text: {sample_samsung['text_to_embed'][:200]}...")
    
    # NOW CHECK: Did this specific product get embedded?
    sample_id = sample_samsung['Uniq Id']
    
    # Search your collection for this specific product
    embedded_items = collection.get(where={"uniq_id": sample_id})
    print(f"\nItems in v3 database for this Samsung product: {len(embedded_items['ids'])}")
    
    if embedded_items['ids']:
        print("What got embedded for this Samsung product:")
        for i, (id, meta, doc) in enumerate(zip(embedded_items['ids'], embedded_items['metadatas'], embedded_items['documents'])):
            print(f"  {i+1}. ID: {id}")
            print(f"     Type: {meta.get('type')}")
            print(f"     Content: {doc[:100] if doc else 'None'}...")
    else:
        print("❌ This Samsung product was NEVER embedded into v3!")

=== CHECKING ORIGINAL DATA vs EMBEDDED DATA ===
Samsung products in original unique_df: 1
Sample Samsung from original data:
Uniq ID: ab8173b65f1c2a497b1bca49ddb6dc1f
Original text: This product is Ultimaxx Phone Monitor Sun Shade Cover Tablets Pad Hood for DJI Phantom 4/3, Mavic Pro, Inspire, OSMO, M600 Monitor Remote Controller (Small Tablet 8 Inch). It falls under the category...

Items in v3 database for this Samsung product: 11
What got embedded for this Samsung product:
  1. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7689
     Type: image
     Content: https://images-na.ssl-images-amazon.com/images/I/4105KghjJ-L.jpg...
  2. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7690
     Type: image
     Content: https://images-na.ssl-images-amazon.com/images/I/31IupTXN5UL.jpg...
  3. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7691
     Type: image
     Content: https://images-na.ssl-images-amazon.com/images/I/411FPLaE9JL.jpg...
  4. ID: img_ab8173b65f1c2a497b1bca49ddb6dc1f_7692
     Type: image
 

In [49]:
# Diagnostic 2: Test embedding consistency
from transformers import CLIPProcessor, CLIPModel
import torch

# Load CLIP model (same as in your RAG)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Test text embedding
test_text = "This is a test product description"
text_inputs = processor(text=[test_text], return_tensors="pt", padding=True, truncation=True, max_length=77)
with torch.no_grad():
    text_embedding = model.get_text_features(**text_inputs)

print(f"Text embedding shape: {text_embedding.shape}")
print(f"Text embedding dimension: {text_embedding.shape[1]}")

# Now test with a simple query
query_text = "laptop computer"
query_inputs = processor(text=[query_text], return_tensors="pt", padding=True, truncation=True, max_length=77)
with torch.no_grad():
    query_embedding = model.get_text_features(**query_inputs)

print(f"\nQuery embedding shape: {query_embedding.shape}")
print(f"Query embedding dimension: {query_embedding.shape[1]}")

# Verify they match
print(f"\nDimensions match: {text_embedding.shape[1] == query_embedding.shape[1]}")

Text embedding shape: torch.Size([1, 512])
Text embedding dimension: 512

Query embedding shape: torch.Size([1, 512])
Query embedding dimension: 512

Dimensions match: True


In [50]:
# Diagnostic 3: Direct retrieval test
from langchain_chroma import Chroma

# Your ClipEmbeddings class (copy from your code)
class ClipEmbeddings:
    def __init__(self):
        self.model = model  # Use already loaded model
        self.processor = processor

    def _embed_text(self, texts):
        inputs = self.processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77)
        with torch.no_grad():
            embeddings = self.model.get_text_features(**inputs)
        return embeddings.cpu().numpy()

    def embed_documents(self, texts):
        return self._embed_text(texts).tolist()

    def embed_query(self, text):
        return self._embed_text([text])[0].tolist()

# Load vectorstore
embedding_function = ClipEmbeddings()
vectorstore = Chroma(
    persist_directory="./my_vectorstore_exploded_v3",
    embedding_function=embedding_function,
    collection_name="amazon_products_exploded_v3"
)

# Test simple retrieval
test_query = "laptop"
results = vectorstore.similarity_search(test_query, k=5)

print(f"Found {len(results)} results for query: '{test_query}'")
for i, doc in enumerate(results):
    print(f"\nResult {i+1}:")
    print(f"Type: {doc.metadata.get('type', 'unknown')}")
    print(f"Content preview: {doc.page_content[:200] if doc.page_content else 'Empty'}")
    print(f"Metadata: {doc.metadata}")

Found 5 results for query: 'laptop'

Result 1:
Type: text
Content preview: x 3 8 . 6 inches . this item is the product is not sold by amazon .
Metadata: {'type': 'text', 'uniq_id': '657263b44d454d69464a05466accbd36', 'chunk': 5}

Result 2:
Type: text
Content preview: 0 x 1 9 x 1 0 inches . this item is the product is not sold by amazon .
Metadata: {'type': 'text', 'uniq_id': 'c538fdf30ee94fff0364329c4f42b391', 'chunk': 5}

Result 3:
Type: text
Content preview: x 3 0 . 5 cm ( 1 8 " x 1 2 "). this item is the product is not sold by amazon .
Metadata: {'uniq_id': '52c3aafbb6d35a656aa21996d1d200de', 'type': 'text', 'chunk': 2}

Result 4:
Type: text
Content preview: 0 x 1 0 x 2 inches . this item is the product is not sold by amazon .
Metadata: {'uniq_id': '762c9ee1be7ca359274fecdba946cfe3', 'chunk': 7, 'type': 'text'}

Result 5:
Type: text
Content preview: 1 . 1 8 . this item is the product is not sold by amazon .
Metadata: {'chunk': 2, 'type': 'text', 'uniq_id': '06933927ded931d544bbd050d

In [51]:
# Diagnostic 4: Check the actual stored text content
import chromadb

client = chromadb.PersistentClient(path="./my_vectorstore_exploded_v3")
collection = client.get_or_create_collection(name="amazon_products_exploded_v3")

# Get some text documents specifically
text_docs = collection.get(
    where={"type": "text"},
    limit=5,
    include=['documents', 'metadatas']
)

print("="*60)
print("STORED TEXT CONTENT CHECK")
print("="*60)
for i, (doc, meta) in enumerate(zip(text_docs['documents'], text_docs['metadatas'])):
    print(f"\nDocument {i+1}:")
    print(f"Uniq ID: {meta.get('uniq_id', 'unknown')}")
    print(f"Chunk: {meta.get('chunk', 'unknown')}")
    print(f"Full content: {doc}")
    print("-"*40)

STORED TEXT CONTENT CHECK

Document 1:
Uniq ID: 4c69b61db1fc16e7013b43fc926e502d
Chunk: 1
Full content: this product is db longboards coreflex crossbow 4 1 " bamboo fiberglass longboard complete . it falls under the category of sports & outdoors | outdoor recreation | skates , skateboards & scooters | skateboarding | standard skateboards & longboards | longboards . the price is $ 2 3 7 . 6 8 . product description : make sure this fits by entering your model number
----------------------------------------

Document 2:
Uniq ID: 4c69b61db1fc16e7013b43fc926e502d
Chunk: 2
Full content: 6 8 . product description : make sure this fits by entering your model number . | responsive flex : the crossbow features a bamboo core encased in triaxial fiberglass and hd plastic for a responsive flex pattern that ’ s second to none . pumping & carving have never been so satisfying ! flex 2 is recommended for people 1 2 0 to 1 7 0 pounds . | core
----------------------------------------

Document 3:
Uniq I

In [53]:
# Diagnostic 5: Test text creation
# Load a sample row from your dataframe
sample_row = exploded_df.iloc[0]
print("Sample row data:")
print(f"Product Name: {sample_row.get('Product Name', 'N/A')}")
print(f"Category: {sample_row.get('Category', 'N/A')}")
print(f"About Product: {sample_row.get('About Product', 'N/A')[:200] if 'About Product' in sample_row else 'N/A'}")

# Test the text creation
created_text = create_product_text(sample_row)
print(f"\nCreated text:\n{created_text}")

# Check if text_to_embed column exists and has good data
if 'text_to_embed' in exploded_df.columns:
    print("\nSample text_to_embed values:")
    for i in range(min(3, len(exploded_df))):
        print(f"\nRow {i}: {exploded_df.iloc[i]['text_to_embed'][:500]}")

Sample row data:
Product Name: DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete
Category: Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards
About Product: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to n

Created text:
This product is DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete. It falls under the category of Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards. The price is $237.68. Product description: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to none. Pumping & c

In [54]:
# Diagnostic: Check your image URLs
print("="*60)
print("IMAGE URL ANALYSIS")
print("="*60)

# Check what URLs look like in your dataframe
sample_images = exploded_df[exploded_df['Image'].notna()]['Image'].head(20)
print("Sample image URLs:")
for i, url in enumerate(sample_images):
    print(f"{i+1}. {url[:100]}...")  # First 100 chars
    if 'transparent' in url.lower():
        print("   ⚠️ Contains 'transparent'")

# Count different URL patterns
all_urls = exploded_df[exploded_df['Image'].notna()]['Image']
transparent_count = all_urls.str.contains('transparent', case=False).sum()
total_count = len(all_urls)

print(f"\n{'-'*40}")
print(f"Total image URLs: {total_count}")
print(f"URLs with 'transparent': {transparent_count}")
print(f"Percentage filtered out: {transparent_count/total_count*100:.1f}%")

# Check what's actually in your vector store
import chromadb
client = chromadb.PersistentClient(path="./my_vectorstore_exploded_v3")
collection = client.get_or_create_collection(name="amazon_products_exploded_v3")

stored_images = collection.get(where={"type": "image"}, limit=10)
print(f"\n{'-'*40}")
print(f"Stored image embeddings: {collection.count(where={'type': 'image'})}")
print("\nSample stored image URLs:")
for doc in stored_images['documents'][:5]:
    print(f"  - {doc[:100]}...")

IMAGE URL ANALYSIS
Sample image URLs:
1. https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg...
2. https://images-na.ssl-images-amazon.com/images/I/31hKM3cSoSL.jpg...
3. https://images-na.ssl-images-amazon.com/images/I/51WlHdwghfL.jpg...
4. https://images-na.ssl-images-amazon.com/images/I/51FsyLRBzwL.jpg...
5. https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg...
   ⚠️ Contains 'transparent'
6. https://images-na.ssl-images-amazon.com/images/I/51M0KnJxjKL.jpg...
7. https://images-na.ssl-images-amazon.com/images/I/5166GD8OkXL.jpg...
8. https://images-na.ssl-images-amazon.com/images/I/61o5S1VnaNL.jpg...
9. https://images-na.ssl-images-amazon.com/images/I/61t4Q0rPYjL.jpg...
10. https://images-na.ssl-images-amazon.com/images/I/61NASUAyqcL.jpg...
11. https://images-na.ssl-images-amazon.com/images/I/51OMrADdyJL.jpg...
12. https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg...
   ⚠️ Contains 'transparent'

TypeError: Collection.count() got an unexpected keyword argument 'where'

In [55]:
def get_image_embedding_from_single_url(url_string, uniq_id):
    """Download image from URL and get CLIP embedding with better validation"""
    url = url_string.strip()
    
    try:
        print(f"Processing: {url[:50]}...")
        
        # Download image
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"  Failed to download (status {response.status_code})")
            return None, None
        
        # Open and validate image
        image = Image.open(BytesIO(response.content)).convert('RGB')
        
        # Check if image is valid (not a transparent pixel or placeholder)
        if is_valid_product_image(image):
            # Get CLIP embedding
            embedding = get_image_embedding(image)
            
            metadata = {
                "uniq_id": str(uniq_id),
                "type": "image"
            }
            return embedding, metadata
        else:
            print(f"  Skipped: Invalid/placeholder image for {uniq_id}")
            return None, None
            
    except Exception as e:
        print(f"  ERROR processing {uniq_id}: {e}")
        return None, None

def is_valid_product_image(image):
    """
    Check if an image is a real product image or just a placeholder
    """
    # Convert to RGB if needed
    if image.mode != 'RGB':
        image = image.convert('RGB')
    
    # Check 1: Size (placeholders are often tiny)
    width, height = image.size
    if width < 50 or height < 50:
        return False
    
    # Check 2: Variation in pixels (placeholders are often uniform)
    # Sample some pixels
    pixels = np.array(image)
    
    # Calculate standard deviation of pixel values
    std_dev = np.std(pixels)
    if std_dev < 5:  # Very uniform image (likely a placeholder)
        return False
    
    # Check 3: Not mostly transparent/white
    # (Some placeholders are just white squares)
    mean_pixel_value = np.mean(pixels)
    if mean_pixel_value > 250:  # Mostly white
        return False
    
    return True

In [56]:
# Test image search directly
from PIL import Image
import requests
from io import BytesIO

# Download a test image from your dataset
test_url = exploded_df[exploded_df['Image'].notna()]['Image'].iloc[0]
response = requests.get(test_url)
test_image = Image.open(BytesIO(response.content))

# Display the image (in Jupyter)
display(test_image)

# Now search with this image
embedding_function = ClipEmbeddings()
image_embedding = embedding_function.embed_image(BytesIO(response.content))

if image_embedding:
    # Search in your vectorstore
    vectorstore = Chroma(
        persist_directory="./my_vectorstore_exploded_v3",
        embedding_function=embedding_function,
        collection_name="amazon_products_exploded_v3"
    )
    
    # Search by vector
    results = vectorstore.similarity_search_by_vector(image_embedding, k=5)
    
    print("Image search results:")
    for i, doc in enumerate(results):
        print(f"\nResult {i+1}:")
        print(f"  Type: {doc.metadata.get('type')}")
        print(f"  Product: {doc.metadata.get('uniq_id')}")
        if doc.metadata.get('type') == 'image':
            print(f"  URL: {

SyntaxError: incomplete input (195573220.py, line 35)