# Product Image Similarity Search with CLIP

Pure visual similarity matching using CLIP embeddings:
1. Generate CLIP embeddings for all product images
2. Find visually similar products using cosine similarity
3. Optional: Use GPT-4.1 for final verification

## Setup and Dependencies

Install the packages required to run the CLIP model and process images.

In [None]:
# Run this to install the necessary packages for using the CLIP model
!pip install transformers torch torchvision torchaudio

## Initialize Models and Libraries
### CLIP Model
Some providers (Google, for instance) offer image embeddings through their API, but OpenAI unfortunately does not. To avoid having to deal with Google service accounts and access management, we run the CLIP model locally instead.

CLIP is a model from OpenAI that links images and text in a shared representation space. It turns images into embeddings—numerical vectors that capture semantic meaning—so we can measure how similar two images are.

We use CLIP because it allows us to perform similarity search on images: given a query image, we can find other images that look alike or share visual characteristics.

In [None]:
import base64
import os
from pathlib import Path
from typing import List, Optional, Dict, Any

import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from openai import OpenAI
from pydantic import BaseModel, Field
from sklearn.metrics.pairwise import cosine_similarity

# OpenAI client, used further down for the GPT-4.1 verification step
client = OpenAI()

# Prefer the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP checkpoint straight onto the chosen device, together with
# the processor that prepares images for it
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

print(f"✅ CLIP model loaded on {device}")

In [None]:
# Verdict for a single candidate image. Instances are built by the CLIP search
# and are also the element type of ProductMatchResult.matches.
# NOTE: documented with comments rather than a docstring on purpose, so the
# JSON schema pydantic generates for this model stays unchanged.
class SimilarityMatch(BaseModel):
    image_id: str  # ID of the candidate product image
    is_same_product: bool  # True when the candidate is judged to be the query product
    confidence_score: float = Field(ge=0.0, le=1.0)  # validated to lie in [0, 1]
    reasoning: str  # free-text justification for the verdict

# Overall outcome of one query. Besides being returned by the CLIP search, this
# model is passed as `text_format` to `client.responses.parse`, so it also
# defines the structure GPT-4.1's answer is parsed into.
# NOTE: documented with comments rather than a docstring on purpose, so the
# JSON schema pydantic generates for this model stays unchanged.
class ProductMatchResult(BaseModel):
    query_image_path: str  # path of the image that was searched for
    matches: List[SimilarityMatch]  # one entry per candidate that was considered
    best_match_id: Optional[str]  # ID of the selected match; the CLIP search sets None when nothing matched
    summary: str  # short human-readable description of the outcome

In [None]:
def get_clip_embedding(image_path: str) -> np.ndarray:
    """Generate a CLIP visual embedding for an image.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D array holding the L2-normalized CLIP image features.
    """
    # Open through a context manager so the file handle is released right away;
    # Pillow may otherwise keep the file open (e.g. for multi-frame formats).
    # convert() returns an independent in-memory copy, so it outlives the handle.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
        # Scale to unit length so that a dot product equals the cosine similarity
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    return image_features.cpu().numpy().flatten()

def encode_image(image_path: str) -> str:
    """Read an image file and return its bytes as a base64 string for the OpenAI API"""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

print("✅ Helper functions ready")

# Product Database
We initialize an in-memory product database that stores the image embeddings and runs the similarity search.

In [None]:
class ProductDatabase:
    """In-memory store of product records and their CLIP embeddings.

    Attributes:
        products: Maps product ID to a dict with its ``id`` and ``image_path``.
        embeddings: Maps product ID to the vector returned by
            ``get_clip_embedding`` for that product's image.
    """

    def __init__(self):
        self.products = {}
        self.embeddings = {}

    def add_product(self, product_id: str, image_path: str):
        """Register a product and compute the embedding of its image.

        Adding an ID that is already present overwrites the earlier entry.
        """
        self.products[product_id] = {"id": product_id, "image_path": image_path}
        self.embeddings[product_id] = get_clip_embedding(image_path)
        print(f"Added {product_id}")

    def find_similar(self, query_embedding: np.ndarray, top_k: int = 5) -> List[str]:
        """Return up to ``top_k`` product IDs, most similar to the query first.

        Returns an empty list when the database is empty or ``top_k`` is not
        positive (a negative value would otherwise slice from the wrong end).
        """
        if not self.embeddings or top_k <= 0:
            return []

        ids = list(self.embeddings)
        vectors = np.stack([self.embeddings[pid] for pid in ids])
        similarities = cosine_similarity([query_embedding], vectors)[0]
        # argsort is ascending, so reverse it before taking the best top_k
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [ids[i] for i in top_indices]

    def get_similarities(self, query_embedding: np.ndarray, product_ids: List[str]) -> List[float]:
        """Return the cosine similarity between the query and each known product.

        IDs that are not in the database are skipped, so the result can be
        shorter than ``product_ids``; it is empty when none of them is known.
        """
        known_vectors = [self.embeddings[pid] for pid in product_ids if pid in self.embeddings]
        # Check before stacking: np.stack raises ValueError on an empty list
        if not known_vectors:
            return []
        return cosine_similarity([query_embedding], np.stack(known_vectors))[0].tolist()

    def size(self) -> int:
        """Return the number of registered products."""
        return len(self.products)

# Module-level database instance shared by the following cells: it is filled
# when the sample images are indexed and queried by find_similar_products.
db = ProductDatabase()
print("✅ Database ready")

In [None]:
# Collect the sample images. The order in which glob yields files is not
# guaranteed, so sort them: a later cell picks its query by position
# (image_files[5]) and should select the same image on every run and machine.
images_dir = Path("sample_product_images")
image_files = sorted([*images_dir.glob("*.jpg"), *images_dir.glob("*.png")])

print(f"Found {len(image_files)} images")
print("Building database...")

# The file name without its extension serves as the product ID
for img_path in image_files:
    db.add_product(img_path.stem, str(img_path))

print(f"✅ Database built with {db.size()} products")

In [None]:
def find_similar_products(
    query_image_path: str,
    top_k: int = 5,
    match_threshold: float = 0.85,
) -> ProductMatchResult:
    """Find similar products using CLIP embeddings.

    Args:
        query_image_path: Path of the image to search for.
        top_k: Maximum number of candidates to retrieve from the database.
        match_threshold: Cosine similarity a candidate must exceed to be
            flagged as the same product.

    Returns:
        A ProductMatchResult with one SimilarityMatch per candidate, most
        similar first. ``best_match_id`` is the first candidate above the
        threshold, or None when there is none.
    """
    # Get CLIP embedding for query
    query_embedding = get_clip_embedding(query_image_path)

    # Retrieve the closest products and their similarity scores
    similar_ids = db.find_similar(query_embedding, top_k)
    similarities = db.get_similarities(query_embedding, similar_ids)

    # Create matches
    matches = []
    for i, product_id in enumerate(similar_ids):
        # Fall back to 0.0 if fewer scores than IDs came back
        score = similarities[i] if i < len(similarities) else 0.0

        matches.append(SimilarityMatch(
            image_id=product_id,
            is_same_product=score > match_threshold,
            # Cosine similarity can be negative, which the model's [0, 1]
            # constraint would reject, so clamp from below; the 0.95 cap keeps
            # CLIP alone from reporting near-certainty.
            confidence_score=max(0.0, min(score, 0.95)),
            reasoning=f"CLIP similarity: {score:.3f}"
        ))

    # Candidates are ordered by similarity, so the first hit is the best one
    best_match = next((m.image_id for m in matches if m.is_same_product), None)
    match_count = sum(1 for m in matches if m.is_same_product)

    return ProductMatchResult(
        query_image_path=query_image_path,
        matches=matches,
        best_match_id=best_match,
        summary=f"Found {match_count} potential matches"
    )

print("✅ Search function ready")

# Testing
## Simple similarity test
First, we run a simple test to check that the similarity search works. We query with an image that we know is already in the database, so it should come back with a very high matching score (note that the reported confidence is capped at 0.95).

In [None]:
# Query with an image that is already indexed; it should be found again
query_image = image_files[5]

print(f"🔍 Testing with: {query_image.name}")
print("=" * 50)

result = find_similar_products(str(query_image))

print(f"Summary: {result.summary}")
print("\nTop matches:")

# List the candidates in ranked order, starting at 1
for rank, candidate in enumerate(result.matches, start=1):
    if candidate.is_same_product:
        status = "✅ MATCH"
    else:
        status = "❌ NO MATCH"
    print(f"{rank}. {candidate.image_id}: {status} ({candidate.confidence_score:.3f})")

if result.best_match_id:
    print(f"\n🏆 Best match: {result.best_match_id}")

## Similarity search together with GPT-4.1
Now we use GPT-4.1 to check whether the most similar candidates actually match the query. We first run a rough similarity search with CLIP, and then let GPT-4.1 reason about which of the candidates, if any, show the same product.

In [None]:
def analyze_with_gpt(
    query_image_path: str,
    candidate_paths: List[str],
    candidate_ids: List[str],
    model: str = "gpt-4.1",
) -> ProductMatchResult:
    """Ask a multimodal GPT model which candidates show the same product as the query.

    Args:
        query_image_path: Path of the query image; it is sent as the first image.
        candidate_paths: Image paths of the candidates to compare against.
        candidate_ids: IDs of the candidates, in the same order as ``candidate_paths``.
        model: Name of the model to call.

    Returns:
        The model's answer parsed into a ProductMatchResult.

    Raises:
        ValueError: If ``candidate_paths`` and ``candidate_ids`` differ in length.
    """
    import mimetypes  # local import keeps this cell runnable on its own

    if len(candidate_paths) != len(candidate_ids):
        raise ValueError(
            f"Got {len(candidate_paths)} candidate paths but {len(candidate_ids)} candidate ids"
        )

    def to_data_url(image_path: str) -> str:
        # Declare the real image type (the samples include PNG files); fall
        # back to JPEG when the type cannot be guessed from the file name.
        mime_type, _ = mimetypes.guess_type(image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        return f"data:{mime_type};base64,{encode_image(image_path)}"

    # Instruction first, then the query image
    content = [
        {
            "type": "input_text",
            "text": f"Compare the first image to these candidates: {candidate_ids}. Which show the same exact product?"
        },
        {"type": "input_image", "image_url": to_data_url(query_image_path)},
    ]

    # Add candidate images, each preceded by a label carrying its ID
    for candidate_id, img_path in zip(candidate_ids, candidate_paths):
        content.extend([
            {"type": "input_text", "text": f"Candidate {candidate_id}:"},
            {"type": "input_image", "image_url": to_data_url(img_path)}
        ])

    response = client.responses.parse(
        model=model,
        input=[{"role": "user", "content": content}],
        text_format=ProductMatchResult
    )

    return response.output_parsed

print("✅ GPT-4.1 reasoning function ready")

Let's test with an example image of a bottle of Pernod, a product that we know exists in the database.

In [None]:
from IPython.display import display, Image as NotebookImage
from PIL import Image
import matplotlib.pyplot as plt

# Query with a separate photo; to try an indexed image instead, use e.g. image_files[3]
query_image = Path("pernod.png")

print("🤖 TESTING GPT-4.1 MULTI-MODAL REASONING")
print("=" * 60)

# Stage 1: CLIP narrows the database down to the five closest candidates
clip_result = find_similar_products(str(query_image), top_k=5)

print(f"📸 Query: {query_image.name}")
display(NotebookImage(filename=str(query_image)))

print(f"🔍 CLIP found {len(clip_result.matches)} similar images")

# Gather the IDs of the candidates and the image paths stored for them
candidate_ids = [m.image_id for m in clip_result.matches]
candidate_paths = [db.products[cid]["image_path"] for cid in candidate_ids]

# Display candidates side by side
n_candidates = len(candidate_paths)
fig, axes = plt.subplots(1, n_candidates, figsize=(5 * n_candidates, 5))
if n_candidates == 1:
    # A single subplot comes back as a bare Axes object; wrap it for the loop
    axes = [axes]
for panel, (path, cid) in zip(axes, zip(candidate_paths, candidate_ids)):
    panel.imshow(Image.open(path))
    panel.set_title(cid)
    panel.axis("off")
plt.show()

print("\n🧠 Sending to GPT-4.1 for reasoning...")

# Stage 2: let GPT-4.1 judge the candidates; report any failure instead of raising
try:
    gpt_result = analyze_with_gpt(str(query_image), candidate_paths, candidate_ids)

    print("\n📊 GPT-4.1 RESULTS:")
    print(f"Summary: {gpt_result.summary}")

    print("\nDetailed analysis:")
    for verdict in gpt_result.matches:
        if verdict.is_same_product:
            status = "✅ MATCH"
        else:
            status = "❌ NO MATCH"
        print(f"  {verdict.image_id}: {status} (confidence: {verdict.confidence_score:.3f})")
        print(f"    Reasoning: {verdict.reasoning}")

    if not gpt_result.best_match_id:
        print("\n❌ GPT-4.1 found no exact matches")
    else:
        print(f"\n🏆 GPT-4.1 selected: {gpt_result.best_match_id}")

except Exception as e:
    print(f"❌ Error with GPT-4.1 analysis: {e}")
