# Process each image and upload the vectors to OpenSearch

## Load the model

In [1]:
import os
import clip
import torch
from PIL import Image

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP ViT-B/32 model along with the preprocessing callable it returns
model, preprocess = clip.load("ViT-B/32", device=device)

100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 189MiB/s]


## Create a function that computes the feature vectors for a batch of images

In [2]:
def compute_clip_features(photos_batch):
    """Compute normalized CLIP image feature vectors for a batch of image files.

    Args:
        photos_batch: Iterable of image file paths (anything ``Image.open``
            accepts).

    Returns:
        The encoded batch as a numpy array, in the same order as
        ``photos_batch``. Each feature vector is divided by its own norm
        along the last dimension.
    """
    # Preprocess every photo while its file is open and close it immediately
    # afterwards, so a large batch does not keep one open handle per image.
    preprocessed = []
    for photo_file in photos_batch:
        with Image.open(photo_file) as photo:
            preprocessed.append(preprocess(photo))

    # Stack the per-image tensors into a single batch on the target device
    photos_preprocessed = torch.stack(preprocessed).to(device)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.encode_image(photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

## Create a function to index embeddings into OpenSearch

In [3]:
from dotenv import load_dotenv
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk

# Load variables from a .env file into the environment, then read the
# OpenSearch connection URI. The value is None when SERVICE_URI is not set.
load_dotenv()
SERVICE_URI = os.environ.get("SERVICE_URI")

# Client shared by the indexing code in this notebook
opensearch = OpenSearch(SERVICE_URI, use_ssl=True)

def index_embeddings_to_opensearch(data, index_name="photos"):
    """Bulk-index image embeddings into OpenSearch.

    Args:
        data: Iterable of dicts, each holding an ``'image_url'`` value and an
            ``'embedding'`` value that exposes ``.tolist()`` (for example a
            numpy array).
        index_name: Name of the target index. Defaults to ``"photos"``; pass a
            different name here instead of editing the function body.

    Prints the number of documents the bulk helper reports as indexed.
    """
    # The index name comes from one parameter so the per-action "_index" and
    # the bulk request's default index can never drift apart.
    actions = [
        {
            "_index": index_name,
            "_source": {
                "image_url": d['image_url'],
                # Convert to a plain list so the vector is JSON-serializable
                "embedding": d['embedding'].tolist(),
            },
        }
        for d in data
    ]
    success, _ = bulk(opensearch, actions, index=index_name)
    print(f"Indexed {success} embeddings to OpenSearch")


## Iterate over images and process them in batches

In [4]:
import os

# Path to the directory containing photos
image_dir = "photos"

# Batch size for processing images and indexing embeddings
batch_size = 100

# os.listdir returns entries in arbitrary order and also lists sub-directories.
# Sort for a reproducible run and keep regular files only, so a nested folder
# does not crash Image.open.
image_files = sorted(
    name
    for name in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, name))
)

# Process images in batches: each batch is embedded and indexed right away,
# so at most one batch of embeddings is held in memory at a time.
for start in range(0, len(image_files), batch_size):
    batch_files = image_files[start:start + batch_size]
    batch_file_paths = [os.path.join(image_dir, name) for name in batch_files]

    # Compute embeddings for the batch of images
    batch_embeddings = compute_clip_features(batch_file_paths)

    # Pair every file path with its embedding and index the batch
    data = [
        {'image_url': file_path, 'embedding': embedding}
        for file_path, embedding in zip(batch_file_paths, batch_embeddings)
    ]
    index_embeddings_to_opensearch(data)

print("All embeddings indexed successfully.")



Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
Indexed 100 embeddings to OpenSearch
All embeddings indexed successfully.
