## Importing Necessary Libraries

In [80]:
import os
import numpy as np
import pandas as pd
import time
import requests
import json
import torch
import clip
import glob
# import pymilvus
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    db,
    MilvusClient
)
import unicodedata
import open_clip
from all_clip import load_clip
from PIL import Image


In [2]:
print(torch.__version__)

2.2.1+cpu


### Setting Directories and Folders

In [3]:
IMAGES_URL_DIRECTORY = 'image_folder'
IMAGE_EMBEDDINGS_DIRECTORY = 'image_embeddings_complete'
TITLE_EMBEDDINGS_DIRECTORY = 'title_embeddings'
title_embeddings_path = 'title_embeddings/text_emb/text_emb_0.npy'
image_embeddings_path = 'image_embeddings_complete/img_emb/img_emb_0.npy'
image_embeddings_meta_data_file = 'image_embeddings_complete/metadata/metadata_0.parquet'
title_embeddings_meta_data_file = 'title_embeddings/metadata/metadata_0.parquet'
data_path = 'D:\Datasets\Amazon\Amazon Product Dataset\meta_All_Beauty.jsonl'

### Loading Clip Model for inference

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model, preprocess, tokenizer = load_clip(clip_model='open_clip:ViT-B-16')

warming up with batch size 1 on cpu
done warming up in 9.828677415847778s


### Functions for generating Embeddings for Inference

In [87]:
def get_text_query_embedding(query):
    # Load the model and preprocess function
    text_tokens = tokenizer(query)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    text_embs = text_features.cpu().to(torch.float32).numpy()
    return text_embs[0]

def generate_image_embeddings(img):
    img = preprocess(img).unsqueeze(0).to(device) 
    with torch.no_grad():
        image_embeddings = model.encode_image(img).float()
        image_embeddings /= image_embeddings.norm(dim = -1, keepdim = True)
    return image_embeddings.cpu().to(torch.float32).numpy()[0]

### Function for creating url and keys dictionary

In [6]:
def creating_images_url_dictionary(path = IMAGES_URL_DIRECTORY):
# Initialize an empty dictionary
    url_key_dict = {}

    # List of file paths (adjust the path and extension as needed)
    file_paths = glob.glob(f'{path}/*.parquet')
    
    # print(len(file_paths))
    
    for file_path in file_paths:
        # Read the file into a DataFrame
        data = pd.read_parquet(file_path)
        
        # Ensure the key column is in string format
        data['key'] = data['key'].astype(str)
        
        # Update the dictionary with data from the current file
        url_key_dict.update(data.set_index('url')['key'].to_dict())
    
    return url_key_dict

### Function for preparing the meta data of the images to get the keys

In [7]:
def preparing_image_meta_data(image_embeddings_meta_data_file= image_embeddings_meta_data_file):
    image_embeddings_meta_data = pd.read_parquet(image_embeddings_meta_data_file)
    image_embeddings_meta_data['key'] = image_embeddings_meta_data['image_path'].str.extract(r'(\d{9})')
    return image_embeddings_meta_data


### Functions for getting the index of image and title embeddings

In [8]:
def get_index_of_image_embedding(image_embeddings_metadata, image_key):
    index_value = image_embeddings_metadata.loc[image_embeddings_metadata['key'] == image_key].index
    if not index_value.empty:
        return index_value[0]
    else:
        return False

def get_index_of_title_embedding(title_embeddings_metadata, title_key):
    index_value = title_embeddings_metadata.loc[title_embeddings_metadata['caption'] == title_key].index
    if not index_value.empty:
        return index_value[0]
    else:
        return False

### Function for extracting the urls

In [9]:
def extract_img_urls(image_array):
    urls = []
    for item in image_array:
        # print(item)
        if 'hi_res' in item and item['hi_res']:
            urls.append(item['hi_res'])
        elif 'large' in item and item['large']:
            urls.append(item['large'])
        else:
            print(f"Key 'hi_res' and 'large' not found in item: {item}")
        if len(urls) == 3:
            return urls
    return urls

### Functions to get the image and title embeddings

In [10]:
def get_image_embedding(all_image_embeddings, index):
    if index < 0 or index >=(len(all_image_embeddings)):
        print(f'{index} index is not in all the image embeddings!')
        return [0]
    return all_image_embeddings[index]

def get_title_embedding(title_embeddings, index):
    return title_embeddings[index]

### Loading the Image and Title embeddings

In [11]:
all_image_embeddings = np.load(image_embeddings_path)
title_embeddings = np.load(title_embeddings_path)

In [12]:
print(len(all_image_embeddings))
# title_embeddings[5]

548083


In [13]:
magnitude = np.linalg.norm(all_image_embeddings[53464])
magnitude

0.998

### Loading the Title and Image Meta files

In [14]:
title_embeddings_metadata = pd.read_parquet(title_embeddings_meta_data_file)

In [15]:
image_embeddings_metadata = preparing_image_meta_data()
print(image_embeddings_metadata.head())
print(len(image_embeddings_metadata))

                         image_path        key
0  image_folder/00000/000000000.jpg  000000000
1  image_folder/00000/000000001.jpg  000000001
2  image_folder/00000/000000002.jpg  000000002
3  image_folder/00000/000000003.jpg  000000003
4  image_folder/00000/000000004.jpg  000000004
548083


In [16]:
images_url_dictionary = creating_images_url_dictionary()
print(len(images_url_dictionary))

478384


In [17]:
n = 2
# View the first n items of the dictionary
first_n_items = list(images_url_dictionary.items())[:n]
first_n_items

[('https://m.media-amazon.com/images/I/517uoA+-gzL._SL1005_.jpg', '000000016'),
 ('https://m.media-amazon.com/images/I/510BWq7O95L._SL1005_.jpg', '000000015')]

### Connecting to the Milvus Database

In [44]:
if connections.has_connection("default"):
    connections.disconnect("default")

In [18]:
host = "141.195.16.189"
port = 40238 # Mapping for 19530 (default Milvus port)

# Connect to Milvus
client = connections.connect("default", host=host, port=port)

# Check if the connection is established
print("Is Milvus connected:", connections.has_connection("default"))

# Optional: List collections to confirm the connection
from pymilvus import utility
print("Collections:", utility.list_collections())

Is Milvus connected: True
Collections: []


In [19]:
db.list_database()
# db.create_database('Products')

['default', 'My_Products', 'Products']

In [20]:
# database = db.create_database("My_Products")
db.using_database("Products")

### Creating Field Schemas for the Collections

In [21]:
fields = [
    FieldSchema(name="product_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=512),  
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length = 5000),
    FieldSchema(name="average_rating", dtype=DataType.FLOAT),
    FieldSchema(name="features", dtype=DataType.ARRAY, max_capacity = 1000, element_type = DataType.VARCHAR, max_length = 10000),
    FieldSchema(name="description", dtype=DataType.ARRAY, max_capacity = 500, element_type = DataType.VARCHAR, max_length = 50000),
    FieldSchema(name="categories", dtype=DataType.ARRAY, max_capacity = 500, element_type = DataType.VARCHAR, max_length = 500),
    FieldSchema(name="price", dtype=DataType.FLOAT),
    FieldSchema(name="store", dtype=DataType.VARCHAR, max_length = 500),
    FieldSchema(name="main_category", dtype=DataType.VARCHAR, max_length = 500)
]

fields_images = [
    FieldSchema(name="image_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="p_id", dtype=DataType.INT64),  # Foreign key to Products
    FieldSchema(name="image_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="image_url", dtype=DataType.VARCHAR, max_length=10000)
]

### Creating the collections

In [22]:
product_schema = CollectionSchema(fields, description="Products collection")
products_collection = Collection(name = 'products', schema=product_schema)

In [23]:
image_schema = CollectionSchema(fields_images, description='Images Collection')
images_collection = Collection(name = 'images', schema = image_schema)

In [24]:
collections = utility.list_collections()
print(collections)

['products', 'images']


### Creating the Indices for the collections and loading them

In [25]:
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 256}
}

products_collection.create_index(field_name="title_vector", index_params = index_params)
products_collection.load()
# products_collection.release()
# products_collection.drop_index()

images_collection.create_index(field_name="image_vector", index_params = index_params)
images_collection.load()
# images_collection.release()
# images_collection.drop_index()

In [26]:
# Replace 'your_collection_name' with the name of your collection

# Get the number of entities
num_entities = products_collection.num_entities
print(f"Number of items in collection '{products_collection}': {num_entities}")

Number of items in collection '<Collection>:
-------------
<name>: products
<description>: Products collection
<schema>: {'auto_id': True, 'description': 'Products collection', 'fields': [{'name': 'product_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'title_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 512}}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 5000}}, {'name': 'average_rating', 'description': '', 'type': <DataType.FLOAT: 10>}, {'name': 'features', 'description': '', 'type': <DataType.ARRAY: 22>, 'params': {'max_length': 10000, 'max_capacity': 1000}, 'element_type': <DataType.VARCHAR: 21>}, {'name': 'description', 'description': '', 'type': <DataType.ARRAY: 22>, 'params': {'max_length': 50000, 'max_capacity': 500}, 'element_type': <DataType.VARCHAR: 21>}, {'name': 'categories', 'description': '', 'type': <DataType.ARRAY: 22>, 'params': {

## Functions for pre-processing the products and storing them in the Milvus Database in Batches

In [42]:
def clean_text(text):
    if not text:
        return ""
    
    # Remove non-UTF-8 characters
    return ''.join([c for c in text if ord(c) < 128])

def insert_product_batch(products):
    if products:
        products_collection.insert(products)
        products_collection.flush()

def insert_image_batch(images):
    if images:
        images_collection.insert(images)
        images_collection.flush()

def process_line(line, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary):
    data = json.loads(line.strip())
    if '"' in data['title']:
        return None, None

    # title = data['title']
    title = clean_text(data['title'])
    title_index = get_index_of_title_embedding(title_embeddings_metadata, title)
    if not title_index:
        return None, None
    title_embedding = get_title_embedding(title_embeddings, title_index)
    
    # Extract product information
    average_rating = data['average_rating']
    features = data['features']
    description = data['description']
    categories = data['categories']
    price = data['price']
    main_category = data['main_category']
    store = data['store']
    if not price:
        price = 0.0
    if not store:
        store = ''
    if not main_category:
        main_category = ''
    if not average_rating:
        average_rating = 0.0

    # Prepare product data
    product_data = {
        'title_vector': title_embedding.tolist(),
        'title': title,
        'average_rating': average_rating,
        'features': features,
        'description': description,
        'categories': categories,
        'price': price,
        'store': store,
        'main_category': main_category,
    }

    # Extract and process image URLs
    image_urls = extract_img_urls(data['images'])
    current_image_embeddings = []
    for url in image_urls:
        if url in images_url_dictionary and images_url_dictionary[url]:
            image_key = images_url_dictionary[url]
        else:
            continue

        image_index = get_index_of_image_embedding(image_embeddings_metadata, image_key)
        if not image_index:
            print('no image index')
            continue
        
        image_embedding = get_image_embedding(all_image_embeddings, image_index)
        if len(image_embedding) > 1 :
            current_image_embeddings.append((image_embedding, url))

    return product_data, current_image_embeddings

def create_and_store_data(data_path, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary):
    product_batch = []
    image_batch = []
    i = 0
    with open(data_path) as file:
        for line in (file):
            i += 1
            if i < 8000:
                continue

            product_data, current_image_embeddings = process_line(line, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary)
            if product_data and current_image_embeddings:
                product_batch.append((product_data, current_image_embeddings))
                if len(product_batch) >= 100:
                    # Insert products
                    products_data = [pd for pd, _ in product_batch]
                    insert_product_batch(products_data)

                    # Retrieve product IDs for inserted products
                    titles = [pd['title'] for pd, _ in product_batch]
                    product_ids = products_collection.query(expr=f'title in {titles}', output_fields=["title", "product_id"])

                    title_to_id = {p['title']: p['product_id'] for p in product_ids}

                    # Prepare image data
                    for pd, image_ki_embeddings in product_batch:
                        product_id = title_to_id.get(pd['title'])
                        for image_embedding, image_url in image_ki_embeddings:
                            image_data = {
                                'p_id': product_id,
                                'image_vector': image_embedding.tolist(),
                                'image_url': image_url,
                            }
                            image_batch.append(image_data)

                    if image_batch:
                        insert_image_batch(image_batch)

                    # Clear batches
                    product_batch = []
                    image_batch = []

    # Insert any remaining data
    if product_batch:
        products_data = [pd for pd, _ in product_batch]
        insert_product_batch(products_data)
        
        titles = [pd['title'] for pd, _ in product_batch]
        product_ids = products_collection.query(expr=f'title in {titles}', output_fields=["title", "product_id"])
        
        title_to_id = {p['title']: p['product_id'] for p in product_ids}
        
        for pd, image_ki_embeddings in product_batch:
            product_id = title_to_id.get(pd['title'])
            for image_embedding, image_url in image_ki_embeddings:
                image_data = {
                    'p_id': product_id,
                    'image_vector': image_embedding.tolist(),
                    'image_url': image_url,
                }
                image_batch.append(image_data)
        
        if image_batch:
            insert_image_batch(image_batch)

In [43]:
# Run the data creation and storage process
create_and_store_data(data_path, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary)

no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image i

KeyboardInterrupt: 

### Sample search Query

In [33]:
title = "Auric Blends Stella Blue"
product_ids = products_collection.query(expr=f'title == "{title}"', output_fields=["title", 'features', 'price', 'product_id', 'title_vector'])
title_v_embedding = product_ids[0]["title_vector"]
# title_v_embedding = np.float16(title_v_embedding)
title_v_embedding

IndexError: list index out of range

### Defining Search Parameters

In [27]:
search_params = {
    "metric_type": "COSINE", 
    "offset": 0, 
    "ignore_growing": False, 
    "params": {"nprobe": 20}
}

In [28]:
sample_query = 'Eyeshadow Powder'
query_embedding = get_text_query_embedding(sample_query)
# sample_query = products[1]['Image']
# sample_query_embedding = generate_text_embeddings(sample_query)
# sample_query_embedding = generate_image_embeddings(sample_query)
# sample_query_embedding
# len(embedding) == len(title_v_embedding)
query_embedding

array([-2.87091378e-02, -1.60262622e-02,  1.10321883e-02,  1.86346974e-02,
        4.64565679e-03,  1.06061390e-02, -6.46375939e-02, -3.00045423e-02,
       -1.68694369e-02,  5.11441231e-02,  5.06444885e-05,  1.29315825e-02,
        1.02344705e-02,  9.36190225e-03, -1.87919363e-02, -2.54604947e-02,
       -3.80817195e-03, -4.93968949e-02, -1.79287456e-02, -2.84742750e-02,
        4.75340104e-03,  2.66201850e-02, -1.96729824e-02, -3.21539380e-02,
       -1.03345923e-02, -2.50490326e-02, -2.96938908e-03, -2.37336308e-02,
       -2.27651894e-02, -2.13819239e-02, -4.82474305e-02,  1.04074283e-02,
        1.72209702e-02,  5.74808428e-03,  1.46720035e-03,  3.86455399e-03,
       -2.94432603e-02,  1.58005357e-02,  1.04613239e-02, -2.53673252e-02,
        2.12441273e-02,  1.71216875e-02,  4.79665175e-02, -1.06422522e-03,
       -9.65849496e-03, -1.69090107e-02, -2.89468057e-02,  2.79049240e-02,
       -9.63328220e-03, -3.34460698e-02, -8.34832527e-03, -3.38171273e-02,
       -1.54383443e-02, -

In [35]:
results = products_collection.search(
    data=[query_embedding], 
    anns_field="title_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=10,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['title','price'],
    consistency_level="Strong"
)

In [30]:
for result in results[0]:
    # print(result[0].entity.get('title'),'\n')
    print(result.entity)

id: 451413128074150986, distance: 0.8763506412506104, entity: {'title': 'Color On Professional Eye Shadow Smokey Classics', 'price': 0.0}
id: 451412599882129925, distance: 0.8670003414154053, entity: {'title': 'cosmetic', 'price': 0.0}
id: 451413128074130806, distance: 0.8618062734603882, entity: {'title': 'Foundation Makeup', 'price': 0.0}
id: 451413128074138397, distance: 0.8314498662948608, entity: {'title': 'Gel Eyeliner', 'price': 6.489999771118164}
id: 451413128074158552, distance: 0.8233270645141602, entity: {'title': 'Dirty Little Secret DLS Eyeshadow -Bikini', 'price': 6.900000095367432}
id: 451413128074136722, distance: 0.8155425190925598, entity: {'title': 'NEW Color Eye Shadow Makeup Cosmetic Shimmer Eyeshadow Palette NEON Set', 'price': 0.0}
id: 451413128074136024, distance: 0.7919713258743286, entity: {'title': 'Exotic Eyes Complete Eye Makeup Kit Reusable - Isis', 'price': 0.0}
id: 451412599882123932, distance: 0.7884990572929382, entity: {'title': 'Beauty 21 Cosmetics C

In [31]:
results_images = images_collection.search(
    data=[query_embedding], 
    anns_field="image_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=100,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['p_id','image_url'],
    consistency_level="Strong"
)

In [32]:
# Assuming results_images is the variable holding your search results
p_id_list = []

for result in results_images[0]:
    p_id = result.entity.get('p_id')
    p_id_list.append(p_id)

print(p_id_list)


[451413128074154437, 451412599882129153, 451413128074132189, 451413128074137784, 451413128074145676, 451412599882127098, 451413128074154035, 451413128074145727, 451413128074135008, 451413128074122750, 451413128074160361, 451412599882139242, 451412599882135356, 451413128074150631, 451412599882139242, 451413128074135372, 451413128074127982, 451413128074133968, 451412599882124608, 451412599882141317, 451413128074143262, 451413128074146096, 451413128074141226, 451413128074161006, 451413128074132841, 451413128074136389, 451413128074150631, 451413128074137415, 451413128074134604, 451412599882138209, 451413128074158573, 451413128074142559, 451413128074155094, 451412599882129153, 451412599882129153, 451413128074138097, 451412599882124588, 451412599882140952, 451412599882123914, 451412599882137438, 451413128074154029, 451412599882140230, 451413128074141892, 451413128074137780, 451413128074135693, 451412599882128777, 451412599882143135, 451412599882130855, 451413128074146096, 451413128074141226,

In [66]:
for result in results_images[0]:
    # print(result[0].entity.get('title'),'\n')
    # print(type(result))
    print(result)

print(type(results_images))

id: 451413128074154653, distance: 0.27994802594184875, entity: {'p_id': 451413128074154437, 'image_url': 'https://m.media-amazon.com/images/I/71-ekhsaXSL._SL1500_.jpg'}
id: 451412599882129297, distance: 0.2792852520942688, entity: {'p_id': 451412599882129153, 'image_url': 'https://m.media-amazon.com/images/I/91QfwpCtBQL._SL1500_.jpg'}
id: 451413128074132374, distance: 0.2773583233356476, entity: {'p_id': 451413128074132189, 'image_url': 'https://m.media-amazon.com/images/I/71bxW7PRNaL._SL1500_.jpg'}
id: 451413128074137998, distance: 0.2770081162452698, entity: {'p_id': 451413128074137784, 'image_url': 'https://m.media-amazon.com/images/I/91CAdjH3I1L._SL1500_.jpg'}
id: 451413128074145783, distance: 0.27623647451400757, entity: {'p_id': 451413128074145676, 'image_url': 'https://m.media-amazon.com/images/I/51XeGNInTZL.jpg'}
id: 451412599882127318, distance: 0.27580058574676514, entity: {'p_id': 451412599882127098, 'image_url': 'https://m.media-amazon.com/images/I/41QvkwiGE6L.jpg'}
id: 451

In [41]:
matching_results  = products_collection.query(expr=f'product_id in {p_id_list}', output_fields=['title','price', 'average_rating', 'features', 'description', 'categories', 'store', 'main_category'])
# title_v_embedding = product_ids[0]["title_vector"]
matching_results[4]

{'title': 'Autumn Delights Body Mist Spray for Women, Black Cherry, Vanilla & Amber, 8.45 Fluid Oz',
 'price': 0.0,
 'average_rating': 4.0,
 'features': [],
 'description': [],
 'categories': [],
 'store': 'Cosmolive',
 'main_category': 'All Beauty',
 'product_id': 451412599882132595}

### Query match with title and then get the images

In [72]:
sample_query = 'cosmetic handbag'
query_embedding = get_text_query_embedding(sample_query)
title_results = products_collection.search(
    data=[query_embedding], 
    anns_field="title_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=20,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=["title", 'features', 'price', 'product_id', 'description', 'main_category', 'store', 'categories'],
    consistency_level="Strong"
)
product_ids = [result.entity.get('product_id') for result in title_results[0]]


In [73]:
# Construct a query expression to filter by product IDs
query_expr = "p_id in {}".format(product_ids)

# Search for images
image_results = images_collection.query(
    expr=query_expr,
    output_fields=["image_url", "p_id"]  # Adjust fields based on your schema
)

# # Process results
# for image in image_results:
#     product_id = image.get('p_id')
#     image_url = image.get('image_url')
#     print(f"Product ID: {product_id}, Image Vector: {image_url}")

In [76]:
# Step 1: Access Product Details from title_results
product_details = {}
for result in title_results[0]:  # Adjust according to actual structure of title_results
    product_id = result.entity.get('product_id')  # Adjust according to actual field names
    score = result.distance
    if product_id is not None:
        product_details[product_id] = {
            'title': result.entity.get('title'),
            'score':score,
            'description': result.entity.get('description'),
            'price': result.entity.get('price'),
            'main_category': result.entity.get('main_category'),
            'store': result.entity.get('store'),
            'categories': result.entity.get('categories')
        }

# Step 2: Create a dictionary for image URLs
product_images = {}
for image in image_results:
    product_id = image.get('p_id')
    image_url = image.get('image_url')
    
    if product_id not in product_images:
        product_images[product_id] = []
    
    product_images[product_id].append(image_url)

# Step 3: Combine Product Details and Image URLs into a Single Dictionary
combined_product_info = {}
for product_id, details in product_details.items():
    combined_product_info[product_id] = details
    combined_product_info[product_id]['image_urls'] = product_images.get(product_id, [])

# Print or use the combined information
for product_id, info in combined_product_info.items():
    print(f"Product ID: {product_id}")
    print(f"Product score: {info['score']}")
    print(f"Title: {info['title']}")
    print(f"Description: {info['description']}")
    print(f"Price: {info['price']}")
    print(f"Main Category: {info['main_category']}")
    print(f"Store: {info['store']}")
    print(f"Categories: {info['categories']}")
    print(f"Image URLs: {info['image_urls']}")
    print()


Product ID: 451413128074145025
Product score: 0.8849709033966064
Title: NEW Premium Multi-Function Compartmentalized Spacious Cosmetic Makeup Bag Organizer for Purse
Description: ['Do you hate shuffling around in your makeup bag for lipstick, liner, pencils, etc.? Have you purchased several cosmetic cases that just do not provide enough space for your makeup?', 'Well look no further!', 'Bare BeYouTy', 'has developed the perfect product to fit all your needs!', 'Bare BeYouTy', 'is committed to exploring inspiration, releasing unpredictably elegant products, and bringing an exuberant, graceful approach to your lifestyle.', 'This PREMIUM cosmetic makeup case is the perfect solution to all your problems!', '★ Expandable, Compartmentalized, yet Compact to fit all your everyday makeup.', '★ Premium, Durable, Wipeable, Waterproof Material makes the case great for longevity.', '★ Spacing on the top AND the bottom of the case makes provides more than enough space and versatility to organize and

### Query match with images and then get the products

In [61]:
results_images = images_collection.search(
    data=[query_embedding], 
    anns_field="image_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=100,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['p_id','image_url'],
    consistency_level="Strong",
    return_score = True
)
# Flatten the results
image_results = results_images[0]  # Access the actual result data

In [55]:
# Extract product_ids from the image search results
product_ids = [result.entity.get('p_id') for result in image_results if result.entity.get('p_id')]

# Construct query expression to retrieve product information
query_expr = "product_id in {}".format(product_ids)

# Perform the query on the products collection
product_results = products_collection.query(
    expr=query_expr,
    output_fields=["title", "description", "price", "main_category", "store", "categories"]
)

# Process results
product_details = {result.get('product_id'): {
    'title': result.get('title'),
    'description': result.get('description'),
    'price': result.get('price'),
    'main_category': result.get('main_category'),
    'store': result.get('store'),
    'categories': result.get('categories'),
} for result in product_results}

In [71]:
# Create a dictionary to combine product details and image URLs
combined_product_info = {}
for image in image_results:
    product_id = image.entity.get('p_id')
    image_url = image.entity.get('image_url')
    score = image.distance
    
    if product_id in product_details:
        if product_id not in combined_product_info:
            combined_product_info[product_id] = product_details[product_id]
            combined_product_info[product_id]['image_urls'] = []
            combined_product_info[product_id]['scores'] = []
        combined_product_info[product_id]['image_urls'].append(image_url)
        combined_product_info[product_id]['scores'].append(score)

# Print or use the combined information
for product_id, info in combined_product_info.items():
    print(f"Product ID: {product_id}")
    print(f"Product Image scores: {info['scores']}")
    print(f"Title: {info['title']}")
    print(f"Description: {info['description']}")
    print(f"Price: {info['price']}")
    print(f"Main Category: {info['main_category']}")
    print(f"Store: {info['store']}")
    print(f"Categories: {info['categories']}")
    print(f"Image URLs: {info['image_urls']}")
    print()


Product ID: 451413128074154437
Product Image scores: [0.27994802594184875]
Title: Professional Translucent Loose Setting Powder - Shimmer Friendly Matte Makeup Finishing Powder - Includes Mirror & Puff, 1.76oz (1#)
Description: []
Price: 0.0
Main Category: All Beauty
Store: Ruoxi
Categories: []
Image URLs: ['https://m.media-amazon.com/images/I/71-ekhsaXSL._SL1500_.jpg']

Product ID: 451412599882129153
Product Image scores: [0.2792852520942688, 0.26284554600715637, 0.26255813241004944]
Title: WindMax US Store 15 Cold Smoked Color Glitter Shimmer Pearl Loose Eyeshadow Pigments Mineral Eye Shadow Dust Powder Makeup Party Cosmetic Set #B
Description: []
Price: 5.989999771118164
Main Category: All Beauty
Store: WindMax
Categories: []
Image URLs: ['https://m.media-amazon.com/images/I/91QfwpCtBQL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/81LKXgGo6YL._SL1500_.jpg', 'https://m.media-amazon.com/images/I/71Qh+rruYzL._SL1500_.jpg']

Product ID: 451413128074132189
Product Image scores: [0

In [68]:
combined_product_info

{451413128074154437: {'title': 'Professional Translucent Loose Setting Powder - Shimmer Friendly Matte Makeup Finishing Powder - Includes Mirror & Puff, 1.76oz (1#)',
  'description': [],
  'price': 0.0,
  'main_category': 'All Beauty',
  'store': 'Ruoxi',
  'categories': [],
  'image_urls': ['https://m.media-amazon.com/images/I/71-ekhsaXSL._SL1500_.jpg']},
 451412599882129153: {'title': 'WindMax US Store 15 Cold Smoked Color Glitter Shimmer Pearl Loose Eyeshadow Pigments Mineral Eye Shadow Dust Powder Makeup Party Cosmetic Set #B',
  'description': [],
  'price': 5.99,
  'main_category': 'All Beauty',
  'store': 'WindMax',
  'categories': [],
  'image_urls': ['https://m.media-amazon.com/images/I/91QfwpCtBQL._SL1500_.jpg',
   'https://m.media-amazon.com/images/I/81LKXgGo6YL._SL1500_.jpg',
   'https://m.media-amazon.com/images/I/71Qh+rruYzL._SL1500_.jpg']},
 451413128074132189: {'title': 'CIBBCCI Loose Power Translucent Face Makeup Set 1.7 oz, Oil Control Loose Matte Foundation Powder W

Search by inputting image

In [92]:
image = Image.open('sample_image.jpeg').convert('RGB')  # Ensure image is in RGB format
im_em = generate_image_embeddings(image)
im_em[0]
# im_em.shape

array([-4.59257104e-02, -2.37922147e-02, -5.23333736e-02,  5.64227663e-02,
       -1.76472180e-02,  1.83983929e-02, -3.02431583e-02, -1.45699773e-02,
       -3.80832492e-03, -2.02894509e-02, -3.39875370e-02,  1.27158011e-03,
        2.07519345e-02, -1.90497357e-02, -2.48479862e-02, -2.43882299e-03,
       -4.27429602e-02, -4.74151820e-02, -1.10541005e-02, -2.12755650e-02,
       -1.75152742e-03,  2.24629529e-02,  1.53530892e-02,  6.79044798e-02,
        1.26098860e-02,  3.65654379e-03, -6.30868925e-03, -1.53702013e-02,
       -6.48145080e-02, -3.85261960e-02,  1.40931690e-02, -2.76178662e-02,
       -4.75056879e-02,  8.30687210e-03, -9.72888153e-03, -4.78520915e-02,
       -5.99918291e-02,  2.35417299e-02, -5.28518157e-03,  5.43791130e-02,
       -3.03657968e-02, -9.39881895e-03,  5.16918711e-02,  5.82175935e-03,
       -1.18332721e-01, -4.28053737e-02, -7.61088356e-02,  1.40330894e-02,
       -4.78930175e-02, -1.83694884e-02,  3.56593840e-02,  3.72319855e-02,
        1.09508326e-02,  

In [93]:
results_images = images_collection.search(
    data=[im_em[0]], 
    anns_field="image_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=100,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['p_id','image_url'],
    consistency_level="Strong"
)

In [94]:
for result in results_images[0]:
    print(result)

id: 451412599882141426, distance: 0.6760517954826355, entity: {'p_id': 451412599882141295, 'image_url': 'https://m.media-amazon.com/images/I/61zPxdGwI9L._SL1500_.jpg'}
id: 451413128074159258, distance: 0.675410270690918, entity: {'p_id': 451413128074159004, 'image_url': 'https://m.media-amazon.com/images/I/71uYtV5THAL._SL1500_.jpg'}
id: 451413128074159257, distance: 0.675410270690918, entity: {'p_id': 451413128074159004, 'image_url': 'https://m.media-amazon.com/images/I/71uYtV5THAL._SL1500_.jpg'}
id: 451413128074139929, distance: 0.6673462390899658, entity: {'p_id': 451413128074139797, 'image_url': 'https://m.media-amazon.com/images/I/518-qq91qfL._SL1500_.jpg'}
id: 451413128074126694, distance: 0.6614727973937988, entity: {'p_id': 451413128074126553, 'image_url': 'https://m.media-amazon.com/images/I/41OAqznXVqL._SL1000_.jpg'}
id: 451413128074148098, distance: 0.6595019102096558, entity: {'p_id': 451413128074147857, 'image_url': 'https://m.media-amazon.com/images/I/71TJsJBMw6L._SL1500_.

In [95]:
results = products_collection.search(
    data=[im_em[0]], 
    anns_field="title_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=10,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['title','price'],
    consistency_level="Strong"
)

In [96]:
for result in results[0]:
    print(result)

id: 451413128074147857, distance: 0.2538781762123108, entity: {'title': 'Nicka K Vivid Matte Lipstick NMS02 Red', 'price': 0.0}
id: 451413128074138442, distance: 0.24530798196792603, entity: {'title': 'Matte Lip Velour Lipstick Heart', 'price': 0.0}
id: 451412599882124304, distance: 0.2450072318315506, entity: {'title': 'Maybelline Vivid Matte Lipstick Number 35, Rebel Red', 'price': 0.0}
id: 451413128074128003, distance: 0.24372462928295135, entity: {'title': 'Matte Lip Velour Lipstick Meow', 'price': 0.0}
id: 451412599882134021, distance: 0.23935112357139587, entity: {'title': 'Youngblood Lipstick, Spicy, 4 Gram', 'price': 0.0}
id: 451412599882123270, distance: 0.23474642634391785, entity: {'title': 'Illamasqua Lipstick One Size Magnetism', 'price': 13.5}
id: 451412599882133618, distance: 0.2322523593902588, entity: {'title': 'Avon Ultra Color Lipstick Color Sheer Nectar', 'price': 0.0}
id: 451412599882123543, distance: 0.2310037463903427, entity: {'title': 'Velvet Red Matte Lipstick