## Importing Necessary Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import time
import requests
import json
import torch
import clip
import glob
import pymilvus
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    db
)
import unicodedata
import open_clip
from all_clip import load_clip


In [2]:
# Show the installed PyTorch build next to the results produced below.
print(torch.__version__)

2.2.1+cpu


### Setting Directories and Folders

In [2]:
# Root folders: downloaded images (plus their url/key parquet files) and the
# precomputed image / title embeddings.
IMAGES_URL_DIRECTORY = 'image_folder'
IMAGE_EMBEDDINGS_DIRECTORY = 'image_embeddings_complete'
TITLE_EMBEDDINGS_DIRECTORY = 'title_embeddings'
# Embedding arrays (.npy) and the metadata tables (.parquet) whose row index is
# used later to pick rows out of those arrays.
title_embeddings_path = 'title_embeddings/text_emb/text_emb_0.npy'
image_embeddings_path = 'image_embeddings_complete/img_emb/img_emb_0.npy'
image_embeddings_meta_data_file = 'image_embeddings_complete/metadata/metadata_0.parquet'
title_embeddings_meta_data_file = 'title_embeddings/metadata/metadata_0.parquet'
# Raw string: the Windows path contains `\D`, `\A` and `\m`, which are invalid
# escape sequences in a normal string literal (warning today, error in future
# Python versions). The resulting value is unchanged.
data_path = r'D:\Datasets\Amazon\Amazon Product Dataset\meta_All_Beauty.jsonl'

### Loading the CLIP Model for Inference

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# load_clip returns three objects: the model, (presumably) the image
# preprocessing callable, and the tokenizer used for text queries.
# NOTE(review): `device` is not passed to load_clip here — confirm that
# load_clip picks the same device on its own.
model, preprocess, tokenizer = load_clip(clip_model='open_clip:ViT-B-16')

warming up with batch size 1 on cpu
done warming up in 29.6081964969635s


### Functions for generating Embeddings for Inference

In [4]:
def get_text_query_embedding(query):
    """Encode a text query into an L2-normalised CLIP text embedding.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        query: Text to embed; handed to `tokenizer` as-is.

    Returns:
        The first row of the encoded batch as a float32 numpy array.
    """
    # The tokens have to live on the same device as the model, otherwise
    # encoding fails as soon as the notebook runs with a GPU.
    text_tokens = tokenizer(query).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        # Unit length, so cosine similarity reduces to a dot product.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    text_embs = text_features.cpu().to(torch.float32).numpy()
    return text_embs[0]

def generate_image_embeddings(img):
    """Encode an image tensor into L2-normalised CLIP image embeddings.

    Uses the module-level `model` and `device`.

    Args:
        img: Tensor accepted by `model.encode_image`
            (presumably an already preprocessed, batched image — TODO confirm).

    Returns:
        float32 numpy array with the normalised embeddings, one row per input.
    """
    with torch.no_grad():
        # Move the input next to the model; a CPU tensor fed to a GPU model raises.
        image_embeddings = model.encode_image(img.to(device)).float()
        image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
    return image_embeddings.cpu().to(torch.float32).numpy()

### Function for building the URL-to-key dictionary

In [5]:
def creating_images_url_dictionary(path = IMAGES_URL_DIRECTORY):
    """Build a lookup from image URL to image key from the parquet files in `path`.

    Args:
        path: Folder that directly contains `*.parquet` files with at least
            the columns `url` and `key`.

    Returns:
        dict mapping each URL to its key as a string. When a URL occurs more
        than once, the occurrence read last wins.
    """
    url_key_dict = {}

    # glob returns files in an unspecified order; sorting makes the result
    # (in particular which key wins for a duplicated URL) reproducible.
    for file_path in sorted(glob.glob(f'{path}/*.parquet')):
        # Only the two columns used below are read from disk.
        data = pd.read_parquet(file_path, columns=['url', 'key'])

        # Keys are compared against string keys later on, so force str here.
        keys = data['key'].astype(str)
        url_key_dict.update(zip(data['url'], keys))

    return url_key_dict

### Function for preparing the image metadata and extracting the image keys

In [6]:
def preparing_image_meta_data(image_embeddings_meta_data_file= image_embeddings_meta_data_file):
    """Read the image metadata parquet file and add a `key` column.

    The key is the first run of nine digits found in each `image_path` value.

    Returns:
        The metadata DataFrame including the new `key` column.
    """
    metadata = pd.read_parquet(image_embeddings_meta_data_file)
    nine_digit_keys = metadata['image_path'].str.extract(r'(\d{9})')
    metadata['key'] = nine_digit_keys
    return metadata


### Functions for getting the index of image and title embeddings

In [7]:
def get_index_of_image_embedding(image_embeddings_metadata, image_key):
    """Return the index label of the first metadata row whose `key` equals `image_key`.

    Returns:
        The index label of the first match, or the literal `False` when no
        row matches. Label 0 is a valid result, so callers must not rely on
        truthiness to detect a miss.
    """
    is_match = image_embeddings_metadata['key'] == image_key
    matching_labels = image_embeddings_metadata[is_match].index
    if matching_labels.empty:
        return False
    return matching_labels[0]

def get_index_of_title_embedding(title_embeddings_metadata, title_key):
    """Return the index label of the first metadata row whose `caption` equals `title_key`.

    Returns:
        The index label of the first match, or the literal `False` when no
        row matches. Label 0 is a valid result, so callers must not rely on
        truthiness to detect a miss.
    """
    is_match = title_embeddings_metadata['caption'] == title_key
    matching_labels = title_embeddings_metadata[is_match].index
    if matching_labels.empty:
        return False
    return matching_labels[0]

### Function for extracting the urls

In [8]:
def extract_img_urls(image_array, max_urls=3):
    """Collect up to `max_urls` image URLs, preferring `hi_res` over `large`.

    Args:
        image_array: Iterable of dicts that may carry `hi_res` and/or `large`
            URLs. `None` or an empty iterable yields an empty list.
        max_urls: Maximum number of URLs to return (default 3, the previously
            hard-coded limit). `None` means no limit.

    Returns:
        List of URLs in input order. Items without a usable URL are reported
        on stdout and skipped.
    """
    urls = []
    if max_urls is not None and max_urls <= 0:
        return urls

    for item in image_array or []:
        # Empty strings and None count as "no URL", exactly like a missing key.
        url = item.get('hi_res') or item.get('large')
        if not url:
            print(f"Key 'hi_res' and 'large' not found in item: {item}")
            continue
        urls.append(url)
        if max_urls is not None and len(urls) >= max_urls:
            break
    return urls

### Functions to get the image and title embeddings

In [10]:
def get_image_embedding(all_image_embeddings, index):
    """Return row `index` of `all_image_embeddings`.

    For an index outside `0 .. len - 1` a message is printed and the
    one-element sentinel `[0]` is returned instead of raising.
    """
    if 0 <= index < len(all_image_embeddings):
        return all_image_embeddings[index]
    print(f'{index} index is not in all the image embeddings!')
    return [0]

def get_title_embedding(title_embeddings, index):
    """Return the entry of `title_embeddings` at position `index` (no bounds check)."""
    selected_embedding = title_embeddings[index]
    return selected_embedding

### Loading the Image and Title embeddings

In [9]:
# Load the precomputed embedding arrays from the .npy files configured above.
all_image_embeddings = np.load(image_embeddings_path)
title_embeddings = np.load(title_embeddings_path)

In [12]:
# Number of image embedding rows that were loaded.
print(len(all_image_embeddings))
# title_embeddings[5]

548083


In [13]:
# Spot-check the norm of one stored image embedding
# (presumably expected to be close to 1 for normalised vectors — confirm).
magnitude = np.linalg.norm(all_image_embeddings[53464])
magnitude

0.998

### Loading the Title and Image Metadata Files

In [10]:
# Metadata table for the title embeddings; its `caption` column is used for title lookups.
title_embeddings_metadata = pd.read_parquet(title_embeddings_meta_data_file)

In [15]:
# Load the image metadata (with the derived `key` column), preview it and show its row count.
image_embeddings_metadata = preparing_image_meta_data()
print(image_embeddings_metadata.head())
print(len(image_embeddings_metadata))

                         image_path        key
0  image_folder/00000/000000000.jpg  000000000
1  image_folder/00000/000000001.jpg  000000001
2  image_folder/00000/000000002.jpg  000000002
3  image_folder/00000/000000003.jpg  000000003
4  image_folder/00000/000000004.jpg  000000004
548083


In [16]:
# Build the URL -> key lookup from the parquet files and report how many URLs it holds.
images_url_dictionary = creating_images_url_dictionary()
print(len(images_url_dictionary))

478384


In [17]:
# Number of dictionary entries to preview.
n = 2
# View the first n items of the dictionary
first_n_items = list(images_url_dictionary.items())[:n]
first_n_items

[('https://m.media-amazon.com/images/I/517uoA+-gzL._SL1005_.jpg', '000000016'),
 ('https://m.media-amazon.com/images/I/510BWq7O95L._SL1005_.jpg', '000000015')]

### Connecting to the Milvus Database

In [11]:
# Connect to the Milvus server on localhost:19530 under the alias "default".
# NOTE(review): confirm what connections.connect() returns — `client` may not
# hold a usable handle.
client = connections.connect("default", host="localhost", port="19530")

In [12]:
# List the databases that exist on the connected server.
db.list_database()

['default', 'Products', 'My_Products']

In [13]:
# database = db.create_database("My_Products")
# Select the existing "Products" database for the collections used below.
db.using_database("Products")

### Creating Field Schemas for the Collections

In [14]:
# Schema of the products collection: an auto-generated INT64 primary key, a
# 512-dimensional title vector, and scalar / array product attributes.
# NOTE(review): 512 presumably matches the embedding size of the CLIP model
# loaded above — confirm before switching models.
fields = [
    FieldSchema(name="product_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=512),  
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length = 5000),
    FieldSchema(name="average_rating", dtype=DataType.FLOAT),
    FieldSchema(name="features", dtype=DataType.ARRAY, max_capacity = 1000, element_type = DataType.VARCHAR, max_length = 10000),
    FieldSchema(name="description", dtype=DataType.ARRAY, max_capacity = 500, element_type = DataType.VARCHAR, max_length = 50000),
    FieldSchema(name="categories", dtype=DataType.ARRAY, max_capacity = 500, element_type = DataType.VARCHAR, max_length = 500),
    FieldSchema(name="price", dtype=DataType.FLOAT),
    FieldSchema(name="store", dtype=DataType.VARCHAR, max_length = 500),
    FieldSchema(name="main_category", dtype=DataType.VARCHAR, max_length = 500)
]

# Schema of the images collection: one row per image, tied to its product
# through `p_id` (which stores a `product_id` from the products collection).
fields_images = [
    FieldSchema(name="image_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="p_id", dtype=DataType.INT64),  # Foreign key to Products
    FieldSchema(name="image_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="image_url", dtype=DataType.VARCHAR, max_length=10000)
]

### Creating the collections

In [15]:
# Bind the product field list to the 'products' collection.
# NOTE(review): presumably this creates the collection when it does not exist
# yet and otherwise opens the existing one — confirm against the pymilvus docs.
product_schema = CollectionSchema(fields, description="Products collection")
products_collection = Collection(name = 'products', schema=product_schema)

In [16]:
# Bind the image field list to the 'images' collection.
# NOTE(review): presumably this creates the collection when it does not exist
# yet and otherwise opens the existing one — confirm against the pymilvus docs.
image_schema = CollectionSchema(fields_images, description='Images Collection')
images_collection = Collection(name = 'images', schema = image_schema)

### Creating the Indices for the collections and loading them

In [24]:
# One index definition shared by both vector fields: IVF_FLAT with nlist=256
# and the COSINE metric.
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 256}
}

# Index the title vectors, then load the products collection.
products_collection.create_index(field_name="title_vector", index_params = index_params)
products_collection.load()
# products_collection.release()
# products_collection.drop_index()

# Same two steps for the image vectors.
images_collection.create_index(field_name="image_vector", index_params = index_params)
images_collection.load()
# images_collection.release()
# images_collection.drop_index()

## Functions for pre-processing the products and storing them in the Milvus Database in Batches

In [113]:
def clean_text(text):
    """Return `text` with every non-ASCII character (code point >= 128) removed.

    Falsy input (None, empty string) yields an empty string.
    """
    if text:
        # Keep only 7-bit ASCII characters.
        return ''.join(ch for ch in text if ord(ch) < 128)
    return ""

def insert_product_batch(products):
    """Insert `products` into `products_collection` and flush it; do nothing for an empty batch."""
    if not products:
        return
    products_collection.insert(products)
    products_collection.flush()

def insert_image_batch(images):
    """Insert `images` into `images_collection` and flush it; do nothing for an empty batch."""
    if not images:
        return
    images_collection.insert(images)
    images_collection.flush()

def process_line(line, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary):
    """Turn one JSONL product record into a product row plus its image embeddings.

    Args:
        line: One line of the product JSONL file.
        title_embeddings_metadata: DataFrame with a `caption` column, used to
            find the row of the title embedding.
        title_embeddings: Indexable collection of title embeddings.
        image_embeddings_metadata: DataFrame with a `key` column, used to find
            the row of each image embedding.
        all_image_embeddings: Indexable collection of image embeddings.
        images_url_dictionary: dict mapping image URL -> image key.

    Returns:
        `(product_data, current_image_embeddings)` where `product_data` is the
        dict for the products collection and `current_image_embeddings` is a
        list of `(embedding, url)` tuples; `(None, None)` when the line is
        blank, the title contains a double quote, or no title embedding exists.
    """
    line = line.strip()
    if not line:
        # Blank lines (e.g. a trailing newline at EOF) carry no record.
        return None, None
    data = json.loads(line)

    raw_title = data.get('title') or ''
    # Titles containing a double quote are skipped (unchanged from before).
    if '"' in raw_title:
        return None, None

    title = clean_text(raw_title)
    title_index = get_index_of_title_embedding(title_embeddings_metadata, title)
    # The lookup helper reports "not found" with the literal False. Compare by
    # identity: row index 0 is valid but falsy, so `not title_index` used to
    # drop the very first title.
    if title_index is False or title_index is None:
        return None, None
    title_embedding = get_title_embedding(title_embeddings, title_index)

    # Missing or empty fields fall back to neutral defaults.
    product_data = {
        'title_vector': title_embedding.tolist(),
        'title': title,
        'average_rating': data.get('average_rating') or 0.0,
        'features': data.get('features') or [],
        'description': data.get('description') or [],
        'categories': data.get('categories') or [],
        'price': data.get('price') or 0.0,
        'store': data.get('store') or '',
        'main_category': data.get('main_category') or '',
    }

    # Resolve each image URL to its stored embedding via URL -> key -> row index.
    current_image_embeddings = []
    for url in extract_img_urls(data.get('images') or []):
        image_key = images_url_dictionary.get(url)
        if not image_key:
            continue

        image_index = get_index_of_image_embedding(image_embeddings_metadata, image_key)
        # Same identity check as for titles: image row 0 is a valid hit.
        if image_index is False or image_index is None:
            print('no image index')
            continue

        image_embedding = get_image_embedding(all_image_embeddings, image_index)
        # get_image_embedding returns the one-element sentinel [0] on a bad index.
        if len(image_embedding) > 1:
            current_image_embeddings.append((image_embedding, url))

    return product_data, current_image_embeddings

def _store_product_batch(product_batch):
    """Insert one batch of products, then their images linked by the generated product ids."""
    products_data = [product for product, _ in product_batch]
    insert_result = products_collection.insert(products_data)
    products_collection.flush()

    # The insert result carries the auto-generated primary keys in insertion
    # order, so each product is paired with its own id directly. Looking the
    # ids up by title was ambiguous for duplicate titles and required building
    # a filter expression from raw title strings.
    image_batch = []
    for product_id, (_, image_pairs) in zip(insert_result.primary_keys, product_batch):
        for image_embedding, image_url in image_pairs:
            image_batch.append({
                'p_id': product_id,
                'image_vector': image_embedding.tolist(),
                'image_url': image_url,
            })

    insert_image_batch(image_batch)


def create_and_store_data(data_path, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary, batch_size=100):
    """Read the product JSONL file and store products and images in Milvus in batches.

    Only products for which `process_line` returns both a product row and at
    least one image embedding are stored.

    Args:
        data_path: Path of the JSONL file, one product per line.
        title_embeddings_metadata, title_embeddings, image_embeddings_metadata,
        all_image_embeddings, images_url_dictionary: Passed through unchanged
            to `process_line`.
        batch_size: Number of products written per insert (default 100, the
            previously hard-coded value).
    """
    product_batch = []

    # Read as UTF-8 explicitly; the platform default encoding (e.g. cp1252 on
    # Windows) can fail on or mis-decode non-ASCII characters.
    with open(data_path, encoding='utf-8') as file:
        for line in file:
            product_data, current_image_embeddings = process_line(line, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary)
            if not (product_data and current_image_embeddings):
                continue

            product_batch.append((product_data, current_image_embeddings))
            if len(product_batch) >= batch_size:
                _store_product_batch(product_batch)
                product_batch = []

    # Insert any remaining data
    if product_batch:
        _store_product_batch(product_batch)

In [None]:
# Run the data creation and storage process
# create_and_store_data(data_path, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary)

### Sample search Query

In [25]:
# Look up one product by exact title and display its stored title vector.
title = "Auric Blends Stella Blue"
product_ids = products_collection.query(expr=f'title == "{title}"', output_fields=["title", 'features', 'price', 'product_id', 'title_vector'])
# NOTE(review): indexing [0] assumes the query returned at least one product.
title_v_embedding = product_ids[0]["title_vector"]
# title_v_embedding = np.float16(title_v_embedding)
title_v_embedding

[0.016479492,
 -0.04272461,
 -0.0146484375,
 0.014465332,
 0.033691406,
 -0.040039062,
 -0.06225586,
 -0.04321289,
 0.026123047,
 0.023925781,
 -0.052978516,
 -0.042236328,
 0.018066406,
 0.026123047,
 -0.05419922,
 -0.08642578,
 0.024780273,
 0.002243042,
 -0.044189453,
 -0.032714844,
 -0.012512207,
 0.01574707,
 0.011230469,
 -0.022338867,
 -0.0015487671,
 -0.029785156,
 -0.032958984,
 0.04638672,
 -0.060791016,
 0.1015625,
 0.030883789,
 0.020141602,
 0.012573242,
 0.018920898,
 0.045654297,
 -0.01928711,
 -0.10595703,
 -0.053222656,
 0.03955078,
 -0.0625,
 0.020629883,
 0.029418945,
 0.00076293945,
 -0.032226562,
 -0.043945312,
 0.09375,
 0.024047852,
 0.013549805,
 -0.024291992,
 -0.016113281,
 0.01977539,
 0.08251953,
 0.010009766,
 0.012207031,
 -0.061767578,
 -0.010437012,
 -0.012329102,
 -0.010986328,
 -0.00025749207,
 -0.0061035156,
 0.0033721924,
 0.021118164,
 0.041503906,
 0.044189453,
 0.030517578,
 -0.006439209,
 -0.048583984,
 -0.03540039,
 -0.03112793,
 -0.0047912598,


### Defining Search Parameters

In [35]:
# Search settings shared by the product and image searches below. The metric
# matches the COSINE metric the indexes were created with.
# NOTE(review): nprobe presumably is the number of IVF clusters probed per
# query (the index uses nlist=256) — confirm against the Milvus docs.
search_params = {
    "metric_type": "COSINE", 
    "offset": 0, 
    "ignore_growing": False, 
    "params": {"nprobe": 20}
}

In [36]:
# Embed a free-text query with the CLIP text encoder; the bare last line
# displays the resulting vector.
sample_query = 'Perfume For Women'
query_embedding = get_text_query_embedding(sample_query)
# sample_query = products[1]['Image']
# sample_query_embedding = generate_text_embeddings(sample_query)
# sample_query_embedding = generate_image_embeddings(sample_query)
# sample_query_embedding
# len(embedding) == len(title_v_embedding)
query_embedding

array([-2.32593361e-02, -2.88339853e-02,  2.17014905e-02,  9.08644777e-03,
       -6.12214394e-03,  2.55541820e-02, -6.32908195e-03, -1.68123357e-02,
        2.67626848e-02, -8.77483469e-03, -3.89870442e-03,  7.46044703e-03,
       -1.27959568e-02,  5.15768379e-02, -8.85808934e-03,  2.95048696e-04,
       -1.29163275e-02, -8.05139914e-03,  3.15496959e-02,  2.55601872e-02,
        4.26098257e-02, -8.62112269e-04,  1.75491199e-02,  2.93029170e-03,
        1.05666462e-02, -5.66374548e-02, -3.31582539e-02, -3.41164991e-02,
        5.30110020e-03, -2.95261797e-02, -2.73504891e-02, -5.85705217e-04,
       -1.95997916e-02,  4.35990281e-02,  6.70069084e-03,  2.48932745e-04,
       -1.09991327e-01, -3.90833477e-03, -4.54629539e-03, -2.71318797e-02,
       -4.23978344e-02, -1.10909808e-02,  2.77142040e-02, -1.65455025e-02,
        1.23184416e-02, -5.21923378e-02, -3.95096280e-02, -1.17282094e-02,
       -1.40293706e-02, -7.44394725e-03,  4.26131580e-03,  4.84202988e-03,
        1.50362644e-02, -

In [37]:
# Text-to-title search: find the 10 products whose title vectors are closest
# to the query embedding.
results = products_collection.search(
    data=[query_embedding], 
    anns_field="title_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=10,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['title','price'],
    consistency_level="Strong"
)

In [48]:
# results[0] holds the hits for the single query vector; print each one.
for result in results[0]:
    # print(result[0].entity.get('title'),'\n')
    print(result.entity)

id: 451064611036296840, distance: 0.8905419707298279, entity: {'title': 'Perfume con Feromonas para Mujeres para Atraer a los Hombres Fuerte Efecto (#2)', 'price': 0.0}
id: 451064611036304306, distance: 0.8831300139427185, entity: {'title': '10 Best Selling Perfumes Sample Vial for Women', 'price': 0.0}
id: 451064611036726658, distance: 0.8495831489562988, entity: {'title': 'Body Lotion', 'price': 29.989999771118164}
id: 451064611036189658, distance: 0.8417010307312012, entity: {'title': 'cosmetic', 'price': 0.0}
id: 451064611036193093, distance: 0.8407971858978271, entity: {'title': 'High Class Woman Perfume 3.4oz by Sandora', 'price': 0.0}
id: 451064611036347943, distance: 0.8291282653808594, entity: {'title': 'Stella Hair Fragrance', 'price': 0.0}
id: 451064611036326937, distance: 0.8256978392601013, entity: {'title': 'Zermat Fragrance for Men Embleme, Perfume Para Caballero', 'price': 35.25}
id: 451064611036215353, distance: 0.8218688368797302, entity: {'title': 'Foundation Makeup'

In [39]:
# Text-to-image search: compare the same text query embedding against the
# stored image vectors and return the 10 closest images.
results_images = images_collection.search(
    data=[query_embedding], 
    anns_field="image_vector", 
    # the sum of `offset` in `param` and `limit` 
    # should be less than 16384.
    param=search_params,
    limit=10,
    expr=None,
    # set the names of the fields you want to 
    # retrieve from the search result.
    output_fields=['p_id','image_url'],
    consistency_level="Strong"
)

In [78]:
# Assuming results_images is the variable holding your search results
p_id_list = []

for result in results_images[0]:
    p_id = result.entity.get('p_id')
    p_id_list.append(p_id)

print(p_id_list)


[451064611036339180, 451064611036269286, 451064611036323067, 451064611036323067, 451064611036323067, 451064611036328603, 451064611036365385, 451064611036341937, 451064611036358699, 451064611036203532]


In [77]:
# Print each image hit (id, distance and requested fields), then the type of
# the search result container.
for result in results_images[0]:
    # print(result[0].entity.get('title'),'\n')
    # print(type(result))
    print(result)

print(type(results_images))

id: 451064611036339349, distance: 0.2894721031188965, entity: {'p_id': 451064611036339180, 'image_url': 'https://m.media-amazon.com/images/I/71e5Bfbzw1L._SL1500_.jpg'}
id: 451064611036269466, distance: 0.28619927167892456, entity: {'p_id': 451064611036269286, 'image_url': 'https://m.media-amazon.com/images/I/81XJO7HPgkL._SL1500_.jpg'}
id: 451064611036323194, distance: 0.28181228041648865, entity: {'p_id': 451064611036323067, 'image_url': 'https://m.media-amazon.com/images/I/41ofsJx4ilL.jpg'}
id: 451064611036323193, distance: 0.28181228041648865, entity: {'p_id': 451064611036323067, 'image_url': 'https://m.media-amazon.com/images/I/41ofsJx4ilL.jpg'}
id: 451064611036323192, distance: 0.28181228041648865, entity: {'p_id': 451064611036323067, 'image_url': 'https://m.media-amazon.com/images/I/41ofsJx4ilL.jpg'}
id: 451064611036328710, distance: 0.277422696352005, entity: {'p_id': 451064611036328603, 'image_url': 'https://m.media-amazon.com/images/I/81ECEcDr3NL._SL1500_.jpg'}
id: 451064611036

In [84]:
# Fetch the full product rows for the product ids behind the image hits.
# NOTE(review): p_id_list contains duplicates and the rows returned by query()
# are presumably not ordered like p_id_list, so position 4 below may not
# correspond to the fifth image hit — confirm.
matching_results  = products_collection.query(expr=f'product_id in {p_id_list}', output_fields=['title','price', 'average_rating', 'features', 'description', 'categories', 'store', 'main_category'])
# title_v_embedding = product_ids[0]["title_vector"]
matching_results[4]

{'store': 'WHISKY',
 'title': 'WHISKY - WHISKY FOR WOMEN 75 ml - Women - 75ML - White',
 'average_rating': 4.0,
 'features': ['Whiskey for women, an eternal perfume',
  'Its luxurious and elegant design W-shape emphasizes its uniqueness and embodies the beauty',
  'More than a fragrance it is a precious object containing a magical and refined essence',
  'A mixture of both sweet and intense perfectly combining jasmine and plum on a bed of vanilla and musk',
  'Dive into its world, let yourself be intoxicated by his delicacy! Top notes: Bergamot, Cinnamon Bark, Plum'],
 'categories': [],
 'price': 0.0,
 'description': ['Whiskey for women, an eternal perfume. Its luxurious and elegant design W-shape emphasizes its uniqueness and embodies the beauty. More than a fragrance it is a precious object containing a magical and refined essence. A mixture of both sweet and intense perfectly combining jasmine and plum on a bed of vanilla and musk. Dive into its world, let yourself be intoxicated by