## Importing Necessary Libraries

In [96]:
import os
import numpy as np
import pandas as pd
import time
import requests
import json
import torch
# import clip
import glob
import pymilvus
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    db
)
import unicodedata


### Setting Directories and Folders

In [2]:
# Directory that is globbed for the *.parquet files mapping image URL -> key.
IMAGES_URL_DIRECTORY = 'image_folder'
# Root directories of the precomputed image / title embedding outputs.
IMAGE_EMBEDDINGS_DIRECTORY = 'image_embeddings_complete'
TITLE_EMBEDDINGS_DIRECTORY = 'title_embeddings'

# Embedding matrices (.npy) and their row-aligned metadata (.parquet), derived
# from the directory constants so that the two cannot drift apart.
title_embeddings_path = f'{TITLE_EMBEDDINGS_DIRECTORY}/text_emb/text_emb_0.npy'
image_embeddings_path = f'{IMAGE_EMBEDDINGS_DIRECTORY}/img_emb/img_emb_0.npy'
image_embeddings_meta_data_file = f'{IMAGE_EMBEDDINGS_DIRECTORY}/metadata/metadata_0.parquet'
title_embeddings_meta_data_file = f'{TITLE_EMBEDDINGS_DIRECTORY}/metadata/metadata_0.parquet'

# Raw string: the backslashes of this Windows path must not be read as escape
# sequences ('\D', '\A', '\m' are invalid escapes and trigger a warning).
# NOTE(review): machine-specific absolute path; adjust for your environment.
data_path = r'D:\Datasets\Amazon\Amazon Product Dataset\meta_All_Beauty.jsonl'

### Function for creating the URL-to-key dictionary

In [3]:
def creating_images_url_dictionary(path = IMAGES_URL_DIRECTORY):
    """Build a ``{url: key}`` dictionary from every ``*.parquet`` file in ``path``.

    Each parquet file is read with pandas and must provide a ``url`` and a
    ``key`` column. Keys are converted to strings. When the same URL occurs
    more than once, the entry read last overwrites the earlier one.

    Args:
        path: Directory whose ``*.parquet`` files are read.

    Returns:
        dict mapping each URL to its key as a string.
    """
    url_to_key = {}

    for parquet_file in glob.glob(f'{path}/*.parquet'):
        frame = pd.read_parquet(parquet_file)
        # Keys are stored as text so they compare equal to string keys elsewhere.
        keys_as_text = frame['key'].astype(str)
        url_to_key.update(zip(frame['url'], keys_as_text))

    return url_to_key

### Function for preparing the image metadata and extracting the image keys

In [43]:
def preparing_image_meta_data(image_embeddings_meta_data_file= image_embeddings_meta_data_file):
    """Load the image-embedding metadata and add a ``key`` column.

    The key is the first run of nine consecutive digits found in each
    ``image_path`` value (rows without such a run get a missing value).

    Args:
        image_embeddings_meta_data_file: Path of the metadata parquet file.

    Returns:
        The loaded DataFrame with the additional ``key`` column.
    """
    metadata = pd.read_parquet(image_embeddings_meta_data_file)
    nine_digit_key = metadata['image_path'].str.extract(r'(\d{9})')
    metadata['key'] = nine_digit_key
    return metadata


### Functions for getting the index of image and title embeddings

In [7]:
def get_index_of_image_embedding(image_embeddings_metadata, image_key):
    """Return the index label of the first metadata row whose ``key`` equals ``image_key``.

    Args:
        image_embeddings_metadata: DataFrame with a ``key`` column.
        image_key: Key value to look for.

    Returns:
        The index label of the first matching row, or ``False`` when no row
        matches. A match on the very first row yields ``0``, which is falsy
        just like ``False`` - callers must not rely on plain truthiness.
    """
    matching_rows = image_embeddings_metadata.loc[image_embeddings_metadata['key'] == image_key]
    labels = matching_rows.index
    return False if labels.empty else labels[0]

def get_index_of_title_embedding(title_embeddings_metadata, title_key):
    """Return the index label of the first metadata row whose ``caption`` equals ``title_key``.

    Args:
        title_embeddings_metadata: DataFrame with a ``caption`` column.
        title_key: Title text to look for (exact match).

    Returns:
        The index label of the first matching row, or ``False`` when no row
        matches. A match on the very first row yields ``0``, which is falsy
        just like ``False`` - callers must not rely on plain truthiness.
    """
    matching_rows = title_embeddings_metadata.loc[title_embeddings_metadata['caption'] == title_key]
    labels = matching_rows.index
    return False if labels.empty else labels[0]

### Function for extracting the image URLs

In [8]:
def extract_img_urls(image_array, max_urls=3):
    """Collect up to ``max_urls`` image URLs, preferring ``hi_res`` over ``large``.

    Args:
        image_array: Iterable of dicts that may carry ``hi_res`` and/or
            ``large`` URL entries. ``None`` or an empty iterable yields ``[]``.
        max_urls: Maximum number of URLs to return (default 3, the limit that
            was previously hard-coded).

    Returns:
        List of URLs in input order. For each item the ``hi_res`` URL is used
        when present and non-empty, otherwise the ``large`` URL. Items that
        provide neither are reported on stdout and skipped.
    """
    urls = []
    if not image_array or max_urls <= 0:
        return urls

    for item in image_array:
        # Non-dict entries cannot carry a URL; treat them like an item without one.
        url = (item.get('hi_res') or item.get('large')) if isinstance(item, dict) else None
        if not url:
            print(f"Key 'hi_res' and 'large' not found in item: {item}")
            continue
        urls.append(url)
        if len(urls) >= max_urls:
            break
    return urls

### Functions to get the image and title embeddings

In [79]:
def get_image_embedding(all_image_embeddings, index):
    """Return the embedding stored at position ``index``.

    Args:
        all_image_embeddings: Sequence/array of embeddings.
        index: Position of the wanted embedding.

    Returns:
        ``all_image_embeddings[index]`` when ``0 <= index < len(...)``;
        otherwise a message is printed and the placeholder ``[0]`` is
        returned (negative indices are rejected, not wrapped around).
    """
    total = len(all_image_embeddings)
    if 0 <= index < total:
        return all_image_embeddings[index]

    print(f'{index} index is not in all the image embeddings!')
    return [0]

def get_title_embedding(title_embeddings, index):
    """Return the title embedding stored at ``index``.

    No bounds check is performed; indexing errors raised by
    ``title_embeddings`` propagate to the caller.
    """
    embedding = title_embeddings[index]
    return embedding

## Loading the Image and Title embeddings

In [73]:
# Load the precomputed embedding arrays from their .npy files. Rows are later
# fetched by position through get_image_embedding / get_title_embedding.
all_image_embeddings = np.load(image_embeddings_path)
title_embeddings = np.load(title_embeddings_path)

In [77]:
# Sanity check: number of image embedding rows that were loaded.
print(len(all_image_embeddings))

548083


In [62]:
# Title metadata; get_index_of_title_embedding matches titles against its
# 'caption' column.
title_embeddings_metadata = pd.read_parquet(title_embeddings_meta_data_file)

In [47]:
# Image metadata with the 9-digit 'key' column derived from 'image_path'.
image_embeddings_metadata = preparing_image_meta_data()
# Preview the first rows and the row count (compare with the number of
# image embeddings printed earlier).
print(image_embeddings_metadata.head())
print(len(image_embeddings_metadata))

                         image_path        key
0  image_folder/00000/000000000.jpg  000000000
1  image_folder/00000/000000001.jpg  000000001
2  image_folder/00000/000000002.jpg  000000002
3  image_folder/00000/000000003.jpg  000000003
4  image_folder/00000/000000004.jpg  000000004
548083


In [13]:
# URL -> key lookup built from the parquet files in IMAGES_URL_DIRECTORY.
images_url_dictionary = creating_images_url_dictionary()
# Number of distinct URLs in the lookup.
print(len(images_url_dictionary))

478384


In [14]:
# Number of dictionary entries to preview.
n = 2
# View the first n (url, key) items of the dictionary
first_n_items = list(images_url_dictionary.items())[:n]
first_n_items

[('https://m.media-amazon.com/images/I/517uoA+-gzL._SL1005_.jpg', '000000016'),
 ('https://m.media-amazon.com/images/I/510BWq7O95L._SL1005_.jpg', '000000015')]

### Connecting to the Milvus Database

In [18]:
# Register and open the "default" connection alias to Milvus at localhost:19530.
# NOTE(review): `client` is not used by any later cell visible here.
client = connections.connect("default", host="localhost", port="19530")

In [19]:
# Show the databases available on the connected Milvus server.
db.list_database()

['default', 'Products', 'My_Products']

In [30]:
# Select the "Products" database for the collection operations that follow.
# (Kept for reference: one-off creation of a separate database.)
# database = db.create_database("My_Products")
db.using_database("Products")

### Creating Field Schemas for the Collections

In [109]:
# Field layout of the products collection: an auto-generated INT64 primary key,
# one 512-dimensional title vector and the scalar / array product attributes.
# NOTE(review): assumes the title and image embeddings have 512 components - confirm.
fields = [
    FieldSchema(name="product_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=512),  
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length = 5000),
    FieldSchema(name="average_rating", dtype=DataType.FLOAT),
    FieldSchema(name="features", dtype=DataType.ARRAY, max_capacity = 1000, element_type = DataType.VARCHAR, max_length = 10000),
    FieldSchema(name="description", dtype=DataType.ARRAY, max_capacity = 500, element_type = DataType.VARCHAR, max_length = 50000),
    FieldSchema(name="categories", dtype=DataType.ARRAY, max_capacity = 500, element_type = DataType.VARCHAR, max_length = 500),
    FieldSchema(name="price", dtype=DataType.FLOAT),
    FieldSchema(name="store", dtype=DataType.VARCHAR, max_length = 500),
    FieldSchema(name="main_category", dtype=DataType.VARCHAR, max_length = 500)
]

# Field layout of the images collection: one row per image, linked to its
# product through p_id.
fields_images = [
    FieldSchema(name="image_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="p_id", dtype=DataType.INT64),  # Foreign key to Products
    FieldSchema(name="image_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="image_url", dtype=DataType.VARCHAR, max_length=10000)
]

### Creating the collections

In [110]:
# Wrap the product fields in a schema and bind it to the 'products' collection.
# NOTE(review): presumably creates the collection when it does not exist yet -
# confirm against the pymilvus Collection documentation.
product_schema = CollectionSchema(fields, description="Products collection")
products_collection = Collection(name = 'products', schema=product_schema)

In [111]:
# Wrap the image fields in a schema and bind it to the 'images' collection.
# NOTE(review): presumably creates the collection when it does not exist yet -
# confirm against the pymilvus Collection documentation.
image_schema = CollectionSchema(fields_images, description='Images Collection')
images_collection = Collection(name = 'images', schema = image_schema)

### Creating the Indices for the collections and loading them

In [112]:
# Shared vector-index settings for both collections: IVF_FLAT index with
# nlist=128, compared using the COSINE metric.
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}

# Index the title vectors first, then load the products collection.
products_collection.create_index(field_name="title_vector", index_params = index_params)
products_collection.load()

# Same order for the images collection: index the image vectors, then load.
images_collection.create_index(field_name="image_vector", index_params = index_params)
images_collection.load()

In [25]:
# def create_and_store_data():
#     i = 0
#     with open(data_path) as file:
#         for line in file:
#             i += 1
#             if i < 1000: 
#                 continue
#             data = json.loads(line.strip())
#             if '"' in data['title']:
#                 continue
            
#             title = data['title']
#             title_index = get_index_of_title_embedding(title_embeddings_metadata, title)
#             if not title_index:
#                 continue
#             title_embedding = get_title_embedding(title_embeddings, title_index)
#             # break
            
#             # Extract product information
#             average_rating = data['average_rating']
#             features = data['features']
#             description = data['description']
#             categories = data['categories']
#             price = data['price']
#             main_category = data['main_category']
#             store = data['store']
#             if not price:
#                 price = 0.0
#             if not store:
#                 store = ''
#             if not main_category:
#                 main_category = ''
#             if not average_rating:
#                 average_rating = 0.0
            
            
#             # Prepare product data
#             product_data = {
#                 'title_vector': title_embedding.tolist(),
#                 'title': title,
#                 'average_rating': average_rating,
#                 'features': features,
#                 'description': description,
#                 'categories': categories,
#                 'price': price,
#                 'store': store,
#                 'main_category': main_category,
#             }
            
#             # Insert product data into the products collection
#             products_collection.insert([product_data])
#             # Flush to ensure data is written
#             products_collection.flush()
            
#             # Retrieve product ID
#             product_ids = products_collection.query(expr=f'title == "{title}"', output_fields=["product_id"])
#             product_id = product_ids[0]["product_id"]
            
#             # Extract and process image URLs
#             image_urls = extract_img_urls(data['images'])
#             current_image_embeddings = []
#             for url in image_urls:
                
#                 #getting the image key stored in the dictionary
#                 if url in images_url_dictionary and images_url_dictionary[url]:
#                     image_key = images_url_dictionary[url]
#                 else:
#                     continue
#                 # print(image_key)
#                 image_index = get_index_of_image_embedding(image_embeddings_metadata, image_key)
#                 # print(image_index)
#                 if not image_index:
#                     continue
#                 image_embedding = get_image_embedding(all_image_embeddings, image_index)
                
#                 current_image_embeddings.append((image_embedding, url))

#             # Insert each image embedding with the associated product ID
#             for image_embedding, image_url in current_image_embeddings:
#                 image_data = {
#                     'p_id': product_id,
#                     'image_vector': image_embedding.tolist(),
#                     'image_url': image_url,
#                 }
#                 images_collection.insert([image_data])
#             # Flush to ensure data is written
#             images_collection.flush()
            
#             print(f"Inserted product ID: {product_id} with {len(current_image_embeddings)} images")

In [124]:
# create_and_store_data()

Inserted product ID: 451053681282423388 with 2 images
Inserted product ID: 451053681282423396 with 3 images
Inserted product ID: 451053681282423414 with 3 images
Inserted product ID: 451053681282423424 with 1 images
Inserted product ID: 451053681282423428 with 3 images
Inserted product ID: 451053681282423438 with 3 images
Inserted product ID: 451053681282423448 with 3 images
Inserted product ID: 451053681282423458 with 3 images
Inserted product ID: 451053681282423468 with 3 images
Inserted product ID: 451053681282423478 with 1 images
Inserted product ID: 451053681282423482 with 1 images
Inserted product ID: 451053681282423486 with 1 images
Inserted product ID: 451053681282423490 with 3 images
Inserted product ID: 451053681282423500 with 1 images
Inserted product ID: 451053681282423504 with 3 images
Inserted product ID: 451053681282423512 with 3 images
Inserted product ID: 451053681282423522 with 3 images
Inserted product ID: 451053681282423532 with 1 images
Inserted product ID: 4510536

KeyboardInterrupt: 

In [113]:
def clean_text(text):
    """Return ``text`` with every non-ASCII character removed.

    Args:
        text: Input string; ``None`` or an empty string is accepted.

    Returns:
        The string restricted to characters with code point below 128,
        or ``""`` when ``text`` is falsy.
    """
    if text:
        # Encoding to ASCII while ignoring errors drops every character
        # whose code point is 128 or above.
        return text.encode('ascii', 'ignore').decode('ascii')
    return ""

def insert_product_batch(products):
    """Insert ``products`` into ``products_collection`` and flush it.

    Does nothing when ``products`` is empty or ``None``.
    """
    if not products:
        return
    products_collection.insert(products)
    products_collection.flush()

def insert_image_batch(images):
    """Insert ``images`` into ``images_collection`` and flush it.

    Does nothing when ``images`` is empty or ``None``.
    """
    if not images:
        return
    images_collection.insert(images)
    images_collection.flush()

def process_line(line, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary):
    """Turn one JSONL product record into a product row plus its image embeddings.

    Args:
        line: One line of the product JSONL file.
        title_embeddings_metadata: DataFrame searched via its ``caption`` column.
        title_embeddings: Array of title embeddings, indexed by the looked-up index.
        image_embeddings_metadata: DataFrame searched via its ``key`` column.
        all_image_embeddings: Array of image embeddings.
        images_url_dictionary: Mapping of image URL to image key.

    Returns:
        ``(product_data, image_embeddings)`` where ``product_data`` is the dict
        to insert into the products collection and ``image_embeddings`` is a
        list of ``(embedding, url)`` tuples (possibly empty).
        ``(None, None)`` when the record is skipped: its title contains a
        double quote, or no title embedding is found for the cleaned title.
    """
    data = json.loads(line.strip())

    raw_title = data.get('title') or ''
    # Titles containing a double quote are skipped entirely.
    if '"' in raw_title:
        return None, None

    title = clean_text(raw_title)
    title_index = get_index_of_title_embedding(title_embeddings_metadata, title)
    # The lookup signals "not found" with False. A plain truthiness test would
    # also reject the valid index 0, so compare against the sentinel explicitly.
    if title_index is False or title_index is None:
        return None, None
    title_embedding = get_title_embedding(title_embeddings, title_index)

    # Missing or null attributes fall back to neutral defaults so that every
    # field of the row has a value of the expected kind.
    product_data = {
        'title_vector': title_embedding.tolist(),
        'title': title,
        'average_rating': data.get('average_rating') or 0.0,
        'features': data.get('features') or [],
        'description': data.get('description') or [],
        'categories': data.get('categories') or [],
        'price': data.get('price') or 0.0,
        'store': data.get('store') or '',
        'main_category': data.get('main_category') or '',
    }

    # Resolve each image URL to its key, then to its embedding.
    current_image_embeddings = []
    for url in extract_img_urls(data.get('images') or []):
        image_key = images_url_dictionary.get(url)
        if not image_key:
            continue

        image_index = get_index_of_image_embedding(image_embeddings_metadata, image_key)
        # Same sentinel handling as for titles: index 0 is a valid hit.
        if image_index is False or image_index is None:
            print('no image index')
            continue

        image_embedding = get_image_embedding(all_image_embeddings, image_index)
        # get_image_embedding returns the one-element placeholder [0] for an
        # out-of-range index; only keep real embeddings.
        if len(image_embedding) > 1:
            current_image_embeddings.append((image_embedding, url))

    return product_data, current_image_embeddings

def create_and_store_data(data_path, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary, batch_size=100):
    """Read the product JSONL file and store products and their images in Milvus.

    Each non-blank line is converted with ``process_line``. Records that yield
    both a product row and at least one image embedding are collected and
    written in batches of ``batch_size`` products; a final partial batch is
    written at the end.

    Args:
        data_path: Path of the JSONL product file (read as UTF-8).
        title_embeddings_metadata, title_embeddings, image_embeddings_metadata,
        all_image_embeddings, images_url_dictionary: Passed through unchanged
            to ``process_line``.
        batch_size: Number of products per insert batch (default 100).
    """
    pending = []

    # Explicit UTF-8: the platform default encoding (e.g. on Windows) would
    # otherwise be used to decode the file.
    with open(data_path, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines are not JSON records.
                continue

            product_data, current_image_embeddings = process_line(line, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary)
            if product_data and current_image_embeddings:
                pending.append((product_data, current_image_embeddings))
                if len(pending) >= batch_size:
                    _store_product_batch(pending)
                    pending = []

    # Insert any remaining data
    if pending:
        _store_product_batch(pending)


def _store_product_batch(product_batch):
    """Insert one batch of products, then their images linked via the generated product IDs."""
    products_data = [product for product, _ in product_batch]
    insert_result = products_collection.insert(products_data)
    products_collection.flush()

    # Use the primary keys returned by the insert instead of querying them
    # back by title: titles are not unique, and a failed lookup would leave
    # images without a product ID.
    product_ids = list(insert_result.primary_keys)
    if len(product_ids) != len(product_batch):
        raise RuntimeError(
            f'Inserted {len(product_batch)} products but received {len(product_ids)} primary keys'
        )

    image_rows = [
        {
            'p_id': product_id,
            'image_vector': image_embedding.tolist(),
            'image_url': image_url,
        }
        for product_id, (_, image_embeddings) in zip(product_ids, product_batch)
        for image_embedding, image_url in image_embeddings
    ]
    insert_image_batch(image_rows)

In [114]:
# Run the full ingestion: read the JSONL product file and write products and
# their images to the Milvus collections in batches.
create_and_store_data(data_path, title_embeddings_metadata, title_embeddings, image_embeddings_metadata, all_image_embeddings, images_url_dictionary)

no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image index
no image i

KeyboardInterrupt: 

In [66]:
# Inspect a single image embedding row (position 3).
all_image_embeddings[3]

array([ 6.9580e-03, -5.1514e-02, -8.0566e-02, -2.7954e-02, -3.5156e-02,
        3.5889e-02, -6.7383e-02, -5.4688e-02,  1.2988e-01,  5.2002e-02,
        1.8066e-02, -1.5747e-02, -3.8574e-02,  4.7852e-02, -6.8359e-02,
       -4.8584e-02, -2.9541e-02,  6.5430e-02, -5.9326e-02, -1.0742e-02,
        4.6631e-02,  2.0874e-02, -1.6113e-02, -1.3489e-02,  5.4932e-02,
        1.0376e-02, -1.9287e-02, -1.3367e-02, -6.2256e-02,  6.2256e-02,
        3.9062e-02, -4.0771e-02,  8.5831e-04,  1.4282e-02,  6.6895e-02,
       -1.7822e-02, -5.2002e-02,  2.1118e-02, -4.9316e-02, -9.7656e-03,
       -1.2695e-02, -4.5410e-02,  1.9531e-02,  8.4839e-03, -2.7832e-02,
        5.5176e-02, -5.4626e-03,  2.7954e-02, -6.0059e-02,  5.0354e-03,
        4.0527e-02, -7.4707e-02,  3.2715e-02,  2.9541e-02,  6.2988e-02,
       -6.4941e-02, -6.1279e-02,  5.2734e-02,  4.0039e-02,  7.0572e-04,
        2.3315e-02, -7.4158e-03,  1.9897e-02,  1.4465e-02, -2.1729e-02,
        3.1494e-02,  7.3242e-02,  4.5410e-02, -6.7871e-02, -2.78