## Importing Necessary Libraries

In [23]:
import os
import numpy as np
import pandas as pd
import time
import requests
import json
import torch
import clip
import glob
import pymilvus
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    db
)

### Setting Directories and Folders

In [60]:
# Directories holding the downloaded images and the embedding artifacts
# (all relative to the notebook's working directory).
IMAGES_URL_DIRECTORY = 'image_folder'
IMAGE_EMBEDDINGS_DIRECTORY = 'image_embeddings_complete'
TITLE_EMBEDDINGS_DIRECTORY = 'title_embeddings'

# Embedding arrays (.npy) and their metadata tables (.parquet), derived from
# the directory constants above so the two cannot drift apart.
title_embeddings_path = f'{TITLE_EMBEDDINGS_DIRECTORY}/text_emb/text_emb_0.npy'
image_embeddings_path = f'{IMAGE_EMBEDDINGS_DIRECTORY}/img_emb/img_emb_0.npy'
image_embeddings_meta_data_file = f'{IMAGE_EMBEDDINGS_DIRECTORY}/metadata/metadata_0.parquet'
title_embeddings_meta_data_file = f'{TITLE_EMBEDDINGS_DIRECTORY}/metadata/metadata_0.parquet'

# Raw string: the Windows path contains backslash sequences (\D, \A, \m) that
# are invalid escapes in a normal literal and trigger a warning on newer
# Python versions. NOTE: this is a machine-specific absolute path.
data_path = r'D:\Datasets\Amazon\Amazon Product Dataset\meta_All_Beauty.jsonl'

### Function for building the image URL → key dictionary

In [3]:
def creating_images_url_dictionary(path = IMAGES_URL_DIRECTORY):
    """Build a mapping from image URL to image key out of parquet files.

    Every ``*.parquet`` file directly inside ``path`` is read; each file must
    provide ``url`` and ``key`` columns. Keys are converted to strings. When a
    URL occurs more than once, the entry processed last replaces earlier ones.

    Args:
        path: Directory searched for ``*.parquet`` files.

    Returns:
        dict: ``{url: key}`` where every key value is a string.
    """
    url_to_key = {}

    for parquet_file in glob.glob(f'{path}/*.parquet'):
        frame = pd.read_parquet(parquet_file)

        # Normalise the key column to text before turning it into a lookup table
        frame_with_text_keys = frame.assign(key=frame['key'].astype(str))
        url_to_key.update(frame_with_text_keys.set_index('url')['key'].to_dict())

    return url_to_key

### Function for preparing the image metadata and extracting the image keys

In [16]:
def preparing_image_meta_data(image_embeddings_meta_data_file= image_embeddings_meta_data_file):
    """Load the image-embedding metadata and attach a ``key`` column.

    The key is the first run of nine consecutive digits found in each
    ``image_path`` value.

    Args:
        image_embeddings_meta_data_file: Path of the parquet metadata file.

    Returns:
        The loaded DataFrame with the additional ``key`` column.
    """
    metadata = pd.read_parquet(image_embeddings_meta_data_file)
    nine_digit_key = metadata['image_path'].str.extract(r'(\d{9})')
    metadata['key'] = nine_digit_key
    return metadata


### Functions for getting the index of image and title embeddings

In [85]:
def get_index_of_image_embedding(image_embeddings_metadata, image_key):
    """Return the index label of the first row whose ``key`` equals ``image_key``.

    Returns ``False`` when no row matches. A match at index 0 is also falsy,
    so callers should compare the result with ``is False`` rather than rely on
    truthiness.
    """
    matching_rows = image_embeddings_metadata[image_embeddings_metadata['key'] == image_key]
    if matching_rows.index.empty:
        return False
    return matching_rows.index[0]

def get_index_of_title_embedding(title_embeddings_metadata, title_key):
    """Return the index label of the first row whose ``caption`` equals ``title_key``.

    Returns ``False`` when no row matches. A match at index 0 is also falsy,
    so callers should compare the result with ``is False`` rather than rely on
    truthiness.
    """
    matching_rows = title_embeddings_metadata[title_embeddings_metadata['caption'] == title_key]
    if matching_rows.index.empty:
        return False
    return matching_rows.index[0]

### Function for extracting the urls

In [18]:
def extract_img_urls(image_array):
    """Collect one URL per image entry, preferring ``hi_res`` over ``large``.

    An entry contributes its ``hi_res`` value when that key is present and
    truthy, otherwise its ``large`` value under the same condition. Entries
    offering neither are reported on stdout and skipped.

    Args:
        image_array: Iterable of image entries (mappings of size name to URL).

    Returns:
        list: The selected URLs, in input order.
    """
    collected = []
    for entry in image_array:
        for size_key in ('hi_res', 'large'):
            if size_key in entry and entry[size_key]:
                collected.append(entry[size_key])
                break
        else:
            # Neither preferred size was usable for this entry
            print(f"Key 'hi_res' and 'large' not found in item: {entry}")
    return collected

### Functions to get the image and title embeddings

In [20]:
def get_image_embedding(image_embeddings, index):
    """Return the entry of ``image_embeddings`` stored at ``index``."""
    selected = image_embeddings[index]
    return selected

def get_title_embedding(title_embeddings, index):
    """Return the entry of ``title_embeddings`` stored at ``index``."""
    selected = title_embeddings[index]
    return selected

## Loading the Image and Title embeddings

In [100]:
# Read both precomputed embedding arrays from their .npy files.
title_embeddings = np.load(title_embeddings_path)
image_embeddings = np.load(image_embeddings_path)

In [103]:
# print(image_embeddings.shape)

(548083, 512)


In [89]:
# Metadata table for the title embeddings; its `caption` column is what
# get_index_of_title_embedding matches product titles against.
title_embeddings_metadata = pd.read_parquet(title_embeddings_meta_data_file)

In [90]:
# Load the image metadata (with the derived `key` column) and sanity-check
# its first rows and total row count.
image_embeddings_metadata = preparing_image_meta_data()
print(image_embeddings_metadata.head())
print(len(image_embeddings_metadata))

                         image_path        key
0  image_folder/00000/000000000.jpg  000000000
1  image_folder/00000/000000001.jpg  000000001
2  image_folder/00000/000000002.jpg  000000002
3  image_folder/00000/000000003.jpg  000000003
4  image_folder/00000/000000004.jpg  000000004
548083


In [91]:
# Build the URL -> image key lookup and report how many distinct URLs it holds.
images_url_dictionary = creating_images_url_dictionary()
print(len(images_url_dictionary))

478384


In [92]:
# Number of dictionary entries to preview
n = 2
# View the first n items of the dictionary
first_n_items = list(images_url_dictionary.items())[:n]
first_n_items

[('https://m.media-amazon.com/images/I/517uoA+-gzL._SL1005_.jpg', '000000016'),
 ('https://m.media-amazon.com/images/I/510BWq7O95L._SL1005_.jpg', '000000015')]

### Connecting to the Milvus Database

In [105]:
# The first positional argument of connections.connect() is the connection
# *alias*, not a database name, so this registers a connection called
# "My_Products". It is kept so anything addressing that alias keeps working.
# connections.connect() returns None, hence `client` is None.
client = connections.connect("My_Products", host="localhost", port="19530")

# The db.* helpers and Collection(...) used further down look up the "default"
# alias unless a `using=` argument is given. Register it as well so those calls
# work on a fresh kernel instead of depending on a connection left over from
# an earlier session.
connections.connect(alias="default", host="localhost", port="19530")
client

In [107]:
# Show the databases that exist on the connected Milvus server.
db.list_database()

['default', 'Products', 'My_Products']

In [106]:
# One-time setup: uncomment to create the database if it does not exist yet.
# database = db.create_database("My_Products")
# Make "My_Products" the active database for subsequent operations.
db.using_database("My_Products")

### Creating Field Schemas for the Collections

In [108]:
# Schema of the products collection: an auto-generated primary key, the
# 512-dimensional title embedding, and the descriptive product attributes.
fields = [
    FieldSchema(name="product_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=5000),
    FieldSchema(name="average_rating", dtype=DataType.FLOAT),
    FieldSchema(
        name="features",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=1000,
        max_length=10000,
    ),
    FieldSchema(
        name="description",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=500,
        max_length=50000,
    ),
    FieldSchema(
        name="categories",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=500,
        max_length=500,
    ),
    FieldSchema(name="price", dtype=DataType.FLOAT),
    FieldSchema(name="store", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="main_category", dtype=DataType.VARCHAR, max_length=500),
]

# Schema of the images collection: one row per image, linked to its product
# through `p_id`.
fields_images = [
    FieldSchema(name="image_id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # Foreign key to Products
    FieldSchema(name="p_id", dtype=DataType.INT64),
    FieldSchema(name="image_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="image_url", dtype=DataType.VARCHAR, max_length=10000),
]

### Creating the collections

In [109]:
# Wrap the product fields in a schema and bind it to the 'products' collection.
product_schema = CollectionSchema(fields, description="Products collection")
products_collection = Collection(name = 'products', schema=product_schema)

In [110]:
# Wrap the image fields in a schema and bind it to the 'images' collection.
image_schema = CollectionSchema(fields_images, description='Images Collection')
images_collection = Collection(name = 'images', schema = image_schema)

### Creating the Indices for the collections and loading them

In [111]:
# Both vector fields share one index configuration: an IVF_FLAT index with
# 128 clusters, compared by cosine similarity.
index_params = dict(
    metric_type="COSINE",
    index_type="IVF_FLAT",
    params={"nlist": 128},
)

# Index the title vectors, then load the products collection.
products_collection.create_index(field_name="title_vector", index_params=index_params)
products_collection.load()

# Index the image vectors, then load the images collection.
images_collection.create_index(field_name="image_vector", index_params=index_params)
images_collection.load()

In [113]:
def _product_row(record, title, title_embedding):
    """Build the row inserted into the products collection for one record."""
    return {
        'title_vector': title_embedding.tolist(),
        'title': title,
        # Missing or empty values fall back to neutral defaults so a sparse
        # record does not abort the whole ingestion run.
        'average_rating': record.get('average_rating') or 0.0,
        'features': record.get('features') or [],
        'description': record.get('description') or [],
        'categories': record.get('categories') or [],
        'price': record.get('price') or 0.0,
        'store': record.get('store') or '',
        'main_category': record.get('main_category') or '',
    }


def _image_rows(record, product_id):
    """Build the images-collection rows for every image of ``record`` that has a stored embedding."""
    rows = []
    for url in extract_img_urls(record.get('images') or []):
        # Map the URL to the image key recorded for it, if any
        image_key = images_url_dictionary.get(url)
        if not image_key:
            continue

        image_index = get_index_of_image_embedding(image_embeddings_metadata, image_key)
        # Compare with `is False`: index 0 is a valid match but is falsy.
        if image_index is False:
            continue

        rows.append({
            'p_id': product_id,
            'image_vector': get_image_embedding(image_embeddings, image_index).tolist(),
            'image_url': url,
        })
    return rows


def create_and_store_data():
    """Ingest the product JSONL file into the products and images collections.

    Reads ``data_path`` line by line. For every record whose title has a stored
    title embedding, one row is inserted into ``products_collection`` and one
    row per resolvable image into ``images_collection`` (linked through
    ``p_id``). Records without a usable title or title embedding are skipped.

    Relies on these notebook-level names: ``data_path``,
    ``title_embeddings``, ``title_embeddings_metadata``, ``image_embeddings``,
    ``image_embeddings_metadata``, ``images_url_dictionary``,
    ``products_collection`` and ``images_collection``.

    Returns:
        None. Progress is printed once per inserted product.
    """
    # Explicit encoding so the read does not depend on the platform default.
    with open(data_path, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            data = json.loads(line)

            title = data.get('title')
            # Titles containing a double quote are skipped (historical filter,
            # kept so the set of ingested products stays the same).
            if not isinstance(title, str) or '"' in title:
                continue

            title_index = get_index_of_title_embedding(title_embeddings_metadata, title)
            # Compare with `is False`: index 0 is a valid match but is falsy.
            if title_index is False:
                continue
            title_embedding = get_title_embedding(title_embeddings, title_index)

            # The insert result carries the auto-generated primary key, so the
            # id is exact even when several products share a title and no
            # flush-then-query round trip is needed.
            insert_result = products_collection.insert(
                [_product_row(data, title, title_embedding)]
            )
            product_id = insert_result.primary_keys[0]

            # Insert all images of this product in a single batch
            image_rows = _image_rows(data, product_id)
            if image_rows:
                images_collection.insert(image_rows)

            print(f"Inserted product ID: {product_id} with {len(image_rows)} images")

    # Flush once at the end instead of after every row.
    products_collection.flush()
    images_collection.flush()

In [114]:
# Run the ingestion; prints one line per product inserted.
create_and_store_data()

Inserted product ID: 451053681282414334 with 2 images
Inserted product ID: 451053681282414340 with 2 images
Inserted product ID: 451053681282414346 with 7 images
Inserted product ID: 451053681282414362 with 1 images
Inserted product ID: 451053681282414366 with 9 images
Inserted product ID: 451053681282414386 with 5 images
Inserted product ID: 451053681282414398 with 7 images
Inserted product ID: 451053681282414414 with 1 images
Inserted product ID: 451053681282414418 with 7 images
Inserted product ID: 451053681282414434 with 1 images
Inserted product ID: 451053681282414438 with 1 images
Inserted product ID: 451053681282414442 with 6 images
Inserted product ID: 451053681282414456 with 9 images
Inserted product ID: 451053681282414476 with 1 images
Inserted product ID: 451053681282414480 with 1 images
Inserted product ID: 451053681282414484 with 5 images
Inserted product ID: 451053681282414496 with 9 images
Inserted product ID: 451053681282414516 with 7 images
Inserted product ID: 4510536

In [64]:
# Inspect the title metadata table (captions in row order).
title_embeddings_metadata

Unnamed: 0,caption
0,"Howard LC0008 Leather Conditioner, 8-Ounce (4-..."
1,Yes to Tomatoes Detoxifying Charcoal Cleanser ...
2,Avon Big & False Lash Mascara - Brown/Black by...
3,Amybria New Silver Plated Full Crystal Inlay B...
4,Lucky 7 Shaving Starter Set - Seven Blade Shav...
...,...
112585,BELLA AURORA B-7 ANTI-TACHES 50ML
112586,8 Pieces Tie Dye Headbands Wide Boho Turban He...
112587,"Ameng Senegalese Twist Hair, 5Packs Twist Croc..."
112588,Roller Curly Hair Comb Hairstyle Massager Hair...


In [81]:
# Scratch check: a plain Python list has no .tolist() method (calling it
# raises AttributeError), so the call below is left disabled.
a = [1,2,3,4]
# a.tolist()

AttributeError: 'list' object has no attribute 'tolist'

In [82]:
# Scratch check: membership test on the URL dictionary with an unknown key.
'dfsd' in images_url_dictionary

False