In [3]:
import torch
import clip
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import requests
import json
import os

In [11]:
# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# clip.load returns the model together with the image transform that produces
# the input tensors it expects; both are placed/used on `device`.
model, preprocess = clip.load("ViT-B/16", device=device)

In [5]:
# Raw strings keep the Windows backslashes literal. In a normal string "\D", "\A"
# and "\m" are invalid escape sequences, which newer Python versions warn about
# and which silently corrupt the path as soon as a folder starts with e.g. "n" or "t".
DATASET_DIRECTORY = r"D:\Datasets\Amazon\Amazon Product Dataset"
# Derive the sample file from the directory so the two cannot drift apart.
sample_data_path = DATASET_DIRECTORY + r"\meta_Gift_Cards.jsonl"

In [14]:
def image_url_to_img(image_url, timeout=10):
    """Download an image over HTTP and return it as a PIL image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The PIL image opened from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on HTTP errors here rather than handing an error page to PIL,
    # which would surface as a confusing "cannot identify image file".
    response.raise_for_status()
    return Image.open(BytesIO(response.content))

## Text and Image Embedding Functions

In [15]:
def generate_text_embeddings(text):
    """Embed a single string with the CLIP text encoder.

    Args:
        text: The string to embed.

    Returns:
        A 1-D NumPy array holding the L2-normalised text embedding.
    """
    # NOTE(review): clip.tokenize is called with its default settings; very long
    # titles may exceed CLIP's context length — confirm whether truncation is wanted.
    text_tokens = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_embeddings = model.encode_text(text_tokens).float()
        # Normalise while the data is still a torch tensor: NumPy arrays have no
        # .norm(dim=..., keepdim=...) method, so converting first raised AttributeError.
        text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
    # Batch of one -> take the first (only) row and hand back a NumPy vector,
    # mirroring generate_image_embeddings.
    return text_embeddings[0].cpu().numpy()

def generate_image_embeddings(img):
    """Embed one image with the CLIP image encoder.

    Args:
        img: The image to embed; it is passed to the CLIP `preprocess` transform.

    Returns:
        A 1-D NumPy array holding the L2-normalised image embedding.
    """
    # Add a leading batch axis and move the tensor next to the model.
    batch = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model.encode_image(batch).float()
        # Scale to unit length so dot products behave as cosine similarity.
        features /= features.norm(dim=-1, keepdim=True)
    first_row = features[0]
    return first_row.cpu().numpy()


In [32]:
# Smoke test: download one product image and push it through the image encoder.
sample_image_url = 'https://m.media-amazon.com/images/I/612JNfob9nL._AC_UY218_.jpg'
img = image_url_to_img(sample_image_url)
embed = generate_image_embeddings(img)
# Debug helpers, kept disabled:
# print(embed)
# print(list(img.getdata()))

<ImagingCore object at 0x000001F3832834B0>


In [36]:
text_embeddings = []
# JSON text is UTF-8; naming the encoding keeps the read independent of the
# platform default (e.g. cp1252 on Windows), which can fail on non-ASCII titles.
with open(sample_data_path, encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # A blank (e.g. trailing) line is not a JSON document; skip it instead
        # of letting json.loads raise JSONDecodeError.
        if not line:
            continue
        data = json.loads(line)
        print(data['images'])
        # print(product['Reviews'])
        # image_embedding = generate_image_embeddings(product['Image'])
        # print(image_embedding)
        # title = product['Title']
        # title_embeddings = generate_text_embeddings(title)
        # text_embeddings.append(title_embeddings)

[{'thumb': 'https://m.media-amazon.com/images/I/41ZA96xtATL._SX38_SY50_CR,0,0,38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/41ZA96xtATL.jpg', 'variant': 'MAIN', 'hi_res': 'https://m.media-amazon.com/images/I/71cWJvVGYtL._SL1500_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/41NK1FX6uUL._SX38_SY50_CR,0,0,38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/41NK1FX6uUL.jpg', 'variant': 'PT01', 'hi_res': 'https://m.media-amazon.com/images/I/71q-qp4b3-L._SL1500_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/41Y45S0GirL._SX38_SY50_CR,0,0,38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/41Y45S0GirL.jpg', 'variant': 'PT02', 'hi_res': 'https://m.media-amazon.com/images/I/71KutAnl9gL._SL1500_.jpg'}, {'thumb': 'https://m.media-amazon.com/images/I/417MZ16DhcL._SX38_SY50_CR,0,0,38,50_.jpg', 'large': 'https://m.media-amazon.com/images/I/417MZ16DhcL.jpg', 'variant': 'PT03', 'hi_res': 'https://m.media-amazon.com/images/I/61FMUKaXfJL._SL1175_.jpg'}, {'thumb

In [None]:
# Example metadata record, written as a Python literal.
# JSON `null` is not a Python name (it raises NameError); the Python equivalent
# is `None`, which is also what json.loads produces for null.
d = {
    "main_category": "All Beauty",
    "title": "Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",
    "average_rating": 4.8,
    "rating_number": 10,
    "features": [],
    "description": [],
    "price": None,
    "images": [
        {
            "thumb": "https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg",
            "large": "https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg",
            "variant": "MAIN",
            "hi_res": None,
        },
        {
            "thumb": "https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg",
            "large": "https://m.media-amazon.com/images/I/41w2yznfuZL.jpg",
            "variant": "PT01",
            "hi_res": "https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg",
        },
    ],
    "videos": [],
    "store": "Howard Products",
    "categories": [],
    "details": {"Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds", "UPC": "617390882781"},
    "parent_asin": "B01CUPMQZE",
    "bought_together": None,
}

In [33]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    db
)

In [34]:
# Register the "default" connection alias against the local Milvus server.
# NOTE(review): connections.connect() appears to return None in pymilvus 2.x,
# so `client` may not hold a usable handle — confirm before relying on it.
client = connections.connect(alias="default", host="localhost", port="19530")

In [35]:
# Show the databases visible through the current Milvus connection.
db.list_database()

['default', 'Products']

In [47]:
# Example metadata record used as a reference for the schema fields.
# JSON `null` is not a Python name (it raises NameError); use `None` instead.
d = {
    "main_category": "All Beauty",
    "title": "Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",
    "average_rating": 4.8,
    "rating_number": 10,
    "features": [],
    "description": [],
    "price": None,
    "images": [
        {
            "thumb": "https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg",
            "large": "https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg",
            "variant": "MAIN",
            "hi_res": None,
        },
        {
            "thumb": "https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg",
            "large": "https://m.media-amazon.com/images/I/41w2yznfuZL.jpg",
            "variant": "PT01",
            "hi_res": "https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg",
        },
    ],
    "videos": [],
    "store": "Howard Products",
    "categories": [],
    "details": {"Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds", "UPC": "617390882781"},
    "parent_asin": "B01CUPMQZE",
    "bought_together": None,
}
# Defining schema
# Milvus needs a per-element `max_length` for ARRAY fields whose elements are
# VARCHAR; without it the field definition is rejected. 65535 is the VARCHAR
# ceiling, so long feature/description strings are not cut off by the schema.
ARRAY_ELEMENT_MAX_LENGTH = 65535

fields = [
    # Auto-generated primary key; rows are inserted without an explicit id.
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # dim must equal the width of the CLIP embeddings (512 assumed for ViT-B/16 — confirm).
    FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="image_vector", dtype=DataType.FLOAT_VECTOR, dim=512),
    FieldSchema(name="average_rating", dtype=DataType.FLOAT),
    FieldSchema(
        name="features",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=50,
        max_length=ARRAY_ELEMENT_MAX_LENGTH,
    ),
    FieldSchema(
        name="description",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=50,
        max_length=ARRAY_ELEMENT_MAX_LENGTH,
    ),
    FieldSchema(
        name="categories",
        dtype=DataType.ARRAY,
        element_type=DataType.VARCHAR,
        max_capacity=50,
        max_length=ARRAY_ELEMENT_MAX_LENGTH,
    ),
    # NOTE(review): source records can carry a null price; a plain FLOAT field
    # presumably needs a sentinel value or a nullable field for those rows — confirm.
    FieldSchema(name="price", dtype=DataType.FLOAT),
    FieldSchema(name="store", dtype=DataType.VARCHAR, max_length=100),
]

In [48]:
# Remove any existing 'product_collection' so it can be recreated from the
# current schema. This discards whatever data the collection held.
utility.drop_collection('product_collection')

In [49]:
# Bind the field list into a schema and create (or open) the collection with it.
collection_name = 'product_collection'
schema = CollectionSchema(fields=fields, description="Product search")
collection = Collection(collection_name, schema)

In [50]:
# Build one row per product.
# NOTE(review): `products` is not defined in any visible cell, and the keys used
# here (position, product_link, reviews, rating, sponsored) do not match the
# schema fields declared above — confirm which dataset/schema this cell targets.
entities = []
for product in products:
    title_embedding = generate_text_embeddings(product["Title"])
    # product["Image"] is a URL string, while generate_image_embeddings hands its
    # argument straight to CLIP's preprocess, so download/decode the image first.
    image_embedding = generate_image_embeddings(image_url_to_img(product["Image"]))

    # Strip thousands separators ("1,234" -> 1234) before converting to int.
    reviews_cleaned = int(product["Reviews"].replace(",", ""))
    positions_cleaned = int(product['Position'].replace(',', ''))

    entity = {
        "title_vector": title_embedding.tolist(),
        "image_vector": image_embedding.tolist(),
        "position": positions_cleaned,
        "title": product["Title"],
        "product_link": product["Product link"],
        "price": product["Price"],
        "reviews": reviews_cleaned,
        "rating": product["Rating"],
        "sponsored": product["Sponsored"]
    }
    # Keep every row: previously each iteration overwrote `entity`, so all but
    # the last product were computed and then thrown away. `entity` is still
    # left bound to the last row for code that reads it after the loop.
    entities.append(entity)
    


In [None]:
# Collection.insert returns a single MutationResult object, not a
# (status, ids) tuple, so tuple-unpacking its return value fails.
# The generated primary keys are exposed as `primary_keys`.
insert_result = collection.insert([entity])
ids = insert_result.primary_keys


In [68]:
# collection.drop_index(index_name = 'title_vector')

In [69]:
# Shared index settings for both vector fields: an IVF_FLAT index compared
# with cosine similarity, built with nlist=128.
index_params = dict(
    metric_type="COSINE",
    index_type="IVF_FLAT",
    params=dict(nlist=128),
)

In [70]:
# Index both embedding fields with the same parameters so that text and image
# searches use the same metric.
collection.create_index(
    field_name="title_vector",
    index_params=index_params,
)
collection.create_index(
    field_name="image_vector",
    index_params=index_params,
)

Status(code=0, message=)

In [71]:
# Load the collection so it can be searched.
collection.load()

In [72]:
# Query-time settings passed to collection.search: the metric has to match the
# one the index was built with; nprobe is the IVF search-width parameter.
search_params = dict(
    metric_type="COSINE",
    offset=0,
    ignore_growing=False,
    params=dict(nprobe=10),
)

In [130]:
# Query by image: use the second product's image URL as the probe.
# To query by title instead:
# sample_query = products[1]['Title']
# sample_query_embedding = generate_text_embeddings(sample_query)
sample_query = products[1]['Image']
# `sample_query` is a URL string; generate_image_embeddings passes its argument
# straight to CLIP's preprocess, so the image has to be downloaded and decoded first.
sample_query_embedding = generate_image_embeddings(image_url_to_img(sample_query))
sample_query_embedding

array([-7.11805940e-01, -4.88294363e-02,  3.30361724e-01,  2.18768850e-01,
        1.27947137e-01,  1.02498323e-01,  1.86483592e-01,  6.57398775e-02,
        1.82011008e-01,  1.76132232e-01,  3.03569198e-01, -5.66980481e-01,
       -1.45702362e-01, -6.45787418e-02,  2.16639698e-01,  1.31733939e-01,
       -6.89856231e-01, -1.97257280e-01, -1.79133296e-01,  8.63223076e-02,
       -6.48642004e-01, -2.63700396e-01,  2.93147087e-01,  4.38241363e-02,
        8.10087621e-02,  2.22875297e-01, -3.14478755e-01,  3.99146676e-01,
        4.84298170e-03,  3.14859897e-01,  1.40053779e-02, -3.39155197e-01,
        1.23245724e-01, -3.54683131e-01, -3.18638176e-01, -6.34527922e-01,
       -2.28771135e-01,  2.39075616e-01, -6.80975020e-02,  8.76162291e-01,
        1.57782972e-01,  1.91779166e-01,  2.00409859e-01,  5.16777217e-01,
        2.45569929e-01, -5.45612693e-01, -8.59006941e-02,  2.32251883e-02,
        4.35072601e-01,  4.87945974e-04,  1.95686996e-01, -2.96398550e-01,
        4.53765631e-01,  

In [131]:
# Display the raw query value (the image URL) that was embedded.
sample_query

'https://m.media-amazon.com/images/I/81sv3I05wCL._AC_UL320_.jpg'

In [132]:
# Nearest-neighbour lookup of the query embedding against the image vectors.
results = collection.search(
    # One query vector -> `results` holds one list of hits.
    data=[sample_query_embedding],
    anns_field="image_vector",
    param=search_params,
    # `offset` (inside search_params) plus `limit` must stay below 16384.
    limit=10,
    # No scalar filter.
    expr=None,
    # Scalar fields returned with every hit.
    output_fields=['title', 'price'],
    consistency_level="Strong",
)


In [84]:
# Primary keys of the hits for the first (only) query vector.
results[0].ids

[450687289225645638,
 450687289225645662,
 450687289225645648,
 450687289225645646,
 450687289225645666,
 450687289225645664,
 450687289225645674,
 450687289225645644,
 450687289225645654,
 450687289225645676]

In [85]:
# Scores for the same hits, under the COSINE metric set in search_params.
results[0].distances

[1.0000001192092896,
 0.5990191698074341,
 0.5963578224182129,
 0.5753413438796997,
 0.5619478225708008,
 0.5553554892539978,
 0.5465940237045288,
 0.538995087146759,
 0.5369709730148315,
 0.520717442035675]

In [133]:
# First hit for the first query; read back one of the requested output fields.
hit = results[0][0]
hit.entity.get('title')

'Play Purse for Little Girls, 35PCS Toddler Purse with Pretend Makeup for Toddlers, Princess Toys Includes Handbag, Phone, Wallet, Camera, Keys, Kids Purse Birthday Gift for Girls Age 3 4 5 6+'

In [134]:
# Print every hit returned for the first query vector.
for result in results[0]:
    # Alternative (disabled): print only the title of each hit.
    # print(result[0].entity.get('title'),'\n')
    print(result.entity)

id: 450687289225645640, distance: 0.9999998807907104, entity: {'title': 'Play Purse for Little Girls, 35PCS Toddler Purse with Pretend Makeup for Toddlers, Princess Toys Includes Handbag, Phone, Wallet, Camera, Keys, Kids Purse Birthday Gift for Girls Age 3 4 5 6+', 'price': '$17.99'}
id: 450687289225645650, distance: 0.6877689957618713, entity: {'title': 'Kids Smart Phone for Girls, Christmas Birthday Gifts for Girls Age 3-10 Kids Toys Cell Phone, 2.8" Touchscreen Toddler Learning Play Toy Phone with Dual Camera, Game, Music Player, 8G SD Card (Purple)', 'price': '$35.90'}
id: 450687289225645664, distance: 0.6292790174484253, entity: {'title': 'Fidget Toys, 120 Pack Fidgets Set Stocking Stuffers for Kids Party Favors Autism Sensory Toy Bulk Adults Kids Boys Girls Teens Stress Autistic ADHD Anxiety Carnival Treasure Classroom Prizes', 'price': '$14.99'}
id: 450687289225645658, distance: 0.6275117993354797, entity: {'title': 'Sloosh Bubble Lawn Mower Toddler Toys - Kids Toys Bubble Mach