### Import Dependencies

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct, SparseVectorParams, Document, Prefetch, FusionQuery
from qdrant_client import models

import pandas as pd
import openai
import fastembed

In [2]:
qdrant_client = QdrantClient(url="http://localhost:6333")

### Create Qdrant collection for hybrid search

In [3]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-01-hybrid-search",
    vectors_config={
        "text-embedding-3-small": VectorParams(size=1536, distance=Distance.COSINE)
    },
    sparse_vectors_config={
        "bm25": SparseVectorParams(modifier=models.Modifier.IDF)
    }
)

True

In [4]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-01-hybrid-search",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

### Embedding Functions

In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [6]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(input=text_list, model=model)
        return [embedding.embedding for embedding in response.data]
    
    all_embeddings = []
    counter = 1
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        response = openai.embeddings.create(input=batch, model=model)
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        print(f"Processed {counter * batch_size} of {len(text_list)}")
        counter += 1
    
    return all_embeddings

### Process and Embed Amazon Items Data

In [7]:
df_items = pd.read_json("../../data/meta_Video_Games_2022_2023_with_category_ratings_10_sample_1000.jsonl", lines=True)

In [8]:
df_items.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together
0,Computers,Mad Catz S.T.R.I.K.E. 13 Compact Premium Mecha...,4.4,52,[96-key rollover - never miss a keystroke with...,[Save space without sacrificing performance wi...,117.21,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Mad Catz S.T.R.I.K.E. 13 Compact W...,Mad Catz,"[Video Games, PC, Accessories, Gaming Keyboards]","{'Brand': 'Mad Catz', 'Series': 'S.T.R.I.K.E. ...",B0BLVNZNWV,
1,All Electronics,PS5 Console Plate with Vent and RGB LED Light ...,4.7,59,[üî¥„ÄêVENTILATION WINDOWS„ÄëSIKEMAY especially unde...,[],59.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'PS5 Plates Installation Video', 'u...",SIKEMAY,"[Video Games, PlayStation 5, Accessories, Cont...",{'Package Dimensions': '15.51 x 10.63 x 2.52 i...,B0BM4L3J9Q,
2,Health & Personal Care,"XP-RX Gamer Supplement for Energy, Focus & End...",4.1,27,"[All the Boost, None of the Crash: Through ext...",[],29.95,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Honest Review of Ghost Gamer for F...,DR EMIL NUTRITION,"[Video Games, Legacy Systems, Nintendo Systems]","{'Brand': 'DR EMIL NUTRITION', 'Unit Count': '...",B0BZBP1LF4,
3,All Electronics,SAMINRA Replacement for Xbox One Wired Control...,3.1,12,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Xbox 360 Wired Controller Unboxing...,SAMINRA,"[Video Games, Xbox One, Accessories, Controlle...",{'Package Dimensions': '6.3 x 5.24 x 2.87 inch...,B0BCJJX6GN,
4,Cell Phones & Accessories,GAMSURFING Silicone Dust Plug Cover for Xbox S...,4.5,37,[ONLY FIT FOR XBOX SERIES X/S: The rubber dust...,[],8.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Dust Proof Cooling Fan for Xbox Se...,GAMSURFING,"[Video Games, Xbox Series X & S, Accessories, ...",{'Package Dimensions': '6.42 x 1.89 x 0.47 inc...,B0B1BYBZRP,


In [9]:
len(df_items)

1000

In [10]:
def preprocess_description(row):
    return f"{row['title']} {' '.join(row['features'])}"

In [11]:
def extract_first_large_image(row):
    return row["images"][0].get("large", "")

In [12]:
df_items["description"] = df_items.apply(preprocess_description, axis=1)
df_items["image"] = df_items.apply(extract_first_large_image, axis=1)

In [13]:
data_to_embed = df_items[["description", "image", "rating_number", "price", "average_rating", "parent_asin"]].to_dict(orient="records")

In [14]:
data_to_embed

[{'description': "Mad Catz S.T.R.I.K.E. 13 Compact Premium Mechanical Wired Gaming Keyboard with Aluminum Frame Cherry MX RED switches and RGB Lighting 96-key rollover - never miss a keystroke with full-key anti-ghosting features. Ultra-reliable Cherry MX Red mechanical switches - 50 million strike life Double-shot injection molding with aluminum faceplate creates a durable keyboard that won't wear down Compact, space-saving design, ideal for small setups or travelling to tournaments. Integrated 3-way cable management keeps your space clutter-free and included keycap puller tool makes cleaning quick and easy Included exclusive software lets you customize your settings create macros, customize lighting with up to 18 different effects and more. For Windows only, available for download Multimedia and lighting shortcut keys provide instant access to popular commands. Independent RGB LED in every key - personalize your gaming experience with up to 16.8 million customizable colors",
  'image

In [15]:
text_to_embed = [data["description"] for data in data_to_embed]

In [16]:
text_to_embed

["Mad Catz S.T.R.I.K.E. 13 Compact Premium Mechanical Wired Gaming Keyboard with Aluminum Frame Cherry MX RED switches and RGB Lighting 96-key rollover - never miss a keystroke with full-key anti-ghosting features. Ultra-reliable Cherry MX Red mechanical switches - 50 million strike life Double-shot injection molding with aluminum faceplate creates a durable keyboard that won't wear down Compact, space-saving design, ideal for small setups or travelling to tournaments. Integrated 3-way cable management keeps your space clutter-free and included keycap puller tool makes cleaning quick and easy Included exclusive software lets you customize your settings create macros, customize lighting with up to 18 different effects and more. For Windows only, available for download Multimedia and lighting shortcut keys provide instant access to popular commands. Independent RGB LED in every key - personalize your gaming experience with up to 16.8 million customizable colors",
 "PS5 Console Plate with

In [17]:
embeddings = get_embeddings_batch(text_to_embed)

Processed 100 of 1000
Processed 200 of 1000
Processed 300 of 1000
Processed 400 of 1000
Processed 500 of 1000
Processed 600 of 1000
Processed 700 of 1000
Processed 800 of 1000
Processed 900 of 1000
Processed 1000 of 1000


In [18]:
len(embeddings)

1000

In [19]:
pointstructs = []
i = 1
for embedding, data in zip(embeddings, data_to_embed):
    pointstructs.append(
        PointStruct(
            id=i,
            vector={
                "text-embedding-3-small": embedding,
                "bm25": Document(
                    text=data["description"],
                    model="qdrant/bm25"
                )
            },
            payload=data
        )
    )
    i += 1

In [20]:
pointstructs[0].vector

{'text-embedding-3-small': [0.043461479246616364,
  0.01807820424437523,
  -0.0026256439741700888,
  0.00796916801482439,
  0.010004503652453423,
  -0.010846923105418682,
  0.025383275002241135,
  0.056423675268888474,
  0.01447486225515604,
  -0.01027506124228239,
  -0.0021552417892962694,
  0.01168319396674633,
  -0.015175853855907917,
  0.0719192773103714,
  0.045921098440885544,
  -0.004565669223666191,
  -0.027646126225590706,
  0.0022305676247924566,
  -0.04169055446982384,
  0.02427644655108452,
  0.015692373737692833,
  0.02233334630727768,
  0.04584731161594391,
  0.03750919923186302,
  0.011609405279159546,
  0.03423790633678436,
  0.0009300436940975487,
  -0.07728125154972076,
  -0.004863897804170847,
  0.03364759683609009,
  -0.025112716481089592,
  -0.030474688857793808,
  0.006407309323549271,
  -0.08338110893964767,
  -0.03514796495437622,
  -0.014437967911362648,
  -0.018939072266221046,
  -0.03684510290622711,
  -0.005076040048152208,
  0.010717793367803097,
  0.010084

In [21]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search",
    points=pointstructs[0:500],
    wait=True
)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]



UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-01-hybrid-search",
    points=pointstructs[500:],
    wait=True
)

UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

### Hybrid Retrieval

In [23]:
def retrieve_data(query, qdrant_client, k=5):

    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-01-hybrid-search",
        prefetch=[
            Prefetch(
                query=query_embedding,
                using="text-embedding-3-small",
                limit=20
            ),
            Prefetch(
                query=Document(
                    text=query,
                    model="qdrant/bm25"
                ),
                using="bm25",
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )

    retrieved_context_ids = []
    retrieved_context = []
    similarity_scores = []
    retrieved_context_ratings = []

    for result in results.points:
        retrieved_context_ids.append(result.payload["parent_asin"])
        retrieved_context.append(result.payload["description"])
        retrieved_context_ratings.append(result.payload["average_rating"])
        similarity_scores.append(result.score)

    return {
        "retrieved_context_ids": retrieved_context_ids,
        "retrieved_context": retrieved_context,
        "retrieved_context_ratings": retrieved_context_ratings,
        "similarity_scores": similarity_scores,
    }

In [24]:
results = retrieve_data("Can I get some tablet?", qdrant_client, k=20)

In [25]:
results

{'retrieved_context_ids': ['B0B82NTNRR',
  'B0B927GGCY',
  'B0BKTKFSQW',
  'B0C33ZB3H6',
  'B0BQ7K7ZT8',
  'B0BGSHMC98',
  'B09V2RBL2B',
  'B0BZBP1LF4',
  'B0B5FZGQHH',
  'B0B97LFT8Q',
  'B09Y8F819V',
  'B09WV2NYXQ',
  'B0BT263ZT9',
  'B0B3R22F1J',
  'B0C4DLHNRD',
  'B0C74FK8R4',
  'B0BGRD9QBT',
  'B09VS5D449',
  'B09WK85391',
  'B0BL3LG221'],
 'retrieved_context': ['Steam Deck Docking Station with 4k HDMI Interface,3 USB 3.0& 65W Charging Port,Type-C Cable for Switch, Tablet, Monitor, Handle, Steam Deck Accessories üé™Wild Compatible: The docking station is birthed for Steam deck, while it is also compatible with Nintendo switch/Switch OLED, table PC, cell phone, and more. üëâ6 in 1 Port: Steam deck docking station with 4k HDMI Interface(Connect TV/monitor),3 USB 3.0 sockets(connecting with game peripherals such as keyboards, mouses, etc),65W Charging Port, Type-C cable for Switch, Tablet, Monitor, Handle. ‚úàFast Charging: With type-C devices charging, supports 65w power output, wh