In [137]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, PayloadSchemaType, Prefetch, Filter, FieldCondition, MatchText, FusionQuery, MatchAny

import pandas as pd
import openai
import json
import tiktoken

from dotenv import load_dotenv
import os

load_dotenv('../.env_api')

from rich.pretty import pprint


## Load Amazon dataset(Items)

In [20]:
df_items = pd.read_json("../data/meta_Electronics_1000.jsonl", lines=True)
df_items.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,available_date
0,All Electronics,"Adjustable Metal Stand for E cho Show 15, Tilt...",4.5,111,[The VMEI adjustable metal bracket is made of ...,[],36.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small footprint, minimalist design...",VMEI,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'VMEI', 'Color': 'Black', 'Room Type...",B0B13T2GY5,,2022-05-11


In [27]:
df_items_sample = df_items.sample(n=50, random_state=22)

len(df_items_sample)

50

### Preproc

In [28]:
def preprocess_data(row):
    return f"{row['title']} {' '.join(row['features'])}"

def extract_first_large_image(row):
    return row["images"][0].get("large", '')

df_items_sample["preprocessed_data"] = df_items_sample.apply(preprocess_data, axis=1)
df_items_sample["first_large_image"] = df_items_sample.apply(extract_first_large_image, axis=1)

df_items_sample.head(1)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,available_date,preprocessed_data,first_large_image
866,All Electronics,Cubilux Bidirectional 4-Channel 6.35mm Audio S...,4.4,432,[[Bidirectional 4 Ports Stereo Audio Switch] –...,[],22.8,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'FIFINE Ultra Low-Noise 4-Channel L...,Cubilux,"[Electronics, Home Audio, Home Theater, Receiv...","{'Brand': 'Cubilux', 'Color': 'Black, Signal S...",B09QM7C4FJ,,2022-06-15,Cubilux Bidirectional 4-Channel 6.35mm Audio S...,https://m.media-amazon.com/images/I/31ru008gzU...


## Load Amazon dataset(Reviews)

In [71]:
df_reviews = pd.read_json("../data/Electronics_1000.jsonl", lines=True)
len(df_reviews)

103462

In [72]:
df_reviews_sample = df_reviews[df_reviews['parent_asin'].isin(df_items_sample['parent_asin'])]
len(df_reviews_sample)

3452

In [73]:
df_reviews_sample.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
40,5,Good antenna replacement,"Great product, easy 2min install. 3 months fin...",[],B079KXLF57,B07HGQT28F,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,2019-07-27 04:29:39.520,0,True
42,5,Solid accent light!,[[VIDEOID:26e41479a2cd6585ebd147f2d336ed2a]] U...,[{'small_image_url': 'https://m.media-amazon.c...,B09VCXYJHK,B09Y77N1T2,AGL3NDIOMDLE2BS7WPF6GHLE77ZQ,2022-05-30 00:29:35.253,2,False


### Define preproc in Reviews

In [74]:
def preprocess_review_data(row):
    return f'{row['title']} {row['text']}'

In [75]:
def token_count(row, model='text-embedding-3-small'):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(row['preprocessed_data']))

In [76]:
df_reviews_sample['preprocessed_data'] = df_reviews_sample.apply(preprocess_review_data, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample['preprocessed_data'] = df_reviews_sample.apply(preprocess_review_data, axis=1)


In [77]:
df_reviews_sample['preprocessed_data_token_count'] = df_reviews_sample.apply(token_count, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews_sample['preprocessed_data_token_count'] = df_reviews_sample.apply(token_count, axis=1)


In [78]:
len(df_reviews_sample)

3452

In [79]:
df_reviews_sample=df_reviews_sample[df_reviews_sample['preprocessed_data_token_count']<8192]

In [80]:
len(df_reviews_sample)

3452

In [81]:
df_reviews_sample.head(2)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,preprocessed_data,preprocessed_data_token_count
40,5,Good antenna replacement,"Great product, easy 2min install. 3 months fin...",[],B079KXLF57,B07HGQT28F,AFWRGOGF4AI2IHRX7KZ2IYL63RXA,2019-07-27 04:29:39.520,0,True,"Good antenna replacement Great product, easy 2...",32
42,5,Solid accent light!,[[VIDEOID:26e41479a2cd6585ebd147f2d336ed2a]] U...,[{'small_image_url': 'https://m.media-amazon.c...,B09VCXYJHK,B09Y77N1T2,AGL3NDIOMDLE2BS7WPF6GHLE77ZQ,2022-05-30 00:29:35.253,2,False,Solid accent light! [[VIDEOID:26e41479a2cd6585...,112


### New collections

In [121]:
qdrant_client = QdrantClient(
    url=f'http://localhost:6333',
    timeout=60.0
)

In [86]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-12-items",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `Amazon-items-collection-12-items` already exists!"},"time":0.000026}'

In [87]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-12-items",
    field_name="text",
    field_schema=PayloadSchemaType.TEXT
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [89]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-12-items",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

## Create reviews collections

In [92]:
qdrant_client.create_collection(
    collection_name="Amazon-items-collection-12-reviews",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [93]:
qdrant_client.create_payload_index(
    collection_name="Amazon-items-collection-12-reviews",
    field_name="parent_asin",
    field_schema=PayloadSchemaType.KEYWORD
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

## Embedding funcs

In [94]:
import openai

def get_embedding(text, model="text-embedding-3-small"):
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    return response.data[0].embedding

In [101]:
def get_embeddings_batch(text_list, model="text-embedding-3-small", batch_size=100):
    if len(text_list) <= batch_size:
        response = openai.embeddings.create(
            input=text_list,
            model=model,
        )
        return [embedding.embedding for embedding in response.data]
    
    all_embeddings = []
    counter = 1

    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        response = openai.embeddings.create(
            input=batch,
            model=model,
        )
        all_embeddings.extend([embedding.embedding for embedding in response.data])
        counter += 1
        print(f"Processed {counter * batch_size} of {len(text_list)}")
    return all_embeddings

## Embed text and store extra values

In [102]:
data_to_embed = df_items_sample[['preprocessed_data', 'first_large_image', 'rating_number', 'price', 'average_rating', 'parent_asin']].to_dict(orient='records')

data_to_embed[0]



{'preprocessed_data': 'Cubilux Bidirectional 4-Channel 6.35mm Audio Selector, 1x TRS to 4X TRS Stereo 4-Port 1/4” Quarter Inch AUX Switch Box for Speaker, Headphones, Subwoofer, Home Audio System, Mixer, Amplifier [Bidirectional 4 Ports Stereo Audio Switch] – Cubilux 6.35mm Audio Selector enable audio switching from 4 inputs to 1 output or 1 input to 4 outputs. Plug and play without driver or extra power supply. [Knob Control] – Easily switch audio channel with a single twist. [Knob Control] – Easily switch audio channel with a single twist. [Lossless Sound Quality] – Full Aluminum alloy housing and gold-plated 1/4 sockets provide full isolation of electronic interference and ensure no-loss transmission of audio signal without static or ground noise. [Wide Compatibility] – Compatible with studio monitor, headphones, speakers, mixers, amplifiers etc. for stereo input or output. Please note that the adapter does NOT support microphone input. [Perfect for Desktop Audio System] – With anti

In [103]:
text_to_embed_items = [data['preprocessed_data'] for data in data_to_embed]

In [106]:
embedding_items = get_embeddings_batch(text_to_embed_items)

In [109]:
len(embedding_items[0])

1536

In [110]:
pointstructs = []
i=1
for embedding, data in zip(embedding_items, data_to_embed):
    pointstructs.append(PointStruct(
        id=i,
        vector=embedding,
        payload={
            "text": data['preprocessed_data'],
            "first_large_image": data['first_large_image'],
            "average_rating": data['average_rating'],
            "rating_number": data['rating_number'],
            "price": data['price'],
            "parent_asin": data['parent_asin'],
        }
    ))
    i += 1

In [111]:
qdrant_client.upsert(
    collection_name="Amazon-items-collection-12-items",
    points=pointstructs,
    wait=True
)

UpdateResult(operation_id=4, status=<UpdateStatus.COMPLETED: 'completed'>)

### REVIEWS inserting

In [113]:
data_to_embed_reviews = df_reviews_sample[['preprocessed_data', 'parent_asin']].to_dict(orient='records')

In [115]:
text_to_embed_reviews = [data['preprocessed_data'] for data in data_to_embed_reviews]
len(text_to_embed_reviews)

3452

In [116]:
embeddings_reviews = get_embeddings_batch(text_to_embed_reviews)

Processed 200 of 3452
Processed 300 of 3452
Processed 400 of 3452
Processed 500 of 3452
Processed 600 of 3452
Processed 700 of 3452
Processed 800 of 3452
Processed 900 of 3452
Processed 1000 of 3452
Processed 1100 of 3452
Processed 1200 of 3452
Processed 1300 of 3452
Processed 1400 of 3452
Processed 1500 of 3452
Processed 1600 of 3452
Processed 1700 of 3452
Processed 1800 of 3452
Processed 1900 of 3452
Processed 2000 of 3452
Processed 2100 of 3452
Processed 2200 of 3452
Processed 2300 of 3452
Processed 2400 of 3452
Processed 2500 of 3452
Processed 2600 of 3452
Processed 2700 of 3452
Processed 2800 of 3452
Processed 2900 of 3452
Processed 3000 of 3452
Processed 3100 of 3452
Processed 3200 of 3452
Processed 3300 of 3452
Processed 3400 of 3452
Processed 3500 of 3452
Processed 3600 of 3452


In [118]:
len(embeddings_reviews)

3452

In [119]:
pointstructs_reviews = []
i=1
for embedding, data in zip(embeddings_reviews, data_to_embed_reviews):
    pointstructs_reviews.append(PointStruct(
        id=i,
        vector=embedding,
        payload={
            "text": data['preprocessed_data'],
            "parent_asin": data['parent_asin'],
        }
    ))
    i += 1

In [125]:
# Insert reviews in chunks to avoid overloading the server
chunk_size = 100  # You can adjust this as needed
for start in range(0, len(pointstructs_reviews), chunk_size):
    end = start + chunk_size
    chunk = pointstructs_reviews[start:end]
    qdrant_client.upsert(
        collection_name="Amazon-items-collection-12-reviews",
        points=chunk,
        wait=True
    )
    print(f"Inserted chunk {start // chunk_size + 1} of {len(pointstructs_reviews) // chunk_size}")

Inserted chunk 1 of 34
Inserted chunk 2 of 34
Inserted chunk 3 of 34
Inserted chunk 4 of 34
Inserted chunk 5 of 34
Inserted chunk 6 of 34
Inserted chunk 7 of 34
Inserted chunk 8 of 34
Inserted chunk 9 of 34
Inserted chunk 10 of 34
Inserted chunk 11 of 34
Inserted chunk 12 of 34
Inserted chunk 13 of 34
Inserted chunk 14 of 34
Inserted chunk 15 of 34
Inserted chunk 16 of 34
Inserted chunk 17 of 34
Inserted chunk 18 of 34
Inserted chunk 19 of 34
Inserted chunk 20 of 34
Inserted chunk 21 of 34
Inserted chunk 22 of 34
Inserted chunk 23 of 34
Inserted chunk 24 of 34
Inserted chunk 25 of 34
Inserted chunk 26 of 34
Inserted chunk 27 of 34
Inserted chunk 28 of 34
Inserted chunk 29 of 34
Inserted chunk 30 of 34
Inserted chunk 31 of 34
Inserted chunk 32 of 34
Inserted chunk 33 of 34
Inserted chunk 34 of 34
Inserted chunk 35 of 34


### hybrid + RRF

In [133]:
def retrieve_data(query, k=5):
    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-12-items",
        prefetch=[
            Prefetch(
                query=query_embedding,
                limit=20,
            ),
            Prefetch(
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="text",
                            match=MatchText(
                                text=query,
                            )
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )
    return results

In [134]:
result = retrieve_data('earphones')

pprint(result.points[:2])

In [136]:
parent_asins = []
for data in result.points:
    parent_asins.append(data.payload['parent_asin'])

pprint(parent_asins)

In [144]:
def retrieve_prefiltered_reviews_data(query, parent_asins, k=5):
    query_embedding = get_embedding(query)

    results = qdrant_client.query_points(
        collection_name="Amazon-items-collection-12-reviews",
        prefetch=[
            Prefetch(
                query=query_embedding,
                filter=Filter(
                    must=[
                        FieldCondition(
                            key="parent_asin",
                            match=MatchAny(
                                any=parent_asins,
                            )
                        )
                    ]
                ),
                limit=20
            )
        ],
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )
    return results

In [145]:
reviews = retrieve_prefiltered_reviews_data('bad quality', parent_asins)

In [153]:
display(reviews.points[:6])

[ScoredPoint(id=2333, version=32, score=0.5, payload={'text': "Bad quality It worked 3 times before breaking. It's really bad quality. I returned it and bought a different brand at Best Buy.", 'parent_asin': 'B0B1HPCQQD'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=2157, version=30, score=0.33333334, payload={'text': 'bed quality of sound So bed quality of sound. Volume level so low.', 'parent_asin': 'B0B1HPCQQD'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=2419, version=33, score=0.25, payload={'text': 'Not satisfied with what I thought was to be a good product. I Gave it away. Was not what I thought it would be. Poor quality.', 'parent_asin': 'B0B1HPCQQD'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3088, version=39, score=0.2, payload={'text': 'Good quality [[VIDEOID:d0b449e32a39def216aacab932d6efb5]] Very similar to those expensive ones.good sound and appearance', 'parent_asin': 'B0CCKQF5GF'}, vector=None, shard_key=Non

## my experiments

In [107]:
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
vocab = enc._mergeable_ranks  # dict: {token_bytes: token_id}

for token_bytes, token_id in list(vocab.items())[12000:12020]:
    print(f"{token_id}: {token_bytes.decode('utf-8', errors='replace')}")

len(vocab.items())

12000:  Assembly
12001: /user
12002: NullOr
12003: textarea
12004:  ath
12005:  ([
12006:  channels
12007:  Justice
12008: choice
12009: LOBAL
12010: exec
12011: emale
12012:  elem
12013: _le
12014:  responsibility
12015:  Tw
12016: ICATION
12017:  elseif
12018:  fo
12019: asts


100256

In [51]:
import tiktoken
import pandas as pd

# Załaduj tokenizator do gpt-3.5-turbo (czyli cl100k_base)
enc = tiktoken.get_encoding("cl100k_base")

# Pobierz słownik: token_id i string tokena
token_tuples = [(token_id, token_bytes.decode("utf-8", errors="replace"))
                for token_bytes, token_id in enc._mergeable_ranks.items()]

# Stwórz DataFrame
df = pd.DataFrame(token_tuples, columns=["token_id", "token"])

# Zapisz do CSV
df.to_csv("gpt35_token_vocab.csv", index=False, encoding="utf-8")

In [100]:
for i in range(1, 1001, 100):
    print(i)

1
101
201
301
401
501
601
701
801
901
