In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct


import pandas as pd
import openai



In [25]:
# Client for the Qdrant instance reachable over HTTP on localhost, port 6333.
qdrant_client = QdrantClient(url="http://localhost:6333")


In [None]:

# Create the collection only when it does not exist yet, so that re-running this
# cell (for example after a kernel restart) does not fail on an existing collection.
if not qdrant_client.collection_exists(collection_name="Amazon-items-collection-02"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-02",
        # Vectors stored here must have 1536 dimensions and are compared by
        # cosine distance.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

In [2]:
import pandas as pd

# The metadata file is read as JSON Lines (one record per line), hence lines=True.
df_items = pd.read_json(
    path_or_buf="../data/meta_Electronics_1000.jsonl",
    lines=True,
)

# Preview the first three rows.
df_items.head(3)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,available_date
0,All Electronics,"Adjustable Metal Stand for E cho Show 15, Tilt...",4.5,111,[The VMEI adjustable metal bracket is made of ...,[],36.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small footprint, minimalist design...",VMEI,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'VMEI', 'Color': 'Black', 'Room Type...",B0B13T2GY5,,2022-05-11
1,Computers,"HP Laptop, 15.6"" HD Touchscreen, AMD Athlon Go...",4.4,205,[【High Speed RAM And Enormous Space】8GB high-b...,[PConline365 sells computers with professional...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],HP,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'HP', 'Color': 'Silver', 'Room Type'...",B08DC5KLJZ,,2022-03-24
2,Amazon Home,50 Pcs Groot Stickers for Water Bottle Waterpr...,4.7,156,[🏆Groot Stickers:Anime theme party favors for ...,[Groot Stickers Quantity: 50 Pcs/pack.Non-dupl...,5.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': '200-Piece Stickers Pack for Laptop...,Fcelery,"[Electronics, Computers & Accessories, Laptop ...","{'Brand': 'Fcelery', 'Color': 'Groot', 'Room T...",B0B4QQSRY3,,2022-06-22


In [3]:
def preprocess_data(row):
    """Build the text representation of one item.

    Args:
        row: Mapping-like object with a 'title' entry and a 'features' entry
            holding an iterable of strings.

    Returns:
        The title, a single space, and all features joined by single spaces.
    """
    joined_features = " ".join(row["features"])
    return "{} {}".format(row["title"], joined_features)

In [4]:
# Build the text for every item by applying preprocess_data row by row.
df_items["preprocessed_data"] = df_items.apply(
    preprocess_data,
    axis="columns",
)

# Preview the first two rows, now including the new column.
df_items.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,available_date,preprocessed_data
0,All Electronics,"Adjustable Metal Stand for E cho Show 15, Tilt...",4.5,111,[The VMEI adjustable metal bracket is made of ...,[],36.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small footprint, minimalist design...",VMEI,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'VMEI', 'Color': 'Black', 'Room Type...",B0B13T2GY5,,2022-05-11,"Adjustable Metal Stand for E cho Show 15, Tilt..."
1,Computers,"HP Laptop, 15.6"" HD Touchscreen, AMD Athlon Go...",4.4,205,[【High Speed RAM And Enormous Space】8GB high-b...,[PConline365 sells computers with professional...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],HP,"[Electronics, Computers & Accessories, Compute...","{'Brand': 'HP', 'Color': 'Silver', 'Room Type'...",B08DC5KLJZ,,2022-03-24,"HP Laptop, 15.6"" HD Touchscreen, AMD Athlon Go..."


In [5]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for a single text from the OpenAI embeddings endpoint.

    Args:
        text: The text to embed; it is sent as a one-element input list.
        model: Name of the embedding model to use.

    Returns:
        The `embedding` attribute of the first item in the response data.
    """
    embeddings_response = openai.embeddings.create(model=model, input=[text])
    first_item = embeddings_response.data[0]
    return first_item.embedding

In [7]:
# Draw a fixed sample of 50 items (random_state makes the draw repeatable).
df_sample = df_items.sample(n=50, random_state=25)
data_to_embed = df_sample["preprocessed_data"].tolist()

# One point per sampled text: the position in data_to_embed is the point id,
# the embedding is the vector, and the raw text is kept in the payload.
pointstructs = [
    PointStruct(
        id=chunk_index,
        vector=get_embedding(chunk_text),
        payload={"text": chunk_text},
    )
    for chunk_index, chunk_text in enumerate(data_to_embed)
]

In [10]:
# Send all points to the collection in one upsert request (with wait=True).
qdrant_client.upsert(
    points=pointstructs,
    collection_name="Amazon-items-collection-02",
    wait=True,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [12]:
import json

# JSON schema describing the structure the model is asked to return: a list of
# question records. The "reasoning" field is requested on purpose: asking the
# model for its reason was observed to improve accuracy.
output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": {"type": "string", "description": "Suggested question to answer"},
            "chunk_ids": {
                "type": "array",
                "items": {
                    "type": "integer",
                    "description": "Index of the chunk that could be used to answer the question"
                },
            },
            "answer_example": {"type": "string", "description": "Suggested answer grounded in the context."},
            "reasoning": {"type": "string", "description": "Reasoning why the question could be answered with the available chunks."},
        }
    }
}


# System prompt: describes the task and embeds the schema above.
# The chunk count is taken from data_to_embed so the prompt stays correct if the
# sample size changes.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(data_to_embed)} chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that can't be answered with the available chunks.

{json.dumps(output_schema, indent=2)}

I need to be able to parse the json output.

"""


# User prompt: the chunks themselves. Each id is the position of the text in
# data_to_embed.
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text.
{[{"id": i, "text": text} for i, text in enumerate(data_to_embed)]}
"""

print(SYSTEM_PROMPT)


I am building a RAG application. I have a collection of 50 chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have availabile.
I will provide all of the available products to you with indexes of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the indexes of the chunks that could be used to answer them.
Also, provide an example answer the thee question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions. 
Try to have a mix of questions that could use multiple chunks and questions that could use single chunk.
Also, include 5 questions that can't be answered with the available chunks.

{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "question": {
        "type": "string",
        "description": "

In [None]:
from pprint import pprint

# Inspect the full user prompt, i.e. the rendered list of chunks.
pprint(USER_PROMPT)

In [14]:
import openai
import os

# Ask the chat model for the question set: the task description goes in the
# system message, the chunk list in the user message.
response = openai.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=USER_PROMPT),
    ],
    model="gpt-4.1",
)

# Show the raw text of the first choice.
print(response.choices[0].message.content)

```json
[
  {
    "question": "Do you have any security cameras in stock with 2K or higher resolution?",
    "chunk_ids": [0, 13],
    "answer_example": "Yes, we have the eufy security Floodlight Camera with 2K resolution and the REOLINK 4K Wired WiFi Outdoor Camera with 8MP (4K) resolution in stock.",
    "reasoning": "Chunk 0 describes a 2K camera, and chunk 13 describes a 4K (8MP) camera, both fulfilling the requirement for high-resolution security cameras."
  },
  {
    "question": "Are there any wireless earbuds with long battery life available?",
    "chunk_ids": [1, 11, 24],
    "answer_example": "Yes, we have several options: Wireless Earbuds that offer over 22 hours with the charging case (chunk 1), IKG Wireless Earbuds with up to 30 hours playtime (chunk 11), and Wireless Earbuds with a 2500mAh charging case providing more than 130 hours of music time (chunk 24).",
    "reasoning": "Chunks 1, 11, and 24 all describe wireless earbuds, each with notable battery life."
  },
  {


In [15]:
# Parse the model's reply into Python objects.
# The reply may be wrapped in a Markdown code fence (```json ... ```). Only a
# fence that encloses the whole reply is removed, so backticks that appear inside
# JSON string values are left untouched.
raw_output = response.choices[0].message.content.strip()
if raw_output.startswith("```") and raw_output.endswith("```"):
    raw_output = raw_output[3:-3]
    # Drop the optional language tag that follows the opening fence.
    if raw_output[:4].lower() == "json":
        raw_output = raw_output[4:]

# json.loads raises json.JSONDecodeError when the remaining text is not valid JSON.
json_output = json.loads(raw_output)

In [16]:
# Display the parsed question records.
json_output

[{'question': 'Do you have any security cameras in stock with 2K or higher resolution?',
  'chunk_ids': [0, 13],
  'answer_example': 'Yes, we have the eufy security Floodlight Camera with 2K resolution and the REOLINK 4K Wired WiFi Outdoor Camera with 8MP (4K) resolution in stock.',
  'reasoning': 'Chunk 0 describes a 2K camera, and chunk 13 describes a 4K (8MP) camera, both fulfilling the requirement for high-resolution security cameras.'},
 {'question': 'Are there any wireless earbuds with long battery life available?',
  'chunk_ids': [1, 11, 24],
  'answer_example': 'Yes, we have several options: Wireless Earbuds that offer over 22 hours with the charging case (chunk 1), IKG Wireless Earbuds with up to 30 hours playtime (chunk 11), and Wireless Earbuds with a 2500mAh charging case providing more than 130 hours of music time (chunk 24).',
  'reasoning': 'Chunks 1, 11, and 24 all describe wireless earbuds, each with notable battery life.'},
 {'question': 'Which iPad mini 6 cases do yo

In [23]:
from langsmith import Client
import os

# The API key is read from the LANGSMITH_API_KEY environment variable
# (None when the variable is not set).
lc_client = Client(api_key=os.environ.get("LANGSMITH_API_KEY"))


In [None]:

# Name of the LangSmith dataset that receives the generated examples.
dataset_name = "rag-evaluation-dataset"

dataset = lc_client.create_dataset(
    description="Dataset for evaluating RAG systems",
    dataset_name=dataset_name,
)


In [26]:
# Store every generated question as a LangSmith example, together with its
# reference answer, the ids of its chunks and the text of those chunks.
for item in json_output:
    chunk_ids = item["chunk_ids"]

    # Fetch all chunks of a question in one request instead of one request per id.
    # chunk_ids may be empty (e.g. for questions that cannot be answered); then
    # nothing is fetched.
    records = (
        qdrant_client.retrieve(
            collection_name="Amazon-items-collection-02",
            ids=chunk_ids,
            with_payload=True,
        )
        if chunk_ids
        else []
    )

    # Look texts up by id so the contexts follow the order of chunk_ids regardless
    # of the order in which the records are returned.
    text_by_id = {record.id: record.payload["text"] for record in records}

    # Fail with a descriptive message when a referenced chunk is not in the collection.
    missing_ids = [chunk_id for chunk_id in chunk_ids if chunk_id not in text_by_id]
    if missing_ids:
        raise IndexError(
            f"Chunk ids {missing_ids} not found in collection for question: {item['question']!r}"
        )

    lc_client.create_example(
        dataset_id=dataset.id,
        inputs={"question": item["question"]},
        outputs={
            "ground_truth": item["answer_example"],
            "context_ids": chunk_ids,
            "contexts": [text_by_id[chunk_id] for chunk_id in chunk_ids],
        },
    )

In [27]:
# Number of question records the model returned.
len(json_output)

34

In [34]:
import asyncio

# Small demo of running two coroutines concurrently with asyncio.gather.
# The printed strings are partly Polish: "Koniec" means "End".

async def foo():
    """Print a start message, sleep for 1 second, then print an end message."""
    print("Start foo")
    await asyncio.sleep(1)
    print("Koniec foo")

async def bar():
    """Print a start message, sleep for 2 seconds, then print an end message."""
    print("Start bar")
    await asyncio.sleep(2)
    print("Koniec bar")

# Top-level await works here because the notebook kernel (IPython) supports it;
# a plain Python script would need asyncio.run(...) around a coroutine instead.
# gather only returns after both coroutines have finished, so the final print
# comes last.
await asyncio.gather(foo(), bar())
print("Koniec main")

Start foo
Start bar
Koniec foo
Koniec bar
Koniec main
