In [1]:
import json
import os
import random
from pathlib import Path
from pprint import pprint

import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import hf_hub_download
from openai import OpenAI
from prompts import BOOKS_PROMPT, QUERY_GENERATION_PROMPT
from response_structure import ResponseStructure
from tqdm.notebook import tqdm

In [2]:
# Build the OpenAI client from the OPENAI_API_KEY environment variable. The lookup
# raises KeyError when the variable is not set, and the key itself is never written
# into the notebook.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

## Get dataset

In [3]:
# Local directory that is passed as `cache_dir` to the Hugging Face download calls
# further down. The path is relative, so it depends on the kernel's working directory.
cache_dir = Path("cache")
# Stop right away with a readable message if the directory is missing.
assert cache_dir.exists(), f"Cache directory {cache_dir} does not exist"

In [4]:
# Hub dataset repository used throughout the notebook.
REPO_ID = "apexlearningcurve/Amazon-Search-Benchmark"

# Fetch the "test" split (cached under `cache_dir`) and convert it to a DataFrame.
ds_test = load_dataset(REPO_ID, split="test", cache_dir=cache_dir)
df_queries = ds_test.to_pandas()

n_query_rows = len(df_queries)
print(f"Number of data rows: {n_query_rows}")
df_queries.head()

Downloading readme:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/20373 [00:00<?, ? examples/s]

Number of data rows: 20373


Unnamed: 0,item_id,queries_old,short_query,long_query
0,7582471,[I'm looking for a modern makeover story that'...,One in a Million,One in a Million book
1,60735880,[I need to find a unique history book that is ...,The Africa House,The Africa House by Christina Lamb
2,61730793,[I'm looking for a book with beautiful words a...,Heart and Soul book,Heart and Soul by Kadir Nelson
3,61900621,[I am looking for a book for my two-year-old w...,I'm a Big Sister,I'm a Big Sister book Joanna Cole
4,62124277,"[I'm looking for a great book, but I want to a...",Flight Behavior novel,Flight Behavior by Barbara Kingsolver


In [5]:
# Distinct item ids, in order of first appearance.
unique_item_ids = df_queries["item_id"].drop_duplicates()
item_ids = unique_item_ids.tolist()
print(f"Number of unique item ids: {len(item_ids)}")

Number of unique item ids: 20373


Transform the dataset so that each item_id appears in exactly one row

In [24]:
# Collapse the frame to one row per item_id, collecting that item's queries in a list.
# A freshly downloaded copy of the dataset can already be in that shape (a
# `queries_old` list column instead of a per-row `query_old` column), so aggregate
# only when the raw column is present. This keeps the cell safe to re-run and lets
# the notebook survive Restart & Run All.
if "query_old" in df_queries.columns:
    df_queries = (
        df_queries.groupby("item_id")
        .agg(queries_old=("query_old", list))
        .reset_index()
    )

# Keep only the id and the aggregated query list; any other column is dropped here.
df_queries = df_queries[["item_id", "queries_old"]]

df_queries.head()

Unnamed: 0,item_id,queries_old
0,7582471,[I'm looking for a modern makeover story that'...
1,60735880,[I need to find a unique history book that is ...
2,61730793,[I'm looking for a book with beautiful words a...
3,61900621,[I am looking for a book for my two-year-old w...
4,62124277,"[I'm looking for a great book, but I want to a..."


Load product text for item ids

In [25]:
# Product metadata (JSON lines) stored in the same Hub dataset repository.
PRODUCT_METADATA_FILENAME = "sampled_item_metadata_1M_filtered.jsonl"

# Download the file (or reuse the cached copy) and keep its local path.
filepath = hf_hub_download(
    repo_id=REPO_ID,
    repo_type="dataset",
    filename=PRODUCT_METADATA_FILENAME,
    cache_dir=cache_dir,
)

In [26]:
# The metadata file holds one JSON object per line.
df_products = pd.read_json(path_or_buf=filepath, lines=True)
n_product_rows = len(df_products)
print(f"Number of data rows: {n_product_rows}")
df_products.head()

Number of data rows: 1055136


Unnamed: 0,item_id,title,description,file_name
0,B0007YMVOC,"Warriors of Wrestling (The Biggest,Baddest,Mea...","The Biggest, meanest and bad wrestlers of all ...",Movies & TV
1,B004KPUHPE,Nowhere Boy,The story of former Beatle John Lennon's teen ...,Movies & TV
2,B0015OIFPC,Malos Habitos [Ntsc/region 1 & 4 Dvd. Import-l...,"La fe, el amor y la banidad son puestos a prue...",Movies & TV
3,B00KG2QONE,House of Dust,A serial killer's ghost terrorizes a group of ...,Movies & TV
4,B00AVSERBE,20 Country Love Songs Volume 2,This DVD compilation features 20 more classic ...,Movies & TV


Remove item_ids that do not exist in the product metadata

In [27]:
# Count before / after restricting to ids that have a product metadata row.
print(len(item_ids))
product_item_ids = set(df_products["item_id"].tolist())
item_ids = product_item_ids & set(item_ids)  # note: item_ids is a set from here on
print(len(item_ids))

20463
20373


In [30]:
# Keep only the rows whose item_id is also present in the product metadata.
# `.isin` is the vectorised form of the per-row membership test (the same idiom is
# used to build df_jobs), and `.copy()` makes the result an independent frame so
# that columns can be added to it later without operating on a slice.
df_queries = df_queries[df_queries["item_id"].isin(item_ids)].copy()

#### push to hub

In [36]:
# Wrap the filtered frame in a `datasets.Dataset` tagged as the "test" split so it
# can be uploaded.
# NOTE(review): df_queries was filtered with a boolean mask, so its index may no
# longer be a default RangeIndex — confirm whether an extra index column ends up in
# the dataset (see the `preserve_index` argument of `from_pandas`).
ds_queries = Dataset.from_pandas(df=df_queries, split="test")

In [None]:
# Upload the cleaned split. An empty commit message is refused when the Hub commit
# is created, so state what this upload changes.
ds_queries.push_to_hub(
    repo_id=REPO_ID,
    split="test",
    commit_message="Deduplicate item_ids and drop items missing from product metadata",
)

### Extract product text

In [38]:
def get_product_text(row) -> str:
    """Build the text that represents one product.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the string
            fields "title", "description" and "file_name".

    Returns:
        The title, the description and a "Product category:<file_name>" line,
        separated by newlines.
    """
    parts = (
        row["title"],
        row["description"],
        "Product category:" + row["file_name"],
    )
    return "\n".join(parts)

In [39]:
# Products that belong to the benchmark; copy so the new column is added to an
# independent frame rather than to a slice of df_products.
is_benchmark_item = df_products["item_id"].isin(item_ids)
df_jobs = df_products.loc[is_benchmark_item].copy()

# Row-wise product text that is later sent to the model as the user message.
df_jobs["product_text"] = df_jobs.apply(get_product_text, axis=1)
df_jobs.head()

Unnamed: 0,item_id,title,description,file_name,product_text
17,B0B8JXDS86,A Man Called Otto,A grumpy widower forms an unlikely friendship ...,Movies & TV,A Man Called Otto\nA grumpy widower forms an u...
24,B089CZYTLL,Frasier: The Complete Series,"Psychiatrist and ""Cheers"" regular Dr. Frasier ...",Movies & TV,Frasier: The Complete Series\nPsychiatrist and...
55,B09RF1PGLJ,Everything Everywhere All At Once,Academy Award winning Best Picture starring Ac...,Movies & TV,Everything Everywhere All At Once\nAcademy Awa...
110,B00111YM60,30 Days Of Night,Product Description\nJosh Hartnett (The Black ...,Movies & TV,30 Days Of Night\nProduct Description\nJosh Ha...
126,B01IWQUTY0,Deadbeat,"Kevin ""Pac"" Pacalioglu (Tyler Labine) is a laz...",Movies & TV,"Deadbeat\nKevin ""Pac"" Pacalioglu (Tyler Labine..."


## Test with Chat Completions

In [48]:
def get_queries(
    prompt: str, product_text: str, model: str = "gpt-4o-mini", temperature: float = 0
) -> "ResponseStructure":
    """Generate search queries for one product with a structured chat completion.

    Args:
        prompt: Instructions sent as the system message.
        product_text: Product text sent as the user message.
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The `parsed` attribute of the first choice's message, i.e. the reply
        parsed into `ResponseStructure` (not a plain string).
        NOTE(review): the SDK presumably leaves `parsed` as None when the model
        refuses to answer — confirm and handle if that can happen here.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        # Passing the Pydantic class makes the SDK request and parse structured output.
        response_format=ResponseStructure,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": product_text},
        ],
    )

    return response.choices[0].message.parsed

In [49]:
# Pick one product uniformly at random. `randrange(n)` draws from 0..n-1; `randint`
# has an inclusive upper bound, so `randint(0, n)` could produce the out-of-range
# position n and make `.iloc` raise IndexError.
random_index = random.randrange(len(df_products))
random_product = df_products.iloc[random_index]
product_text = get_product_text(random_product)
product_text

'Timberland Shelburne Three Piece Hardside Set, Chocolate Truffle\nPolyvinyl Chloride free, expandable, hard side three piece luggage set\nProduct category:Clothing Shoes & Jewelry'

In [50]:
# Generate queries for the sampled product, using get_queries' default model and
# temperature.
response = get_queries(prompt=QUERY_GENERATION_PROMPT, product_text=product_text)

In [51]:
# Pretty-print the parsed response as a plain dict.
pprint(response.model_dump())

{'long_query': 'Expandable three piece hardside luggage set',
 'reasoning': 'The Timberland Shelburne Three Piece Hardside Set is designed '
              'for travelers seeking durable and stylish luggage. Its hard '
              'side construction offers protection for belongings, while the '
              'expandable feature provides extra packing space. The chocolate '
              'truffle color adds a touch of elegance, making it suitable for '
              'various travel occasions.',
 'short_query': 'Hardside luggage set'}


In [52]:
# Testing on a few examples
for _, row in df_products[:5].iterrows():
    product_text = get_product_text(row)
    result = get_queries(prompt=QUERY_GENERATION_PROMPT, product_text=product_text)
    pprint(result.model_dump())
    print("-" * 80)

{'long_query': 'Greatest wrestlers DVD set collection',
 'reasoning': 'This product is a collection of wrestling matches featuring '
              'some of the most famous and formidable wrestlers, packaged in a '
              '4 DVD set. It highlights intense competition and showcases '
              'legendary figures in wrestling history, making it appealing to '
              'fans of the sport.',
 'short_query': 'Wrestling DVD collection'}
--------------------------------------------------------------------------------
{'long_query': 'Nowhere Boy movie about John Lennon',
 'reasoning': 'The product is a movie that dramatizes the early life of John '
              'Lennon, focusing on his relationships with his aunt and mother. '
              'It is a biographical film that appeals to fans of The Beatles '
              'and those interested in music history.',
 'short_query': 'Nowhere Boy'}
--------------------------------------------------------------------------------
{'long_q

## Creating Batch Tasks

In [73]:
def get_product_by_item_id(item_id: str) -> str:
    """Return the product text of the first df_products row with this item id.

    Args:
        item_id: Value to look up in the "item_id" column of df_products.

    Returns:
        The text produced by get_product_text for the first matching row.

    Raises:
        IndexError: If no row of df_products has this item id.
    """
    matches = df_products.loc[df_products["item_id"] == item_id]
    return get_product_text(matches.iloc[0])

In [58]:
from openai.lib._parsing import _completions

In [59]:
# Show the `response_format` payload derived from ResponseStructure; the same helper
# is used when building the batch request bodies.
# NOTE(review): `_completions` comes from the underscore-prefixed module
# `openai.lib._parsing`, which looks like a private API — confirm it is stable
# across the SDK versions this notebook is run with.
_completions.type_to_response_format_param(ResponseStructure)

{'type': 'json_schema',
 'json_schema': {'schema': {'properties': {'reasoning': {'title': 'Reasoning',
     'type': 'string'},
    'short_query': {'title': 'Short Query', 'type': 'string'},
    'long_query': {'title': 'Long Query', 'type': 'string'}},
   'required': ['reasoning', 'short_query', 'long_query'],
   'title': 'ResponseStructure',
   'type': 'object',
   'additionalProperties': False},
  'name': 'ResponseStructure',
  'strict': True}}

In [60]:
# Number of products; the task-building loop creates one batch request per row.
len(df_jobs)

20373

In [61]:
# Model used for every request in the batch.
BATCH_MODEL = "gpt-4o-2024-08-06"

# The structured-output schema does not depend on the row, so derive it once
# instead of once per task.
response_format = _completions.type_to_response_format_param(ResponseStructure)

tasks = []

for row in tqdm(df_jobs.itertuples(), total=len(df_jobs), desc="Generating tasks"):
    # Books get extra instructions appended to the shared system prompt.
    prompt = QUERY_GENERATION_PROMPT
    if row.file_name == "Books":
        prompt += f"\n{BOOKS_PROMPT}"

    task = {
        # The item id doubles as custom_id so each result line can be matched back.
        "custom_id": row.item_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        # Same fields a direct Chat Completions API call would take.
        "body": {
            "model": BATCH_MODEL,
            "temperature": 0,
            "response_format": response_format,
            "messages": [
                {"role": "system", "content": prompt},
                {"role": "user", "content": row.product_text},
            ],
        },
    }

    tasks.append(task)

Generating tasks:   0%|          | 0/20373 [00:00<?, ?it/s]

Save as jsonl

In [62]:
output_file = Path("artifacts/batch_tasks_queries_v5.jsonl")
# Make sure the artifacts directory exists (e.g. on a fresh checkout); open() would
# otherwise fail with FileNotFoundError.
output_file.parent.mkdir(parents=True, exist_ok=True)

# One JSON object per line, as expected for a batch input file.
with open(output_file, "w", encoding="utf-8") as f:
    for obj in tasks:
        f.write(json.dumps(obj) + "\n")

Upload file

In [63]:
# Upload the task file for batch processing. The context manager closes the local
# file handle once the upload call returns instead of leaving it open.
with open(output_file, "rb") as batch_input:
    batch_file = client.files.create(file=batch_input, purpose="batch")

In [64]:
# Display the metadata of the uploaded file as returned by the upload call.
batch_file

FileObject(id='file-8Mydv3p7Y2cVha6DiD4yHqzf', bytes=60706895, created_at=1727106157, filename='batch_tasks_queries_v5.jsonl', object='file', purpose='batch', status='processed', status_details=None)

Creating the batch job

In [65]:
# Start a batch over the uploaded task file: every line targets the chat completions
# endpoint, and the batch is given a 24h completion window.
batch_request = {
    "input_file_id": batch_file.id,
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
}
batch_job = client.batches.create(**batch_request)

Checking batch status

In [68]:
# Refresh the batch object to pick up its latest state, then report the status and
# the per-request counters. Re-run this cell to poll.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job.status, batch_job.request_counts, sep="\n")

completed
BatchRequestCounts(completed=20373, failed=0, total=20373)


Retrieving results

In [69]:
# The batch object has no output file id until results are available; fail with a
# clear message instead of passing None to the Files API.
result_file_id = batch_job.output_file_id
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the status cell until the batch has completed."
    )

# Raw bytes of the JSONL results file.
result = client.files.content(result_file_id).content

In [70]:
# Persist the raw batch output so later cells can work from disk without calling
# the API again.
output_file_results = Path("artifacts/batch_tasks_queries_results_v5.jsonl")

with output_file_results.open(mode="wb") as results_fh:
    results_fh.write(result)

Loading the results

In [71]:
# Loading data from saved file
results = []
with open(output_file_results, "r") as file:
    for line in file:
        # Parsing the JSON string into a dict and appending to the list of results
        json_object = json.loads(line.strip())
        results.append(json_object)

Reading the results

In [74]:
# Reading only the first results
for res in results[:2]:
    item_id = res["custom_id"]
    result = json.loads(res["response"]["body"]["choices"][0]["message"]["content"])
    product_text = get_product_by_item_id(item_id)
    print(f"Item ID: {item_id}")
    print(f"Product Text: {json.dumps(product_text, indent=2)}")
    print(result)
    print("-" * 80)

Item ID: B0B8JXDS86
Product Text: "A Man Called Otto\nA grumpy widower forms an unlikely friendship with his new neighbors that turns his world around.\nProduct category:Movies & TV"
{'reasoning': "The product is a movie titled 'A Man Called Otto', which is about a grumpy widower who forms an unexpected friendship with his neighbors. This film likely falls under the drama or comedy-drama genre, focusing on themes of friendship and personal transformation. When generating search queries, it's important to highlight the movie's title and its category to ensure users can find it easily.", 'short_query': 'A Man Called Otto', 'long_query': 'A Man Called Otto movie'}
--------------------------------------------------------------------------------
Item ID: B089CZYTLL
Product Text: "Frasier: The Complete Series\nPsychiatrist and \"Cheers\" regular Dr. Frasier Crane (Kelsey Grammer) left Boston for his hometown of Seattle and a job as a radio call-in show host, in this hit 1993-2004 spin-off. T

In [95]:
# Validate every batch response against ResponseStructure and index the generated
# queries by item id (the request's custom_id).
short_query_by_id = {}
long_query_by_id = {}
for res in results:
    content = res["response"]["body"]["choices"][0]["message"]["content"]
    response = ResponseStructure.model_validate(json.loads(content))
    short_query_by_id[res["custom_id"]] = response.short_query
    long_query_by_id[res["custom_id"]] = response.long_query

# One dictionary lookup per row instead of a boolean mask over the whole frame for
# every result. Items without a result get an empty string, and `assign` returns a
# new frame rather than writing into the filtered df_queries in place.
df_queries = df_queries.assign(
    short_query=df_queries["item_id"].map(short_query_by_id).fillna(""),
    long_query=df_queries["item_id"].map(long_query_by_id).fillna(""),
)

In [102]:
# Keep a local parquet copy of the final table (without the index), then wrap the
# same frame as the "test" split for upload.
generated_queries_path = "artifacts/test_with_gpt-4o_generated_queries.parquet"
df_queries.to_parquet(generated_queries_path, index=False)

ds_queries = Dataset.from_pandas(df=df_queries, split="test")

In [None]:
# Upload the dataset object built from df_queries. `push_to_hub` is a method of
# `datasets.Dataset`; a pandas DataFrame has no such method.
ds_queries.push_to_hub(repo_id=REPO_ID, split="test")