In [1]:
import json
import os
import random
from pathlib import Path
from pprint import pprint

import pandas as pd
import tiktoken
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from openai import OpenAI
from prompts import QUERY_GENERATION_PROMPT
from pydantic import BaseModel
from tqdm.notebook import tqdm

In [2]:
# Shared OpenAI client used by every API call below.
# The key is read from the environment; a missing OPENAI_API_KEY raises KeyError here.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

## Get dataset

In [28]:
# Directory passed as `cache_dir` to the Hugging Face download calls below.
# It is resolved relative to the notebook's working directory and must already exist.
cache_dir = Path("../cache")
assert cache_dir.exists(), f"Cache directory {cache_dir} does not exist"

In [29]:
# Fetch the "test" split of the benchmark (cached under `cache_dir`) and
# convert it to a DataFrame for inspection.
queries_dataset = load_dataset(
    "apexlearningcurve/Amazon-Search-Benchmark",
    split="test",
    cache_dir=cache_dir,
)
df_queries = queries_dataset.to_pandas()
print(f"Number of data rows: {len(df_queries)}")
df_queries.head()

Number of data rows: 21223


Unnamed: 0,query_id,query_old,item_id
0,0,I need filters that effectively trap dust and ...,B0C5QYYHTJ
1,1,I need to find a protein that is super healthy...,B0C7D3VLXW
2,2,I need a pillow that helps keep my nasal pillo...,B0C3QRMPVN
3,3,I need a memory stick that is excellent and ex...,B0BC13TQJQ
4,4,I want to buy something that my children will ...,B07Z86PHP8


In [30]:
# Distinct item ids referenced by the benchmark queries, as a plain list.
unique_item_ids = df_queries["item_id"].unique()
item_ids = unique_item_ids.tolist()
print(f"Number of unique item ids: {len(item_ids)}")

Number of unique item ids: 20463


Alternatively, use a fixed sample of item ids instead (running the next cell overrides `item_ids` from the cell above).

In [3]:
# Fixed, hand-picked sample of item ids.
# Rebinding `item_ids` here replaces whatever value an earlier cell assigned.
item_ids = [
    "B07VDWMTXB",
    "B001EYIMBU",
    "B005AR7VPK",
    "B081V26NPQ",
    "B0B3R94BMG",
    "B06W2LHJC1",
    "B00WQY5VDY",
    "B06XCNBR66",
    "B009XATAN6",
    "B08HN3LSCM",
    "B07S173YK5",
    "B0876K19T8",
    "B07SN11TM8",
    "B088DHXHLT",
    "B08HCP4KNN",
    "B07FLPRDFT",
    "B000008LE3",
    "B077F4CX89",
    "B00HJLAWCY",
    "B0062TZRX8",
    "B092P1RGMS",
    "B003J234YA",
    "B089LP5R4D",
    "B07N67BHQS",
    "B00BYO36WU",
    "B06XHHN4VD",
    "B07F9V78WY",
    "B00MRB1MD4",
    "B06Y5K4MZN",
    "B07MY2LZ51",
    "B083XRKP1T",
    "B08P1LPCK4",
    "B09ZTSRQ5B",
    "B08KLL8Z28",
    "B016CFIGD4",
    "B010BXYFZ6",
    "B0725RR223",
    "B07CM8RCPJ",
    "B07SB43YTP",
    "B09C4MSBYP",
    "B009PFBS5M",
    "B00CVMHALM",
    "B088CW7B1L",
    "B076616R1X",
    "B014TDFW9C",
]

Load product text for item ids

In [31]:
# Fetch the product metadata JSONL from the dataset repo, using `cache_dir` as the cache.
# `filepath` is handed to pd.read_json in the next cell.
filepath = hf_hub_download(
    repo_id="apexlearningcurve/Amazon-Search-Benchmark",
    filename="sampled_item_metadata_1M_filtered.jsonl",
    repo_type="dataset",
    cache_dir=cache_dir,
)

In [32]:
# Parse the metadata file as JSON Lines (one product record per line).
df_products = pd.read_json(filepath, lines=True)
print(f"Number of data rows: {len(df_products)}")
df_products.head()

Number of data rows: 1055136


Unnamed: 0,item_id,title,description,file_name
0,B0007YMVOC,"Warriors of Wrestling (The Biggest,Baddest,Mea...","The Biggest, meanest and bad wrestlers of all ...",Movies & TV
1,B004KPUHPE,Nowhere Boy,The story of former Beatle John Lennon's teen ...,Movies & TV
2,B0015OIFPC,Malos Habitos [Ntsc/region 1 & 4 Dvd. Import-l...,"La fe, el amor y la banidad son puestos a prue...",Movies & TV
3,B00KG2QONE,House of Dust,A serial killer's ghost terrorizes a group of ...,Movies & TV
4,B00AVSERBE,20 Country Love Songs Volume 2,This DVD compilation features 20 more classic ...,Movies & TV


Drop item ids that do not exist in the product metadata

In [33]:
# Keep only the ids that have a row in the product metadata.
# Note that `item_ids` becomes a set from here on.
catalog_item_ids = set(df_products["item_id"].tolist())
item_ids = catalog_item_ids & set(item_ids)

### Extract product text

In [34]:
def get_product_text(row) -> str:
    """Build the prompt text for one product record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the string
            fields "title", "description" and "file_name".

    Returns:
        Title, description and a "Product category:<file_name>" line,
        joined with newlines in that order.
    """
    parts = [row["title"], row["description"]]
    parts.append("Product category:" + row["file_name"])
    return "\n".join(parts)

In [35]:
# Select the products whose id is in `item_ids` and attach their prompt text.
# The explicit copy keeps the new column off `df_products`.
is_selected = df_products["item_id"].isin(item_ids)
df_jobs = df_products[is_selected].copy()
df_jobs["product_text"] = df_jobs.apply(get_product_text, axis=1)
df_jobs.head()

Unnamed: 0,item_id,title,description,file_name,product_text
17,B0B8JXDS86,A Man Called Otto,A grumpy widower forms an unlikely friendship ...,Movies & TV,A Man Called Otto\nA grumpy widower forms an u...
24,B089CZYTLL,Frasier: The Complete Series,"Psychiatrist and ""Cheers"" regular Dr. Frasier ...",Movies & TV,Frasier: The Complete Series\nPsychiatrist and...
55,B09RF1PGLJ,Everything Everywhere All At Once,Academy Award winning Best Picture starring Ac...,Movies & TV,Everything Everywhere All At Once\nAcademy Awa...
110,B00111YM60,30 Days Of Night,Product Description\nJosh Hartnett (The Black ...,Movies & TV,30 Days Of Night\nProduct Description\nJosh Ha...
126,B01IWQUTY0,Deadbeat,"Kevin ""Pac"" Pacalioglu (Tyler Labine) is a laz...",Movies & TV,"Deadbeat\nKevin ""Pac"" Pacalioglu (Tyler Labine..."


## Test with Chat Completions

In [36]:
# Schema for the structured reply requested from the model.
# It is passed as `response_format` to the chat completion calls in this notebook.
# NOTE(review): documented with comments rather than a class docstring on purpose —
# a docstring would presumably end up in the generated JSON schema; confirm before adding one.
class ResponseStructure(BaseModel):
    # Field semantics are presumably defined by QUERY_GENERATION_PROMPT — verify there.
    short_query: str
    long_query: str
    keywords: list[str]

In [12]:
def get_queries(prompt: str, model: str = "gpt-4o-mini", temperature: float = 0) -> "ResponseStructure":
    """Request structured search queries for a single prompt.

    Sends `prompt` as the only message (role "system") through the
    module-level `client`, asking the API to parse the reply into
    `ResponseStructure`.

    Args:
        prompt: Full prompt text to send.
        model: Chat model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The `parsed` attribute of the first choice's message.
        NOTE(review): presumably a `ResponseStructure` instance (callers use
        `.model_dump()` on it) — confirm whether it can be None, e.g. on a refusal.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        response_format=ResponseStructure,
        messages=[
            {"role": "system", "content": prompt},
        ],
    )

    return response.choices[0].message.parsed

In [13]:
# Pick one product uniformly at random and preview its prompt text.
# random.randint(0, n) is inclusive on both ends and can return n, which is one
# past the last valid position; randrange(n) draws from [0, n).
random_product = df_products.iloc[random.randrange(len(df_products))]
product_text = get_product_text(random_product)
product_text

"Priority Chef Tea, Coffee, Sugar Jars, Set of 3 Glass Canisters in Black Metal Overlay, Air Tight Screw Top Lids, Perfect Storage Solution\nDo you need a jar for your dry kitchen essentials? Make the kitchen counter and pantry clutter-free with this fantastic container jar collection. This wonderful product can free up space, and it looks good too! It's perfect for your cookies, beans, and candies. These gorgeous kitchen containers can do the job. Its lids are designed for an air-tight seal to ensure freshness. With its sleek design, this glass jar collection is a functional solution for your countertop or panty. Measuring 12 cm x 10 cm x 10 cm, each jar is designed with an ageless, classic look. This sets perfectly as a kitchen decor. What’s more, these jar containers are easy to clean. How convenient is that! These glass containers are available in 3 color variations: silver, white, and black. They make great gifts for weddings, housewarmings, or birthdays. You can also use them as 

In [15]:
# Fill the prompt template with the sampled product text, then query the model.
query_prompt = QUERY_GENERATION_PROMPT.format(product_text=product_text)
response = get_queries(prompt=query_prompt)

In [16]:
# Show the parsed reply as a plain dict.
pprint(response.model_dump())

{'keywords': ['Ariel costume',
              'Disney princess dress',
              'mermaid style outfit',
              'girls costume',
              'teal and lavender dress',
              'The Little Mermaid'],
 'long_query': 'buy Ariel Disney princess costume for kids',
 'short_query': 'Ariel princess costume for girls'}


In [34]:
# Smoke-test the prompt on the first five rows of the product table.
separator = "-" * 80
for _, row in df_products[:5].iterrows():
    product_text = get_product_text(row)
    sample_prompt = QUERY_GENERATION_PROMPT.format(product_text=product_text)
    result = get_queries(prompt=sample_prompt)
    pprint(result.model_dump())
    print(separator)

{'keywords': ['wrestling DVD',
              'greatest wrestlers',
              'American Wrestling Federation',
              'championship matches',
              'wrestling history',
              '4 DVD set',
              'action-packed wrestling'],
 'long_query': 'top wrestling DVDs featuring legendary wrestlers',
 'short_query': 'best wrestling DVD collection'}
--------------------------------------------------------------------------------
{'keywords': ['John Lennon',
              'Beatles',
              'teen drama',
              'Aunt Mimi',
              'mother relationship',
              'biographical film'],
 'long_query': "Drama about John Lennon's early life story",
 'short_query': 'John Lennon teen drama'}
--------------------------------------------------------------------------------
{'keywords': ['Malos Habitos',
              'DVD',
              'import',
              'Latin America',
              "women's stories",
              'food habits',
            

## Creating Batch Tasks

In [37]:
def get_product_by_item_id(item_id: str) -> str:
    """Return the prompt text of the first product whose item_id matches.

    Looks the id up in the module-level `df_products`; indexing the first
    match raises IndexError when no row has that id.
    """
    matching_rows = df_products[df_products["item_id"] == item_id]
    return get_product_text(matching_rows.iloc[0])

In [38]:
from openai.lib._parsing import _completions

In [39]:
# Preview the `response_format` payload derived from ResponseStructure.
# NOTE(review): `_completions` is imported from the underscore-prefixed
# `openai.lib._parsing` module — presumably not a stable public API; confirm on SDK upgrades.
_completions.type_to_response_format_param(ResponseStructure)

{'type': 'json_schema',
 'json_schema': {'schema': {'properties': {'short_query': {'title': 'Short Query',
     'type': 'string'},
    'long_query': {'title': 'Long Query', 'type': 'string'},
    'keywords': {'items': {'type': 'string'},
     'title': 'Keywords',
     'type': 'array'}},
   'required': ['short_query', 'long_query', 'keywords'],
   'title': 'ResponseStructure',
   'type': 'object',
   'additionalProperties': False},
  'name': 'ResponseStructure',
  'strict': True}}

In [40]:
# Build one Batch API request per product, keyed by its item id.

# Slice once so the iterator and the progress-bar total always agree.
batch_rows = df_jobs[:10]

# The schema payload depends only on ResponseStructure, so compute it once
# rather than on every iteration. The same dict object is shared by all tasks;
# that is safe here because tasks are only serialized, never mutated.
response_format = _completions.type_to_response_format_param(ResponseStructure)

tasks = []

for row in tqdm(
    batch_rows.itertuples(), total=len(batch_rows), desc="Generating tasks"
):

    task = {
        "custom_id": row.item_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # This is what you would have in your Chat Completions API call
            "model": "gpt-4o-mini",
            "temperature": 0,
            "response_format": response_format,
            "messages": [
                {
                    "role": "system",
                    "content": QUERY_GENERATION_PROMPT.format(
                        product_text=row.product_text
                    ),
                },
            ],
        },
    }

    tasks.append(task)

Generating tasks:   0%|          | 0/10 [00:00<?, ?it/s]

Save as jsonl

In [41]:
# Serialize the tasks as JSON Lines: one JSON object per line.
output_file = Path("./batch_tasks_queries_v5.jsonl")
with output_file.open("w", encoding="utf-8") as jsonl_out:
    jsonl_out.writelines(json.dumps(task_obj) + "\n" for task_obj in tasks)

Upload file

In [42]:
# Upload the task file for batch processing. Opening it in a context manager
# closes the handle once the upload call returns, instead of leaking it.
with open(output_file, "rb") as batch_input:
    batch_file = client.files.create(file=batch_input, purpose="batch")

In [43]:
# Display the uploaded file's metadata; its `id` is used to create the batch job.
batch_file

FileObject(id='file-yHPD6aREX6Hb776dhPayIbdx', bytes=52515, created_at=1724521284, filename='batch_tasks_queries_v5.jsonl', object='file', purpose='batch', status='processed', status_details=None)

Creating the batch job

In [44]:
# Submit the uploaded file as a batch job. The endpoint here matches the
# "url" field written into every task of the input file.
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

Checking batch status

In [46]:
# Refresh `batch_job` with its current state; re-run this cell to poll.
# The next step reads `output_file_id` from the refreshed object.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_xJPF3zii79gtum5wWZsX706u', completion_window='24h', created_at=1724521290, endpoint='/v1/chat/completions', input_file_id='file-yHPD6aREX6Hb776dhPayIbdx', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1724521295, error_file_id=None, errors=None, expired_at=None, expires_at=1724607690, failed_at=None, finalizing_at=1724521294, in_progress_at=1724521290, metadata=None, output_file_id='file-pHHtkdhlaJSQcqLAUECi31hn', request_counts=BatchRequestCounts(completed=10, failed=0, total=10))


Retrieving results

In [47]:
# Download the batch output file. `.content` is the raw payload, which the
# next cell writes to disk in binary mode.
result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

In [48]:
# Persist the downloaded batch output byte-for-byte.
output_file_results = Path("./batch_tasks_queries_results_v5.jsonl")
output_file_results.write_bytes(result)

Loading the results

In [49]:
# Load the saved batch output: one JSON object per non-empty line.
results = []
# The file was written from raw API bytes, so decode it explicitly as UTF-8
# (the standard JSON encoding) instead of relying on the platform default.
with open(output_file_results, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) would make json.loads raise.
            continue
        results.append(json.loads(line))

Reading the results

In [50]:
# Print every batch result next to the product text it was generated from.
for res in results:
    item_id = res["custom_id"]
    # The model's reply is a JSON string inside the chat completion body.
    # A dedicated name keeps the raw `result` bytes from the download cell
    # intact, so the save cell can still be re-run afterwards.
    generated_queries = json.loads(
        res["response"]["body"]["choices"][0]["message"]["content"]
    )
    product_text = get_product_by_item_id(item_id)
    print(f"Item ID: {item_id}")
    print(f"Product Text: {json.dumps(product_text, indent=2)}")
    print(generated_queries)
    print("-" * 80)

Item ID: B0B8JXDS86
Product Text: "A Man Called Otto\nA grumpy widower forms an unlikely friendship with his new neighbors that turns his world around.\nProduct category:Movies & TV"
{'short_query': 'A Man Called Otto movie', 'long_query': 'A Man Called Otto film about friendship', 'keywords': ['A Man Called Otto', 'comedy-drama', 'friendship', 'widower', 'new neighbors', 'heartwarming story']}
--------------------------------------------------------------------------------
Item ID: B089CZYTLL
Product Text: "Frasier: The Complete Series\nPsychiatrist and \"Cheers\" regular Dr. Frasier Crane (Kelsey Grammer) left Boston for his hometown of Seattle and a job as a radio call-in show host, in this hit 1993-2004 spin-off. Thanks to brother Niles (David Hyde Pierce), dad Martin (John Mahoney), health care worker Daphne (Jane Leeves), and radio producer Roz (Peri Gilpin), though, things were just as aggravating on the West Coast.258 episodes on 44 discs. 98 1/4 hrs. Standard; Soundtrack: Engl

In [43]:
# Token count of the raw prompt template. The template is encoded without
# `.format(...)`, so no product text is included in this number.
tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
len(tokenizer.encode(QUERY_GENERATION_PROMPT))

634

In [None]:
# Display the job table (selected products plus their `product_text` column).
df_jobs