## Fix category names (`file_name` column)

In [1]:
from pathlib import Path

import pandas as pd
from huggingface_hub import hf_hub_download, upload_file

In [2]:
# Local directory passed to hf_hub_download as its cache; it must already
# exist, otherwise the notebook stops here with a clear message.
cache_dir = Path("cache")
assert cache_dir.exists(), f"Cache directory {cache_dir} does not exist"

In [3]:
# Hugging Face dataset repository that holds the product metadata file.
REPO_ID = "apexlearningcurve/Amazon-Search-Benchmark"
# Fetch the JSONL metadata file from the dataset repo, using `cache_dir` as
# the download cache. The returned value is read as a file path below.
filepath = hf_hub_download(
    repo_id=REPO_ID,
    filename="sampled_item_metadata_1M_filtered.jsonl",
    repo_type="dataset",
    cache_dir=cache_dir,
)

In [4]:
# Parse the JSON Lines file (one record per line) into a DataFrame and show
# its size plus the first rows as a sanity check.
df_products = pd.read_json(filepath, lines=True)
print(f"Number of data rows: {len(df_products)}")
df_products.head()

Number of data rows: 1055136


Unnamed: 0,item_id,title,description,file_name
0,B0007YMVOC,"Warriors of Wrestling (The Biggest,Baddest,Mea...","The Biggest, meanest and bad wrestlers of all ...",Movies & TV
1,B004KPUHPE,Nowhere Boy,The story of former Beatle John Lennon's teen ...,Movies & TV
2,B0015OIFPC,Malos Habitos [Ntsc/region 1 & 4 Dvd. Import-l...,"La fe, el amor y la banidad son puestos a prue...",Movies & TV
3,B00KG2QONE,House of Dust,A serial killer's ghost terrorizes a group of ...,Movies & TV
4,B00AVSERBE,20 Country Love Songs Volume 2,This DVD compilation features 20 more classic ...,Movies & TV


In [5]:
# Distinct values of the `file_name` column (the product category), in order
# of first appearance, so that malformed names can be spotted by eye.
category_names = df_products["file_name"].unique().tolist()
category_names

['Movies & TV',
 'Books',
 'Electronics',
 'Gift Cards',
 'Office Products',
 'Appliances',
 'Clothing Shoes & Jewelry',
 'Handmade Products',
 'CDs & Vinyl',
 'Beauty & Personal Care',
 'Home & Kitchen',
 'Grocery & Gourmet Food',
 'Industrial & Scientific',
 'Health & Personal Care',
 'Cell Phones & Accessories',
 'Tools & Home Improvement',
 'Musical Instruments',
 'Video Games',
 'Automotive',
 'Magazine Subscriptions',
 'Health & Household',
 'Amazon Fashion',
 'Baby Products',
 'Sports & Outdoors',
 'All Beauty',
 'Pet Supplies',
 'Arts Crafts & Sewing',
 'Kindle Store',
 'Toys & Games',
 'Software',
 'Patio Lawn & Garden']

In [9]:
def fix_file_name(x):
    """Return row ``x`` with a corrupted category name repaired.

    A row whose ``file_name`` is exactly ``"H&made Products"`` gets it
    replaced by ``"Handmade Products"``. Every other row is returned as is.

    The input is never modified: when a fix is needed it is applied to a
    copy. ``DataFrame.apply`` does not support functions that mutate the
    object they are given, so writing into ``x`` directly is avoided.

    Args:
        x: A mapping-like row (e.g. a ``pd.Series`` from
            ``DataFrame.apply(..., axis=1)``) with a ``"file_name"`` key and
            a ``copy()`` method.

    Returns:
        ``x`` itself when nothing needs fixing, otherwise a corrected copy.
    """
    if x["file_name"] != "H&made Products":
        return x

    # Copy only the rows that actually change, to keep the common path cheap.
    fixed = x.copy()
    fixed["file_name"] = "Handmade Products"
    return fixed


# Vectorized form of the fix_file_name mapping: an exact-match replacement on
# the `file_name` column. This avoids a Python-level call per row (the frame
# has about a million rows) and keeps the dtypes of the other columns intact.
# `assign` returns a new frame, so `df_products` itself is left unchanged.
df_products_fixed = df_products.assign(
    file_name=df_products["file_name"].replace(
        "H&made Products", "Handmade Products"
    )
)

In [None]:
# Check the result: number of rows per category after the fix.
df_products_fixed["file_name"].value_counts()

In [12]:
# Write the corrected frame back out as JSON Lines (one record per line) in
# the working directory, under the same file name as the downloaded file.
df_products_fixed.to_json(
    "sampled_item_metadata_1M_filtered.jsonl", orient="records", lines=True
)

In [None]:
# Push the corrected file to the dataset repository.
upload_file(
    path_or_fileobj="sampled_item_metadata_1M_filtered.jsonl",
    path_in_repo="sampled_item_metadata_1M_filtered.jsonl",  # same path as the file downloaded above
    repo_id=REPO_ID,  # dataset repo defined at the top of the notebook
    repo_type="dataset",
    commit_message="Fix file_name '&'->'and'",
)

## Prompt engineering for query generation

In [1]:
import os
from pathlib import Path
from pprint import pprint

import ipywidgets as widgets
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from openai import OpenAI
from prompts import BOOKS_PROMPT, QUERY_GENERATION_PROMPT
from response_structure import ResponseStructure

In [3]:
# The API key is read from the environment; a missing OPENAI_API_KEY raises
# KeyError here rather than failing later at request time.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

#### Sample n products from a specific category

In [4]:
def sample_products_from_category(
    num_samples: int,
    category: str,
    df: pd.DataFrame,
    random_state: "int | None" = None,
) -> pd.DataFrame:
    """Draw a random sample of products, without replacement, from one category.

    Args:
        num_samples: Number of rows requested. When the category holds fewer
            rows than this, all of its rows are returned (in random order)
            instead of raising.
        category: Value of the ``file_name`` column to select.
        df: Product table containing a ``file_name`` column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible draw. ``None`` keeps the draw unseeded.

    Returns:
        The sampled rows of ``df``, keeping their original index labels.

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")

    df_category = df[df["file_name"] == category]
    # Small categories cannot supply more rows than they contain.
    n_take = min(num_samples, len(df_category))
    if n_take == 0:
        return df_category.iloc[0:0]
    return df_category.sample(n=n_take, random_state=random_state)

In [5]:
# Local directory passed to hf_hub_download as its cache; it must already
# exist, otherwise the notebook stops here with a clear message.
cache_dir = Path("cache")
assert cache_dir.exists(), f"Cache directory {cache_dir} does not exist"

In [6]:
# Fetch the product metadata JSONL file from the Hugging Face dataset repo,
# using `cache_dir` as the download cache. The returned value is read as a
# file path below.
filepath = hf_hub_download(
    repo_id="apexlearningcurve/Amazon-Search-Benchmark",
    filename="sampled_item_metadata_1M_filtered.jsonl",
    repo_type="dataset",
    cache_dir=cache_dir,
)

In [7]:
# Parse the JSON Lines file (one record per line) into a DataFrame and show
# its size plus the first rows as a sanity check.
df_products = pd.read_json(filepath, lines=True)
print(f"Number of data rows: {len(df_products)}")
df_products.head()

Number of data rows: 1055136


Unnamed: 0,item_id,title,description,file_name
0,B0007YMVOC,"Warriors of Wrestling (The Biggest,Baddest,Mea...","The Biggest, meanest and bad wrestlers of all ...",Movies & TV
1,B004KPUHPE,Nowhere Boy,The story of former Beatle John Lennon's teen ...,Movies & TV
2,B0015OIFPC,Malos Habitos [Ntsc/region 1 & 4 Dvd. Import-l...,"La fe, el amor y la banidad son puestos a prue...",Movies & TV
3,B00KG2QONE,House of Dust,A serial killer's ghost terrorizes a group of ...,Movies & TV
4,B00AVSERBE,20 Country Love Songs Volume 2,This DVD compilation features 20 more classic ...,Movies & TV


❗❗ Note: product descriptions may be very long. Check their length distribution and consider a helper that truncates a description to a specified maximum length before it is put into the prompt.

In [8]:
def extract_text_from_product(
    product: pd.Series, max_description_chars: "int | None" = None
) -> str:
    """Build the text block that describes one product.

    The result is the title, the description and a ``product category:``
    line, separated by blank lines.

    Args:
        product: One product row (e.g. ``df.iloc[i]``) exposing the keys
            ``"title"``, ``"description"`` and ``"file_name"``.
        max_description_chars: Optional cap on the description length. When
            given, the description is cut to at most this many characters.
            ``None`` (the default) keeps the full description.

    Returns:
        The formatted product text.

    Raises:
        ValueError: If ``max_description_chars`` is negative.
    """
    if max_description_chars is not None and max_description_chars < 0:
        raise ValueError(
            f"max_description_chars must be non-negative, got {max_description_chars}"
        )

    description = product["description"]
    # A missing description (None or float NaN) would otherwise end up in the
    # text as the literal "None" / "nan"; render it as empty instead.
    if description is None or (
        isinstance(description, float) and description != description
    ):
        description = ""
    if max_description_chars is not None:
        description = str(description)[:max_description_chars]

    return f"""{product["title"]}\n\n{description}\n\nproduct category: {product["file_name"]}"""

In [9]:
# Distinct category names, used below to populate the category dropdown.
category_names = df_products["file_name"].unique().tolist()
# Rows per category, shown as the cell output. Only the last expression of a
# cell is displayed, so a bare `category_names` line before it had no effect.
df_products["file_name"].value_counts()

file_name
Home & Kitchen               170042
Clothing Shoes & Jewelry     144654
Electronics                   76918
Tools & Home Improvement      71809
Health & Household            64902
Beauty & Personal Care        64878
Kindle Store                  55726
Pet Supplies                  44988
Automotive                    41962
Patio Lawn & Garden           41814
Books                         37929
Sports & Outdoors             37628
Toys & Games                  37016
Cell Phones & Accessories     34295
Office Products               28864
Grocery & Gourmet Food        27619
Arts Crafts & Sewing          19033
Baby Products                 12265
Industrial & Scientific       12005
Video Games                    7441
Musical Instruments            6073
Appliances                     4575
Movies & TV                    4357
CDs & Vinyl                    2190
Handmade Products              1976
Amazon Fashion                 1477
Software                        856
Health & Personal 

In [88]:
# Interactive controls for the sampling step below:
#   * `category`    - dropdown over the distinct category names
#   * `num_samples` - how many products to draw (1..100, initially 30)
# Their `.value` attributes are read when the sampling cell is run.
category = widgets.Dropdown(
    options=category_names,
    value=category_names[0],
    description="category name",
    disabled=False,
)
num_samples = widgets.IntSlider(
    value=30,
    min=1,
    max=100,
    step=1,
    description="num samples",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)
display(category)
display(num_samples)

Dropdown(description='category name', options=('Movies & TV', 'Books', 'Electronics', 'Gift Cards', 'Office Pr…

IntSlider(value=30, continuous_update=False, description='num samples', min=1)

In [92]:
# Draw products using the current widget values. The draw is random, so
# re-running this cell produces a different sample.
sample = sample_products_from_category(num_samples.value, category.value, df_products)
sample

Unnamed: 0,item_id,title,description,file_name
82922,B071JBRN4K,"YMIX Macbook Pro 13"" Case Non-Retina,Folio Emb...",,Electronics
53395,B01IY5644I,Super Antenna FF1B Frequency Flag Weatherproof...,Super Antenna FF1B Frequency Flag weatherproof...,Electronics
110945,B012UE7SGW,Uniden DB9 Serial Cable For Scanner Models BC2...,Uniden DB9 Serial Cable For Scanner Models BC2...,Electronics
83226,B00A4QB4SW,"TR TY-147 140 mm, PWM fan, low noise emission.",*The TY Series Fan can be pluged into the 3 pi...,Electronics
56410,B08B3MSC5F,[2 Pack] Silicone Protective Cover for NVIDIA ...,This Remote Case for your NVIDIA Shield TV Pro...,Electronics
42439,B0B48NWZ3L,"Taken 16.5ft Charging Cable, Quick Charge Powe...",Taken quick charge 3.0 Power Adapter for Arlo ...,Electronics
87361,B005HB7ZKU,Mitsubishi WD-62628 rear projector TV lamp wit...,,Electronics
107147,B00IEEGKY4,BenQ 5J.J7L25.002 BenQ 144hz DLP Link 3D Glass...,BenQ 3D Glasses - D4. Enjoy the Fantastic New ...,Electronics
76138,0965321002,"A Small Percentage (Small Percentage, 1)","Review\n""Science fiction has gained a bold new...",Electronics
47490,B07K2ZDZVG,"Dreamvasion USB-C Extension Cable 6ft, Type C ...",,Electronics


In [93]:
# Slider that selects which sampled product to inspect. The upper bound is
# taken from the sample itself rather than from the `num_samples` slider,
# which may have been moved since `sample` was drawn; that way the index is
# always a valid position for `sample.iloc[...]`.
index = widgets.IntSlider(
    value=0,
    min=0,
    max=max(len(sample) - 1, 0),
    step=1,
    description="index",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)
display(index)

IntSlider(value=0, continuous_update=False, description='index', max=29)

In [94]:
# Render the currently selected sampled product as text and show it.
selected_product = sample.iloc[index.value]
product_text = extract_text_from_product(selected_product)
print(product_text)

YMIX Macbook Pro 13" Case Non-Retina,Folio Embroidered Shell Plastic Hard Protective Cover for Old MacBook Pro 13 Inch with CD-ROM Drive,Model A1278(A_Embroidered Floral)



product category: Electronics


In [86]:
# Ask the model to generate search queries for the selected product.
# The system message carries the query-generation instructions; the user
# message is the product text built from the row at the current slider index.
# `response_format=ResponseStructure` requests a reply parsed into that
# structure (read back via `.parsed` in the next cell).
response = client.beta.chat.completions.parse(
    # model="gpt-4o-mini",
    model="gpt-4o-2024-08-06",
    temperature=0,
    response_format=ResponseStructure,
    messages=[
        {"role": "system", "content": QUERY_GENERATION_PROMPT},
        {
            "role": "user",
            "content": extract_text_from_product(sample.iloc[index.value]),
        },
    ],
)

In [87]:
# Pretty-print the structured reply of the first choice as a plain dict.
# NOTE(review): `parsed` is presumably None when the model refuses - confirm
# against the OpenAI SDK before relying on this in a batch run.
parsed_reply = response.choices[0].message.parsed
pprint(parsed_reply.model_dump(), width=180)

{'long_query': 'Mating of Millie 1948 movie',
 'reasoning': 'This is a classic romantic comedy film from 1948, focusing on themes of love, adoption, and unexpected romance. The plot involves a woman trying to find a husband '
              'to adopt a child, with the help of a man who eventually falls in love with her. This film would appeal to those interested in vintage romantic comedies or classic '
              'cinema.',
 'short_query': 'Mating of Millie'}


In [274]:
# Move the slider on to the next sampled product.
index.value += 1