In [229]:
import json
import os
from collections import defaultdict
from datetime import datetime

import pandas as pd
from datasets import DatasetDict, load_dataset
from dotenv import find_dotenv, load_dotenv
from huggingface_hub import HfApi, hf_hub_download

load_dotenv(find_dotenv())

True

### Download the dataset

In [241]:
ds_test = load_dataset("McAuley-Lab/Amazon-C4", split="test")

### Let's remove unnecessary columns

In [242]:
ds_test.column_names

['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review']

In [243]:
ds_test = ds_test.remove_columns(column_names=["user_id", "ori_rating", "ori_review"])

In [244]:
column_mapping = {"qid": "query_id", "query": "query_old"}
ds_test = ds_test.rename_columns(column_mapping=column_mapping)

In [245]:
dataset_name = "apexlearningcurve/Amazon-Search-Benchmark"

In [246]:
ds_test.push_to_hub(dataset_name, token=os.getenv("HF_TOKEN"), split=None)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/829 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/apexlearningcurve/Amazon-Search-Benchmark/commit/59aec5dcc75befe59fc0c66348063db7ff40cc29', commit_message='Upload dataset', commit_description='', oid='59aec5dcc75befe59fc0c66348063db7ff40cc29', pr_url=None, pr_revision=None, pr_num=None)

In [42]:
ds_train = load_dataset(
    "McAuley-Lab/Amazon-C4",
    data_files=["sampled_item_metadata_1M.jsonl"],
    split="train",
)

In [43]:
ds_train.column_names

['item_id', 'category', 'metadata']

Check if test examples are found in the train split

In [44]:
ds_test[0]

{'qid': 0,
 'query': "I need filters that effectively trap dust and improve the air quality in my home. It's surprising how much dust they can collect in just a few months.",
 'item_id': 'B0C5QYYHTJ'}

In [8]:
# Look the target id up once: indexing `ds_test[0]` inside the lambda would
# re-read the first test row for every train row that `filter` visits.
target_item_id = ds_test[0]["item_id"]
results = ds_train.filter(lambda x: x["item_id"] == target_item_id)
results.to_dict()

Filter:   0%|          | 0/1058417 [00:00<?, ? examples/s]

{'item_id': ['B0C5QYYHTJ'],
 'category': ['Home'],
 'metadata': ['Flintar Core 300 True HEPA Replacement Filters, Compatible with LEVOIT Core 300, Core 300S VortexAir Air Purifier, 3-in-1 H13 Grade True HEPA Filter Replacement, Core 300-RF, 2-Pack. Flintar Premium high-efficiency H13 Grade True HEPA Replacement Filter is made in Taiwan and is fully compatible with LEVOIT Core 300 and Core 300S VortexAir Air Purifier. This True HEPA Filtration System includes:   - Fine Pre-Filter: Traps larger particles in the air like dust, hairs, pet fur, lint, and more - H13 Grade True HEPA Filter: Captures 99.97% of harmful airborne particles down to 0.3 microns in size   - High-Efficiency Activated Carbon Filter: Absorbs household odors from pets, cooking, smoke, wildfire, and harmful VOC’s Using Flintar premium high-efficiency air purifier filters and replacing the filters regularly will help optimize air cleaning performance. Replace your HEPA Filter every 6 months for optimal performance. Fully 

we'll assume the rest are there as well for now

### Finding the product categories (files)

Load the c4 training dataset and save the item_ids only

In [12]:
# NOTE(review): this rebinds `ds_train` from a `datasets.Dataset` to a pandas
# DataFrame. A later cell calls `ds_train.filter(lambda ...)`, which looks like
# it expects the Dataset API — on a top-to-bottom run the dataset would have to
# be reloaded first. TODO confirm and consider a separate name for the frame.
ds_train = ds_train.to_pandas()

In [13]:
ds_train["item_id"].to_csv("./c4_item_ids.csv")

Load asin2category mapping table

In [14]:
with open("./asin2category.json", "r") as fp:
    mapping_dict = json.load(fp)

Checking the structure

In [15]:
list(mapping_dict.items())[0]

('B07R3DYMH6', 'Home and Kitchen')

In [19]:
c4_item_ids = pd.read_csv("./c4_item_ids.csv")

create a dict: "category name" : [list of item_ids]

In [20]:
# Build {category name: [item_id, ...]} from the ASIN -> category table.
# Iterate the named column instead of unpacking `.values` positionally: the
# positional form only works while the CSV holds exactly two columns (the saved
# index plus `item_id`), and binding `_` at top level shadows IPython's
# last-output variable.
c4_category2id = defaultdict(list)
for item_id in c4_item_ids["item_id"]:
    # A KeyError here means the id has no entry in `mapping_dict`.
    c4_category2id[mapping_dict[item_id]].append(item_id)

In [24]:
df = pd.DataFrame(list(c4_category2id.items()), columns=["category", "ids"])
df.head(3)

Unnamed: 0,category,ids
0,Beauty and Personal Care,"[B0778XR2QM, B07Q443QPB, B00CMGHTHC, B07NVH4C5..."
1,Clothing Shoes and Jewelry,"[B07NRD63N7, B07T589YKW, B07CZJNMYN, B01LPFSG1..."
2,Patio Lawn and Garden,"[B09655QKSN, B09DQ9BS43, B094XNG7CP, B07HFN7FV..."


In [21]:
df.to_csv("./category2item_asin_ids.csv")

### Fetch additional data for each item by category

#### Run over all categories

In [25]:
import ast
import shutil
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

In [26]:
def clear_directory(directory: Path) -> None:
    """Delete everything inside ``directory`` while keeping the directory itself.

    Does nothing when ``directory`` does not exist or is not a directory.

    Args:
        directory: Folder whose contents should be removed.
    """
    if not directory.is_dir():
        return

    for item in directory.iterdir():
        # `is_dir()` follows symlinks, but `shutil.rmtree` refuses to operate on
        # one, so a link to a directory has to be unlinked like a regular file.
        if item.is_dir() and not item.is_symlink():
            shutil.rmtree(item)
        else:
            item.unlink()

In [27]:
category2asin_ids_path = "category2item_asin_ids.csv"
output_dir = "c4-raw-meta"
cache_dir = "cache"
df = pd.read_csv(category2asin_ids_path, index_col=[0])
df.head()

Unnamed: 0,category,ids
0,Beauty and Personal Care,"['B0778XR2QM', 'B07Q443QPB', 'B00CMGHTHC', 'B0..."
1,Clothing Shoes and Jewelry,"['B07NRD63N7', 'B07T589YKW', 'B07CZJNMYN', 'B0..."
2,Patio Lawn and Garden,"['B09655QKSN', 'B09DQ9BS43', 'B094XNG7CP', 'B0..."
3,Kindle Store,"['B004Z1RFB2', 'B009RAOQ9A', 'B07K1F9RWY', 'B0..."
4,Home and Kitchen,"['B07JNQCMX7', 'B077SPV6JD', 'B09LKND673', 'B0..."


Remove already processed categories

In [29]:
def remove_processed(df, processed_dir=None):
    """Drop the categories whose parquet export already exists.

    Exports are expected to be named ``raw_meta_<Category_Name>_c4.parquet``;
    the category is recovered by stripping that prefix/suffix and turning the
    underscores back into spaces. Files that do not follow the pattern are
    ignored.

    Args:
        df: DataFrame with a ``category`` column.
        processed_dir: Directory holding the finished exports. Defaults to the
            notebook-level ``output_dir``.

    Returns:
        A new DataFrame with the rows of ``df`` whose category has no export yet.
    """
    prefix, suffix = "raw_meta_", "_c4"
    processed_dir = Path(output_dir if processed_dir is None else processed_dir)
    if not processed_dir.is_dir():
        # Nothing has been exported yet, so every category is still pending.
        return df.copy()

    processed_categories = {
        path.stem[len(prefix) : -len(suffix)].replace("_", " ")
        for path in processed_dir.glob(f"{prefix}*{suffix}.parquet")
    }
    return df[~df["category"].isin(processed_categories)]

In [None]:
print(f"Number of all files: {len(df)}")
df = remove_processed(df)
print(f"Number of unprocessed files: {len(df)}")

In [None]:
# Make sure the export folder exists before the first parquet file is written.
Path(output_dir).mkdir(parents=True, exist_ok=True)

for row in tqdm(df.itertuples(), total=len(df)):
    # Hub config names use underscores: "raw_meta_<Category_Name>". A dedicated
    # name is used so the notebook-level `dataset_name` (the hub repo id) is not
    # overwritten by this loop.
    config_name = f"raw_meta_{row.category.replace(' ', '_')}"

    # `ids` is the string repr of a Python list; parse it and use a set for
    # O(1) membership checks.
    item_id_set = set(ast.literal_eval(row.ids))

    ds = load_dataset(
        path="McAuley-Lab/Amazon-Reviews-2023",
        name=config_name,
        trust_remote_code=True,
        cache_dir=cache_dir,
        split="full",
    )

    # Filter directly on `parent_asin`: a single pass, with no temporary
    # boolean column to create and drop again.
    filtered_dataset = ds.filter(
        lambda parent_asin: parent_asin in item_id_set,
        input_columns="parent_asin",
    )

    # Persist the matching rows for this category as parquet.
    df_filtered = filtered_dataset.to_pandas()
    df_filtered.to_parquet(Path(output_dir) / f"{config_name}_c4.parquet")

    # Empty the download cache before moving on to the next category.
    clear_directory(Path(cache_dir))

## Creating new Amazon C4 Benchmark structure

In [31]:
meta_data_dir = Path("./c4-raw-meta")
assert meta_data_dir.exists(), f"Dir {meta_data_dir} not found!"

In [35]:
file_paths = list(meta_data_dir.iterdir())
file_paths = [path for path in file_paths if path.suffix == ".parquet"]
print(f"Number of parquet files: {len(file_paths)}")

Number of parquet files: 31


Extracting only necessary data. For now we are only using:
- parent_asin
- title
- description
- main_category
- categories

In [33]:
columns = ["parent_asin", "title", "description", "main_category", "categories"]

Category analysis

In [145]:
file_paths[9]

PosixPath('c4-raw-meta/raw_meta_Beauty_and_Personal_Care_c4.parquet')

In [146]:
df_items = pd.read_parquet(path=file_paths[9], columns=columns)
print(f"Number of data rows: {len(df_items)}")

Number of data rows: 64878


In [147]:
df_items.sample(10)

Unnamed: 0,parent_asin,title,description,main_category,categories
13470,B07GY1RJ46,PUR Cosmetics Quick Pro 3 Piece Lip Kit,[],Premium Beauty,"[Beauty & Personal Care, Skin Care, Lip Care, ..."
10239,B0078Z2LWA,100 Perma-Sharp Straight Edge Razor Blades for...,[These razor blades are designed for a perfect...,All Beauty,"[Beauty & Personal Care, Shave & Hair Removal,..."
4282,B00QW5KDJU,Skinn Cosmetics Plasma Foundation SPF 8 Ageles...,[],All Beauty,"[Beauty & Personal Care, Makeup, Face, Foundat..."
6746,B002QTRRQI,NYX Loose Face Powder-NXLFP03 Pure Shimmer,"[The finely-milled granules, which are almost ...",All Beauty,"[Beauty & Personal Care, Makeup, Face, Powder]"
7128,B01CYZ0DV4,"Spa Gift Set for Women, Birthday Gift Set for ...",[],All Beauty,"[Beauty & Personal Care, Skin Care, Body, Sets..."
21133,B0047EPOZ6,L'Oreal Paris EverSleek Sulfate-Free Smoothing...,[Intense Smoothing Shampoo],All Beauty,"[Beauty & Personal Care, Hair Care, Shampoo & ..."
41827,B08313K1WB,BECUS 10 inches Short Bob Wigs with Flat Bangs...,[],All Beauty,"[Beauty & Personal Care, Hair Care, Hair Exten..."
2228,B099KSCDBS,FRCOLOR Nail Drill Grinding Bit Holder Box Pro...,[],All Beauty,"[Beauty & Personal Care, Foot, Hand & Nail Car..."
57589,B0C2VJTRMM,Duoffanny Soft Cream Liquid Face Blush with Ai...,[],All Beauty,"[Beauty & Personal Care, Makeup, Face, Blush]"
50807,B075CZGGCW,Segbeauty Spray Bottle 9003,[1],Tools & Home Improvement,"[Beauty & Personal Care, Tools & Accessories, ..."


Extract category from file name

In [148]:
# Replace only the standalone word "and": a bare substring replace would also
# rewrite names that merely contain it (e.g. "Handmade" -> "H&made").
category = (
    file_paths[9]
    .stem[len("raw_meta_") : -len("_c4")]
    .replace("_", " ")
    .replace(" and ", " & ")
)

In [149]:
sum(df_items["categories"].apply(lambda x: x[0] == category))

64878

In [169]:
"\n".join(df_items[df_items["parent_asin"] == "B002QTRRQI"]["description"].values[0])

'The finely-milled granules, which are almost undetectable, prevent caking and help to gently condition the skin. Ideal for setting makeup, mattifying the complexion and as the final glamour stroke for any look.'

In [172]:
df_items["description"].apply(lambda x: "\n".join(x))

0                                                         
1        SHEA SHAMPOO. Alaffia EveryDay Shea Shampoos a...
2        Set up your guest bathroom for the holiday wit...
3        NYX beauty products cover a wide variety of co...
4                                                         
                               ...                        
64873                                                     
64874                                                     
64875                                                     
64876                                                     
64877    Bombshell Cream Foundation is an amazing found...
Name: description, Length: 64878, dtype: object

Check how many rows have both an empty title and an empty description

In [54]:
# Per file, count the rows where both `title` and `description` are missing.
num_empty = {}
for path in tqdm(file_paths, total=len(file_paths)):
    # Only these two columns are inspected, so only they are read from disk.
    df_items = pd.read_parquet(path=path, columns=["title", "description"])
    both_missing = df_items["title"].isna() & df_items["description"].isna()
    num_empty[path.stem] = int(both_missing.sum())

# The previous message was cut off mid-sentence ("Number of rows that have N").
print(
    "Number of rows with both title and description missing: "
    f"{sum(num_empty.values())}"
)

  0%|          | 0/31 [00:00<?, ?it/s]

Number of rows that have 3281


In [57]:
[(key, value) for key, value in num_empty.items() if value > 0]

[('raw_meta_Movies_and_TV_c4', 3281)]

Only the Movies and TV category contains rows with empty values...

In [45]:
results = ds_train.filter(lambda x: x["item_id"] == "B0B5NR9D69")

Filter:   0%|          | 0/1058417 [00:00<?, ? examples/s]

In [47]:
results[0]

{'item_id': 'B0B5NR9D69', 'category': 'Movies', 'metadata': ' '}

In [188]:
dt = datetime.now().strftime("%Y-%b-%d_%H-%M-%S")
output_dir = Path(f"./c4-raw-meta-filtered_{dt}")
output_dir.mkdir(parents=True, exist_ok=True)

## Extracting Title + Description

In [201]:
def create_jsonl_entry(row, file_name):
    """Turn one metadata row into the record written to the JSONL export.

    Args:
        row: Mapping-like row exposing ``title``, ``description`` and
            ``parent_asin``.
        file_name: Label stored in the record's ``file_name`` field.

    Returns:
        A dict with ``item_id``, ``title``, ``description`` and ``file_name``,
        or ``None`` when both the title and the description are missing.
    """
    item_title, item_description = row["title"], row["description"]

    # Rows that carry neither a title nor a description are not exported.
    nothing_to_export = pd.isna(item_title) and pd.isna(item_description)
    if nothing_to_export:
        return None

    entry = {}
    entry["item_id"] = row["parent_asin"]
    entry["title"] = item_title
    entry["description"] = item_description
    entry["file_name"] = file_name
    return entry

In [202]:
# Columns needed for the JSONL export. Kept under its own name so the
# notebook-level `columns` list is not rebound by this cell.
export_columns = ["parent_asin", "title", "description"]
num_rows = 0
jsonl_path = output_dir / "sampled_item_metadata_1M_filtered.jsonl"

# Open the target once in write mode: re-opening it in append mode for every
# parquet file duplicated all rows whenever this cell was run a second time.
with open(jsonl_path, "w", encoding="utf-8") as f:
    for path in tqdm(file_paths, total=len(file_paths)):
        df_items = pd.read_parquet(path=path, columns=export_columns)

        # "raw_meta_Home_and_Kitchen_c4" -> "Home & Kitchen". Only the standalone
        # word "and" is replaced; a bare substring replace would also turn
        # "Handmade Products" into "H&made Products".
        file_name = (
            path.stem[len("raw_meta_") : -len("_c4")]
            .replace("_", " ")
            .replace(" and ", " & ")
        )

        # `description` holds a sequence of text fragments per item; join them
        # into one string and keep missing values as None.
        df_items["description"] = df_items["description"].apply(
            lambda x: "\n".join(x) if x is not None else None
        )

        # Rows for which `create_jsonl_entry` returns None are dropped here.
        jsonl_list = (
            df_items.apply(lambda row: create_jsonl_entry(row, file_name), axis=1)
            .dropna()
            .to_list()
        )
        num_rows += len(jsonl_list)

        for entry in jsonl_list:
            f.write(json.dumps(entry) + "\n")

  0%|          | 0/31 [00:00<?, ?it/s]

In [203]:
print(f"Number of data rows: {num_rows}")

Number of data rows: 1055136


### Push to HF

In [225]:
dataset_name = "apexlearningcurve/Amazon-Search-Benchmark"
api = HfApi()

In [226]:
api.upload_file(
    # Upload the file produced under the current `output_dir` instead of a
    # hard-coded, timestamped folder that only exists for one particular run.
    path_or_fileobj=str(output_dir / "sampled_item_metadata_1M_filtered.jsonl"),
    path_in_repo="./sampled_item_metadata_1M_filtered.jsonl",
    repo_id=dataset_name,
    repo_type="dataset",
)

sampled_item_metadata_1M_filtered.jsonl:   0%|          | 0.00/673M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/apexlearningcurve/Amazon-Search-Benchmark/commit/3d82075b407e2c61fef8a3fbcf4c297228534faf', commit_message='Upload ./sampled_item_metadata_1M_filtered.jsonl with huggingface_hub', commit_description='', oid='3d82075b407e2c61fef8a3fbcf4c297228534faf', pr_url=None, pr_revision=None, pr_num=None)

Upload raw parquet files

In [227]:
raw_files_dir = Path("./c4-raw-meta")

In [228]:
api.upload_folder(
    folder_path=raw_files_dir,
    path_in_repo="./raw_data",
    repo_id=dataset_name,
    repo_type="dataset",
)

raw_meta_Appliances_c4.parquet:   0%|          | 0.00/4.64M [00:00<?, ?B/s]

raw_meta_All_Beauty_c4.parquet:   0%|          | 0.00/400k [00:00<?, ?B/s]

raw_meta_Arts_Crafts_and_Sewing_c4.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Upload 31 LFS files:   0%|          | 0/31 [00:00<?, ?it/s]

raw_meta_Amazon_Fashion_c4.parquet:   0%|          | 0.00/640k [00:00<?, ?B/s]

raw_meta_Automotive_c4.parquet:   0%|          | 0.00/39.2M [00:00<?, ?B/s]

raw_meta_Baby_Products_c4.parquet:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

raw_meta_Beauty_and_Personal_Care_c4.parquet:   0%|          | 0.00/64.1M [00:00<?, ?B/s]

raw_meta_Books_c4.parquet:   0%|          | 0.00/66.9M [00:00<?, ?B/s]

raw_meta_CDs_and_Vinyl_c4.parquet:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

raw_meta_Cell_Phones_and_Accessories_c4.parquet:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

raw_meta_Clothing_Shoes_and_Jewelry_c4.parquet:   0%|          | 0.00/115M [00:00<?, ?B/s]

raw_meta_Electronics_c4.parquet:   0%|          | 0.00/90.5M [00:00<?, ?B/s]

raw_meta_Gift_Cards_c4.parquet:   0%|          | 0.00/122k [00:00<?, ?B/s]

raw_meta_Grocery_and_Gourmet_Food_c4.parquet:   0%|          | 0.00/21.2M [00:00<?, ?B/s]

raw_meta_Handmade_Products_c4.parquet:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

raw_meta_Health_and_Household_c4.parquet:   0%|          | 0.00/73.6M [00:00<?, ?B/s]

raw_meta_Health_and_Personal_Care_c4.parquet:   0%|          | 0.00/453k [00:00<?, ?B/s]

raw_meta_Home_and_Kitchen_c4.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

raw_meta_Industrial_and_Scientific_c4.parquet:   0%|          | 0.00/11.6M [00:00<?, ?B/s]

raw_meta_Kindle_Store_c4.parquet:   0%|          | 0.00/127M [00:00<?, ?B/s]

raw_meta_Magazine_Subscriptions_c4.parquet:   0%|          | 0.00/51.7k [00:00<?, ?B/s]

raw_meta_Movies_and_TV_c4.parquet:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

raw_meta_Musical_Instruments_c4.parquet:   0%|          | 0.00/6.44M [00:00<?, ?B/s]

raw_meta_Office_Products_c4.parquet:   0%|          | 0.00/30.8M [00:00<?, ?B/s]

raw_meta_Patio_Lawn_and_Garden_c4.parquet:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

raw_meta_Pet_Supplies_c4.parquet:   0%|          | 0.00/51.3M [00:00<?, ?B/s]

raw_meta_Software_c4.parquet:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

raw_meta_Sports_and_Outdoors_c4.parquet:   0%|          | 0.00/36.0M [00:00<?, ?B/s]

raw_meta_Tools_and_Home_Improvement_c4.parquet:   0%|          | 0.00/85.7M [00:00<?, ?B/s]

raw_meta_Toys_and_Games_c4.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

raw_meta_Video_Games_c4.parquet:   0%|          | 0.00/8.10M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/apexlearningcurve/Amazon-Search-Benchmark/commit/8907e754c1f50639465ab25b5938c37c61997e8d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='8907e754c1f50639465ab25b5938c37c61997e8d', pr_url=None, pr_revision=None, pr_num=None)

# Under Construction 🚧

## Extracting category

In [125]:
def category_checkup(value, category):
    """Tell whether the first entry of ``value`` equals ``category``.

    Args:
        value: Sequence of category labels for one item, or ``None``.
        category: Label the first entry is compared against.

    Returns:
        ``False`` when ``value`` is ``None`` or empty, otherwise the result of
        ``value[0] == category``.
    """
    # Missing or empty category lists can never match.
    if value is None or not len(value):
        return False
    return value[0] == category

In [126]:
# Print every file in which the first `categories` entry of each row matches
# the category derived from the file name.
for path in tqdm(file_paths, total=len(file_paths)):
    # Only `categories` is needed; naming it explicitly keeps this cell
    # independent of whatever the notebook-level `columns` list currently holds.
    df_items = pd.read_parquet(path=path, columns=["categories"])

    # Replace only the standalone word "and": a bare substring replace would
    # also rewrite names that merely contain it (e.g. "Handmade" -> "H&made").
    category = (
        path.stem[len("raw_meta_") : -len("_c4")]
        .replace("_", " ")
        .replace(" and ", " & ")
    )
    cat_mask = df_items["categories"].apply(lambda x: category_checkup(x, category))
    num_good_cat = int(cat_mask.sum())
    if num_good_cat == len(df_items):
        print(path)

  0%|          | 0/31 [00:00<?, ?it/s]

c4-raw-meta/raw_meta_CDs_and_Vinyl_c4.parquet
c4-raw-meta/raw_meta_Beauty_and_Personal_Care_c4.parquet
