In [16]:
import json
import os
from collections import defaultdict

import pandas as pd
from datasets import load_dataset
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())

True

### Download the dataset

In [2]:
ds_test = load_dataset("McAuley-Lab/Amazon-C4", split="test")

### Let's remove unnecessary columns

In [3]:
ds_test.column_names

['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review']

In [4]:
ds_test = ds_test.remove_columns(column_names=["user_id", "ori_rating", "ori_review"])

In [6]:
ds_test.push_to_hub("apexlearningcurve/c-4", token=os.getenv("HF_TOKEN"))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/apexlearningcurve/c-4/commit/f080ae5afe28d07a5b256b33852c0d4066df6f27', commit_message='Upload dataset', commit_description='', oid='f080ae5afe28d07a5b256b33852c0d4066df6f27', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
ds_train = load_dataset(
    "McAuley-Lab/Amazon-C4",
    data_files=["sampled_item_metadata_1M.jsonl"],
    split="train",
)

In [6]:
ds_train.column_names

['item_id', 'category', 'metadata']

Check whether the items referenced by the test examples are present in the item metadata (loaded above as the `train` split)

In [7]:
ds_test[0]

{'qid': 0,
 'query': "I need filters that effectively trap dust and improve the air quality in my home. It's surprising how much dust they can collect in just a few months.",
 'item_id': 'B0C5QYYHTJ'}

In [8]:
# Read the probe item id once. Indexing ds_test inside the predicate would
# re-fetch row 0 for every one of the ~1M metadata rows being scanned.
probe_item_id = ds_test[0]["item_id"]
results = ds_train.filter(lambda x: x["item_id"] == probe_item_id)
results.to_dict()

Filter:   0%|          | 0/1058417 [00:00<?, ? examples/s]

{'item_id': ['B0C5QYYHTJ'],
 'category': ['Home'],
 'metadata': ['Flintar Core 300 True HEPA Replacement Filters, Compatible with LEVOIT Core 300, Core 300S VortexAir Air Purifier, 3-in-1 H13 Grade True HEPA Filter Replacement, Core 300-RF, 2-Pack. Flintar Premium high-efficiency H13 Grade True HEPA Replacement Filter is made in Taiwan and is fully compatible with LEVOIT Core 300 and Core 300S VortexAir Air Purifier. This True HEPA Filtration System includes:   - Fine Pre-Filter: Traps larger particles in the air like dust, hairs, pet fur, lint, and more - H13 Grade True HEPA Filter: Captures 99.97% of harmful airborne particles down to 0.3 microns in size   - High-Efficiency Activated Carbon Filter: Absorbs household odors from pets, cooking, smoke, wildfire, and harmful VOC’s Using Flintar premium high-efficiency air purifier filters and replacing the filters regularly will help optimize air cleaning performance. Replace your HEPA Filter every 6 months for optimal performance. Fully 

Only the first test example was checked; for now we'll assume the remaining items are present as well.

### Finding the product categories (files)

Load the c4 training dataset and save the item_ids only

In [12]:
ds_train = ds_train.to_pandas()

In [13]:
ds_train["item_id"].to_csv("./c4_item_ids.csv")

Load asin2category mapping table

In [14]:
with open("./asin2category.json", "r") as fp:
    mapping_dict = json.load(fp)

Checking the structure

In [15]:
list(mapping_dict.items())[0]

('B07R3DYMH6', 'Home and Kitchen')

In [3]:
c4_item_ids = pd.read_csv("./c4_item_ids.csv")

Create a dict mapping each category name to the list of its item_ids: `"category name": [item_id, ...]`

In [12]:
# Group the C4 item ids by the category their ASIN maps to.
# The "item_id" column is read by name so the loop does not depend on the CSV
# having exactly two columns (saved index + item_id) in that order.
# An ASIN missing from mapping_dict still raises KeyError, as before.
c4_category2id = defaultdict(list)
for item_id in c4_item_ids["item_id"]:
    c4_category2id[mapping_dict[item_id]].append(item_id)

In [19]:
df = pd.DataFrame(list(c4_category2id.items()), columns=["category", "ids"])

In [21]:
df.to_csv("./category2item_asin_ids.csv")

### Fetch additional data for each item by category

In [24]:
df = pd.read_csv("./category2item_asin_ids.csv", index_col=[0])
df.head()

Unnamed: 0,category,ids
0,Beauty and Personal Care,"['B0778XR2QM', 'B07Q443QPB', 'B00CMGHTHC', 'B0..."
1,Clothing Shoes and Jewelry,"['B07NRD63N7', 'B07T589YKW', 'B07CZJNMYN', 'B0..."
2,Patio Lawn and Garden,"['B09655QKSN', 'B09DQ9BS43', 'B094XNG7CP', 'B0..."
3,Kindle Store,"['B004Z1RFB2', 'B009RAOQ9A', 'B07K1F9RWY', 'B0..."
4,Home and Kitchen,"['B07JNQCMX7', 'B077SPV6JD', 'B09LKND673', 'B0..."


In [41]:
cache_dir = "./cache"

#### Industrial_and_Scientific


In [46]:
# Load the "full" split of the raw_meta config for a single category,
# downloading into the local cache directory configured in `cache_dir`.
category = "Industrial_and_Scientific"
ds_Industrial_and_Scientific = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name=f"raw_meta_{category}",
    split="full",
    cache_dir=cache_dir,
    trust_remote_code=True,
)

In [47]:
ds_Industrial_and_Scientific

Dataset({
    features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
    num_rows: 427564
})

In [None]:
# Metadata fields of interest; each name appears in the feature list of the
# raw_meta dataset shown above. NOTE(review): no other cell visible in this
# notebook references `data_keys` — confirm whether it is still needed.
data_keys = [
    "parent_asin",
    "main_category",
    "categories",
    "title",
    "description",
    "features",
    "details",
    "images",
]

In [57]:
import ast


def get_item_ids_by_category(category: str, df) -> list[str]:
    """Return the item ids stored for ``category`` in a category-to-ids table.

    Args:
        category: Value to look up in the ``category`` column of ``df``.
        df: DataFrame with a ``category`` column and an ``ids`` column. Each
            ``ids`` cell is either the string repr of a Python list (what a
            CSV round trip produces) or an already-parsed list/tuple.

    Returns:
        The ids of the first row whose ``category`` equals ``category``.

    Raises:
        IndexError: If no row matches ``category``.
    """
    matches = df.loc[df["category"] == category, "ids"]
    if matches.empty:
        raise IndexError(f"category {category!r} not found in the category table")

    ids = matches.iloc[0]
    # literal_eval only accepts strings; a table that never went through CSV
    # already holds real lists, which are returned as a fresh list.
    if isinstance(ids, str):
        return ast.literal_eval(ids)
    return list(ids)

In [61]:
item_ids = get_item_ids_by_category("Industrial and Scientific", df)
item_id_set = set(item_ids)  # set gives constant-time membership checks per row

# Keep only the metadata rows whose parent_asin is one of the C4 item ids.
# One filter pass is enough; no temporary boolean column has to be added and
# removed again.
filtered_dataset = ds_Industrial_and_Scientific.filter(
    lambda x: x["parent_asin"] in item_id_set
)

# NOTE: this rebinds `df` from the category->ids table to the filtered
# metadata frame, so this cell cannot be re-run until the table is reloaded.
df = filtered_dataset.to_pandas()

Map:   0%|          | 0/427564 [00:00<?, ? examples/s]

Filter:   0%|          | 0/427564 [00:00<?, ? examples/s]

In [68]:
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./c4-raw-meta", exist_ok=True)
df.to_parquet(f"./c4-raw-meta/raw_meta_{category}_c4.parquet")

In [67]:
import shutil
from pathlib import Path


def clear_directory(directory: Path):
    """Delete everything inside ``directory`` while keeping the directory itself.

    Does nothing when ``directory`` does not exist or is not a directory.

    Args:
        directory: Directory whose contents should be removed.
    """
    if not directory.is_dir():
        return
    for item in directory.iterdir():
        # A symlink pointing at a directory reports is_dir() == True, but
        # shutil.rmtree refuses to operate on symlinks; remove the link itself
        # and leave its target untouched.
        if item.is_dir() and not item.is_symlink():
            shutil.rmtree(item)
        else:
            item.unlink()

In [70]:
clear_directory(Path("./cache/"))

#### Run over all categories

In [78]:
import ast
import shutil
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

In [79]:
def clear_directory(directory: Path):
    """Delete everything inside ``directory`` while keeping the directory itself.

    Does nothing when ``directory`` does not exist or is not a directory.

    Args:
        directory: Directory whose contents should be removed.
    """
    if not directory.is_dir():
        return
    for item in directory.iterdir():
        # A symlink pointing at a directory reports is_dir() == True, but
        # shutil.rmtree refuses to operate on symlinks; remove the link itself
        # and leave its target untouched.
        if item.is_dir() and not item.is_symlink():
            shutil.rmtree(item)
        else:
            item.unlink()

In [80]:
category2asin_ids_path = "category2item_asin_ids.csv"
output_dir = "c4-raw-meta"
cache_dir = "cache"
df = pd.read_csv(category2asin_ids_path, index_col=[0])
df.head()

Unnamed: 0,category,ids
0,Beauty and Personal Care,"['B0778XR2QM', 'B07Q443QPB', 'B00CMGHTHC', 'B0..."
1,Clothing Shoes and Jewelry,"['B07NRD63N7', 'B07T589YKW', 'B07CZJNMYN', 'B0..."
2,Patio Lawn and Garden,"['B09655QKSN', 'B09DQ9BS43', 'B094XNG7CP', 'B0..."
3,Kindle Store,"['B004Z1RFB2', 'B009RAOQ9A', 'B07K1F9RWY', 'B0..."
4,Home and Kitchen,"['B07JNQCMX7', 'B077SPV6JD', 'B09LKND673', 'B0..."


In [81]:
len(df)

31

Remove already processed categories

In [85]:
def remove_processed(df, processed_dir=None):
    """Drop the categories that already have an output file.

    Output files are recognised by a stem of the form
    ``raw_meta_<Category_Name>_c4``; underscores in ``<Category_Name>`` are
    turned back into spaces before comparing with the ``category`` column.
    Entries whose stem does not follow that pattern are ignored.

    Args:
        df: DataFrame with a ``category`` column.
        processed_dir: Directory to scan for output files. Defaults to the
            notebook-level ``output_dir``.

    Returns:
        The rows of ``df`` whose category has no matching output file. ``df``
        is returned unchanged when the directory does not exist.
    """
    prefix, suffix = "raw_meta_", "_c4"
    directory = Path(output_dir if processed_dir is None else processed_dir)
    if not directory.is_dir():
        # Nothing has been written yet, so nothing counts as processed.
        return df

    processed = sorted(
        {
            entry.stem[len(prefix) : -len(suffix)].replace("_", " ")
            for entry in directory.iterdir()
            if entry.stem.startswith(prefix) and entry.stem.endswith(suffix)
        }
    )
    return df[~df["category"].isin(processed)]

In [86]:
print(len(df))
df = remove_processed(df)
print(len(df))

31
29


In [87]:
# Make sure the output directory exists before the long-running loop starts
# writing parquet files into it.
Path(output_dir).mkdir(parents=True, exist_ok=True)

for row in tqdm(df.itertuples(), total=len(df)):
    # Dataset config names use underscores where the category table uses spaces.
    category_name = row.category.replace(" ", "_")
    dataset_name = f"raw_meta_{category_name}"

    # The ids column holds the string repr of a list (CSV round trip); a set
    # gives constant-time membership checks while scanning millions of rows.
    item_id_set = set(ast.literal_eval(row.ids))

    ds = load_dataset(
        path="McAuley-Lab/Amazon-Reviews-2023",
        name=dataset_name,
        trust_remote_code=True,
        cache_dir=cache_dir,
        split="full",
    )

    # Keep only the rows whose parent_asin is a C4 item id. One filter pass is
    # enough; no temporary boolean column has to be added and removed again.
    filtered_dataset = ds.filter(lambda x: x["parent_asin"] in item_id_set)

    # Save the filtered metadata for this category to parquet.
    df_filtered = filtered_dataset.to_pandas()
    df_filtered.to_parquet(f"./{output_dir}/{dataset_name}_c4.parquet")

    # Free disk space before the next category: a single raw download can be
    # many gigabytes.
    clear_directory(Path(cache_dir))

  0%|          | 0/29 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/18.0G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/31 [00:00<?, ?it/s]

Map:   0%|          | 0/7218481 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7218481 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/851907 [00:00<?, ? examples/s]

Filter:   0%|          | 0/851907 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/6.87G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1591371 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1591371 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/11.8G [00:00<?, ?B/s]