In [None]:
# 5/28/2025
# Jupyter notebook for extracting Julian McAuley's Amazon reviews dataset (McAuley-Lab/Amazon-Reviews-2023). Specifically, I wish to pull the Baby Products category.
# The extracted data will be saved on an external drive to free up some disk space.

In [2]:
# Initialize necessary packages
import pandas as pd # For data manipulation
from datasets import load_dataset, config # contains the baby dataset
import shutil # Use to clear the cache of this dataframe
import os # Use to clear the cache of this dataframe

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Pull the Baby Products reviews and convert them into a pandas dataframe.
# No train/test partitioning is done here - the dataset is requested via its "full" split.
baby_products_raw = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    name="raw_review_Baby_Products",
    split="full",
    # Allows the dataset repository's own loading script to run locally
    trust_remote_code=True,
)
baby_products = baby_products_raw.to_pandas()

print(f"Loaded {len(baby_products)} baby product reviews")

Generating full split: 6028884 examples [01:02, 96657.99 examples/s]


Loaded 6028884 baby product reviews


In [24]:
# Peek at the first five rows to see which columns the reviews dataframe has
baby_products.head(n=5)

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4.0,Good buy for preschool naps and home use...,I bought two of these for my kids for nap time...,[],B004FM7VOW,B089MS68G8,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1471546337000,1,True
1,5.0,THEY WORK- and are super cute to boot...,LOVE THESE! AND THEY WORK!!! I was on the fenc...,[],B01E5E703G,B01E5E703G,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1471542244000,1,True
2,1.0,cute but small and pretty much unusable as a c...,cute but small and pretty much unusable as a c...,[],B00F463XV8,B00F9386Q8,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1452650881000,0,True
3,5.0,Works great perfect size!,I have lots of different disposable diaper bag...,[],B0007V644S,B07RRDX26B,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1408994051000,0,True
4,5.0,Cute and Works Great,I was so excited for bath time when I register...,[],B002LARFLY,B00OLRJET6,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1349818961000,0,False


In [None]:
# Write the reviews dataframe to a CSV file on the external drive (row index omitted)
external_drive_path = "/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/raw-data/baby_products.csv"
baby_products.to_csv(path_or_buf=external_drive_path, index=False)
print(f"DataFrame saved to {external_drive_path}")

DataFrame saved to /Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/raw-data/baby_products.csv


In [5]:
# The reviews dataset only covers the review text used for sentiment analysis,
# so the product names are pulled separately from the metadata configuration.
meta = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Baby_Products",
    split="full",
    # Allows the dataset repository's own loading script to run locally
    trust_remote_code=True,
)

# Convert the metadata into a pandas dataframe
baby_product_names = meta.to_pandas()

print(f"Loaded {len(baby_product_names)} baby product names")

Loaded 217724 baby product names


In [6]:
# Write the product-names dataframe to a CSV file on the external drive (row index omitted)
external_drive_path = "/Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/raw-data/baby_product_names.csv"
baby_product_names.to_csv(path_or_buf=external_drive_path, index=False)
print(f"DataFrame saved to {external_drive_path}")

DataFrame saved to /Volumes/Samsung1TB/programming/data-science/baby-product-dashboard/raw-data/baby_product_names.csv


In [7]:
# The following code cleans up the Hugging Face datasets cache to get back the
# disk space taken up by the cached Baby Products data

# Get Hugging Face datasets cache directory
cache_dir = config.HF_DATASETS_CACHE

# Walk the cache top-down and delete every subdirectory whose name mentions the Baby Products dataset
for root, dirs, _files in os.walk(cache_dir):
    # Collect this level's matches first so the list being walked is not mutated mid-iteration
    doomed = [d for d in dirs if "Baby_Products" in d]
    for d in doomed:
        full_path = os.path.join(root, d)
        print("Deleting:", full_path)
        shutil.rmtree(full_path)
    # Prune the deleted names in place: with a top-down walk, os.walk only descends into
    # what is left in `dirs`, so it will not try to scan directories that no longer exist
    dirs[:] = [d for d in dirs if d not in doomed]

Deleting: /Users/adamng/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_meta_Baby_Products
