In [1]:
# %pip install datasets
from datasets import load_dataset

# Amazon Reviews 2023: repository id and the Home & Kitchen metadata config
hf_repo_id = "McAuley-Lab/Amazon-Reviews-2023"
hf_config_name = "raw_meta_Home_and_Kitchen"

# trust_remote_code=True is passed through unchanged; it allows code shipped
# with the dataset repository to run during loading.
dataset = load_dataset(
    hf_repo_id,
    hf_config_name,
    trust_remote_code=True,
)

# Show the available splits, their columns and row counts
print(dataset)

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

DatasetDict({
    full: Dataset({
        features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
        num_rows: 3735584
    })
})


In [None]:
# Persist the loaded dataset to the local "amazon_products_dataset" directory
# (the same folder name the upload cell below reads from).
dataset.save_to_disk("amazon_products_dataset")

In [None]:
import os
from google.cloud import storage

# Destination bucket and the object-name prefix the dataset is uploaded under.
# Note: no project is set here; storage.Client() is created without arguments.
bucket_name = 'recomviz_home_and_kitchen'
destination_folder = 'datasets/amazon_products_dataset'  # GCS path

# Create the storage client and a handle to the destination bucket.
# `bucket` is read as a module-level global by upload_folder_to_gcs below.
client = storage.Client()
bucket = client.bucket(bucket_name)

# Upload function
def upload_folder_to_gcs(local_path, gcs_path, target_bucket=None):
    """Upload every file under ``local_path`` to GCS beneath ``gcs_path``.

    The directory tree is walked recursively and each file is uploaded to an
    object named ``<gcs_path>/<path of the file relative to local_path>``.

    Args:
        local_path: Local directory whose contents are uploaded.
        gcs_path: Object-name prefix ("folder") inside the bucket.
        target_bucket: Object exposing ``blob(name)`` whose result exposes
            ``upload_from_filename(path)``. Defaults to the module-level
            ``bucket``.

    Raises:
        FileNotFoundError: If ``local_path`` is not an existing directory.
    """
    import posixpath  # local import so the cell does not need a new top-level import

    if not os.path.isdir(local_path):
        # os.walk() yields nothing for a missing path, which would otherwise
        # look like a successful upload of zero files.
        raise FileNotFoundError(f"Local folder not found: {local_path}")

    destination = bucket if target_bucket is None else target_bucket

    for root, _, files in os.walk(local_path):
        for file in files:
            local_file = os.path.join(root, file)
            relative_path = os.path.relpath(local_file, local_path)
            # GCS object names use "/" as the separator, so build the name with
            # posixpath instead of the OS-specific os.path.join (which would
            # insert backslashes on Windows).
            blob_path = posixpath.join(gcs_path, *relative_path.split(os.sep))
            blob = destination.blob(blob_path)
            blob.upload_from_filename(local_file)
            print(f"Uploaded {local_file} to {blob_path}")

# Upload the locally saved dataset folder under the destination prefix
upload_folder_to_gcs("amazon_products_dataset", destination_folder)

In [None]:
# %pip install google-cloud-storage
from google.cloud import storage
import os

def download_gcs_folder(bucket_name, gcs_folder, local_folder, client=None):
    """Download every object under a GCS "folder" into a local directory.

    Objects are written to ``local_folder`` at the same relative path they have
    beneath ``gcs_folder``; missing local directories are created.

    Args:
        bucket_name: Name of the bucket to read from.
        gcs_folder: Object-name prefix ("folder") to download. A trailing "/"
            is optional; an empty string selects the whole bucket.
        local_folder: Local directory that receives the files.
        client: Object exposing ``bucket(name)``. Defaults to a new
            ``storage.Client()``.
    """
    if client is None:
        client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Anchor the prefix on a "/" boundary so that a sibling such as
    # "<gcs_folder>_v2/..." is not matched and then written outside
    # local_folder through a "../" relative path.
    prefix = gcs_folder.rstrip("/")
    if prefix:
        prefix += "/"

    for blob in bucket.list_blobs(prefix=prefix):
        relative_name = blob.name[len(prefix):]

        # Skip folder-placeholder objects (the prefix itself, or any name
        # ending in "/"): they have no file to write.
        if not relative_name or relative_name.endswith("/"):
            continue

        # Object names are "/"-separated; rebuild the path with the local separator
        local_path = os.path.join(local_folder, *relative_name.split("/"))

        # Create local directories if needed (dirname is "" when the file
        # lands directly in the current directory)
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Download the file
        blob.download_to_filename(local_path)
        print(f"Downloaded {blob.name} to {local_path}")

# Replace with your own values: source bucket, the GCS prefix to fetch,
# and the local directory that receives the files.
bucket_name = "recomviz_home_and_kitchen"
gcs_folder = "datasets/amazon_products_dataset"  # GCS path (folder)
local_folder = "amazon_products_dataset"          # Local target

# Fetch the objects under gcs_folder into local_folder
download_gcs_folder(bucket_name, gcs_folder, local_folder)

In [2]:
# Keep only the columns needed for the item metadata export
item_columns = ["title", "parent_asin", "description", "images"]
full_split = dataset["full"]
item_item_dataset = full_split.select_columns(item_columns)
print(item_item_dataset)

Dataset({
    features: ['title', 'parent_asin', 'description', 'images'],
    num_rows: 3735584
})


In [3]:
# Parameters for the batched Parquet export
batch_size = 100_000  # rows selected per batch (before rows with nulls are dropped)
total_rows = item_item_dataset.num_rows  # row count of the column-pruned dataset
bucket_path = "gs://recomviz_home_and_kitchen/datasets/converted_item_metadata"  # GCS URI prefix for the chunk files

In [5]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
# Export the item metadata to GCS, one Parquet file per fixed-size slice
for i in tqdm(range(0, total_rows, batch_size), desc="Uploading item metadata in batches"):
    # Rows [i, slice_end) of the dataset; the final slice may be shorter
    slice_end = min(i + batch_size, total_rows)
    batch = item_item_dataset.select(range(i, slice_end))

    # Drop rows missing a required field, then expose the ASIN as "product_id"
    df = (
        batch.to_pandas()
        .dropna(subset=["parent_asin", "title", "description"])
        .rename(columns={"parent_asin": "product_id"})
    )

    # Chunk number is the slice index, zero-padded to five digits
    chunk_uri = f"{bucket_path}/chunk_{i//batch_size:05}.parquet"

    # Write this slice straight to GCS
    df.to_parquet(
        chunk_uri,
        index=False,
        engine="pyarrow",
        storage_options={"token": "cloud"},  # Vertex AI-compatible
    )

Uploading item metadata in batches: 100%|██████████| 38/38 [03:17<00:00,  5.21s/it]
