# 🛒 Amazon Products Dataset API

Access Bright Data's pre-collected Amazon Products dataset:
- **85 fields** including pricing, ratings, reviews, categories, and more
- Filter by price, rating, brand, category, availability, and other criteria

---

## Setup

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

# Read the Bright Data token from the environment and fail fast when it is absent.
API_TOKEN = os.getenv("BRIGHTDATA_API_TOKEN")
if not API_TOKEN:
    raise ValueError("Set BRIGHTDATA_API_TOKEN in .env file")

# Print only a preview so the full secret never lands in saved notebook output.
# The preview exposes 14 characters (10 leading + 4 trailing); a token that is
# not longer than that would be shown in full, so it is masked entirely instead.
if len(API_TOKEN) > 14:
    token_preview = f"{API_TOKEN[:10]}...{API_TOKEN[-4:]}"
else:
    token_preview = "*" * len(API_TOKEN)

print(f"API Token: {token_preview}")
print("Setup complete!")

API Token: 7011787d-2...3336
Setup complete!


## Initialize Client

In [2]:
from brightdata import BrightDataClient

# One client object for the whole notebook: every later cell re-enters it with
# `async with client:` before calling the datasets API.
client = BrightDataClient(token=API_TOKEN)

print("Client initialized")

Client initialized


---
## Test 1: Explore Amazon Products Fields

Before filtering, explore available fields using the class metadata.

In [3]:
from brightdata.datasets import AmazonProducts

# Headline numbers read from the AmazonProducts class attributes.
print("=== Amazon Products Dataset ===")
print(f"Dataset ID: {AmazonProducts.DATASET_ID}")
print(f"Total fields: {len(AmazonProducts.FIELDS)}")

# Per-type breakdown: (display label, type name passed to get_fields_by_type).
field_type_labels = (
    ("Text", "text"),
    ("Number", "number"),
    ("Array", "array"),
    ("Boolean", "boolean"),
    ("URL", "url"),
)

print("\nField types:")
for label, field_type in field_type_labels:
    matching_fields = AmazonProducts.get_fields_by_type(field_type)
    print(f"  {label} fields: {len(matching_fields)}")

=== Amazon Products Dataset ===
Dataset ID: gd_l7q7dkf244hwjntr0
Total fields: 85

Field types:
  Text fields: 38
  Number fields: 14
  Array fields: 15
  Boolean fields: 7
  URL fields: 7


---
## Test 2: Get Dataset Metadata from API

In [5]:
print("Fetching Amazon Products metadata from API...\n")

async with client:
    metadata = await client.datasets.amazon_products.get_metadata()

print(f"Dataset ID: {metadata.id}")
print(f"Total fields from API: {len(metadata.fields)}")

print("\n=== Sample Fields ===")
for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):
    print(f"  {name}: {field.type} - {field.description or 'N/A'}")

Fetching Amazon Products metadata from API...

Dataset ID: gd_l7q7dkf244hwjntr0
Total fields from API: 86

=== Sample Fields ===
  title: text - Product title
  seller_name: text - Seller name
  brand: text - Product brand
  description: text - A brief description of the product
  initial_price: price - Initial price
  currency: text - Currency of the product
  availability: text - Product availability
  reviews_count: number - Number of reviews
  categories: array - Product categories
  parent_asin: text - Parent ASIN of the product


---
## Test 3: Keyword Search with Rating Filter

Search for products by keyword and filter by rating.

In [None]:
# Step 1: Create filter and get snapshot_id
# Search for keyboards with 4.5+ star rating
FILTER = {
    "operator": "and",
    "filters": [
        {"name": "title", "operator": "includes", "value": "keyboard"},
        {"name": "rating", "operator": ">=", "value": 4.5}
    ]
}
LIMIT = 2

print("Filter: Keyboards with rating >= 4.5")
print(f"Records limit: {LIMIT}\n")

async with client:
    snapshot_id = await client.datasets.amazon_products(
        filter=FILTER,
        records_limit=LIMIT
    )

    

print(f"Snapshot created: {snapshot_id}")
print("\nRun the next cell to download the data...")

Filter: Keyboards with rating >= 4.5
Records limit: 2

Snapshot created: snap_mley0j875vz72i0rb

Run the next cell to download the data...


In [9]:
# Step 2: Download data (polls until ready)
print(f"Downloading snapshot: {snapshot_id}")
print("(This will poll until ready...)\n")

async with client:
    data = await client.datasets.amazon_products.download(
        snapshot_id,
        timeout=300,
        poll_interval=5
    )

print(f"Downloaded {len(data)} products:")
for product in data[:5]:
    print(f"\n  Title: {product.get('title', 'N/A')[:60]}...")
    print(f"  Rating: {product.get('rating', 'N/A')} ({product.get('reviews_count', 0)} reviews)")
    print(f"  Price: {product.get('currency', '')} {product.get('final_price', 'N/A')}")
    print(f"  Brand: {product.get('brand', 'N/A')}")

Downloading snapshot: snap_mley0j875vz72i0rb
(This will poll until ready...)



TimeoutError: Snapshot snap_mley0j875vz72i0rb not ready after 300s (status: scheduled)

---
## Test 4: Filter by Price Range

Find products in a specific price range.

In [None]:
# Step 1: Create filter
PRICE_FILTER = {
    "operator": "and",
    "filters": [
        {"name": "final_price", "operator": ">=", "value": 50},
        {"name": "final_price", "operator": "<=", "value": 100}
    ]
}

print("Filter: Products priced $50-$100")
print(f"Records limit: 5\n")

async with client:
    snapshot_id = await client.datasets.amazon_products(
        filter=PRICE_FILTER,
        records_limit=5
    )

print(f"Snapshot created: {snapshot_id}")

In [None]:
# Step 2: Download
print(f"Downloading snapshot: {snapshot_id}\n")

async with client:
    data = await client.datasets.amazon_products.download(snapshot_id)

print(f"Downloaded {len(data)} products:")
for product in data:
    print(f"  - {product.get('title', 'N/A')[:50]}... - ${product.get('final_price', 'N/A')}")

---
## Test 5: Filter by Availability and Prime

Find available Prime-eligible products.

In [None]:
# Step 1: Create filter
PRIME_FILTER = {
    "operator": "and",
    "filters": [
        {"name": "is_available", "operator": "=", "value": True},
        {"name": "amazon_prime", "operator": "=", "value": True}
    ]
}

print("Filter: Available + Prime eligible")
print(f"Records limit: 5\n")

async with client:
    snapshot_id = await client.datasets.amazon_products(
        filter=PRIME_FILTER,
        records_limit=5
    )

print(f"Snapshot created: {snapshot_id}")

In [None]:
# Step 2: Download
print(f"Downloading snapshot: {snapshot_id}\n")

async with client:
    data = await client.datasets.amazon_products.download(snapshot_id)

print(f"Downloaded {len(data)} products:")
for product in data:
    print(f"\n  Title: {product.get('title', 'N/A')[:50]}...")
    print(f"  Available: {product.get('is_available', 'N/A')}")
    print(f"  Prime: {product.get('amazon_prime', 'N/A')}")
    print(f"  Price: ${product.get('final_price', 'N/A')}")

---
## Test 6: Filter by Brand

In [None]:
# Step 1: Create filter
BRAND = "Apple"

BRAND_FILTER = {
    "name": "brand",
    "operator": "=",
    "value": BRAND
}

print(f"Filter: Brand = {BRAND}")
print(f"Records limit: 5\n")

async with client:
    snapshot_id = await client.datasets.amazon_products(
        filter=BRAND_FILTER,
        records_limit=5
    )

print(f"Snapshot created: {snapshot_id}")

In [None]:
# Step 2: Download
print(f"Downloading snapshot: {snapshot_id}\n")

async with client:
    data = await client.datasets.amazon_products.download(snapshot_id)

print(f"Downloaded {len(data)} products:")
for product in data:
    print(f"  - {product.get('title', 'N/A')[:60]}...")
    print(f"    Brand: {product.get('brand', 'N/A')}, Price: ${product.get('final_price', 'N/A')}")

---
## Test 7: Export Results

In [None]:
import json
from pathlib import Path

# Write whatever the last download cell left in `data` to a JSON file in the
# current working directory; an empty result is reported instead of exported.
if not data:
    print("No data to export")
else:
    output_file = Path.cwd() / "amazon_dataset_results.json"

    # default=str: values the json module cannot encode are written via str().
    with open(output_file, "w") as fh:
        json.dump(data, fh, indent=2, default=str)

    print(f"Exported to: {output_file}")
    print(f"Records: {len(data)}")

from brightdata.datasets import export_json, export_csv, export

# Same export done through the SDK helpers; each helper's return value is
# printed as the export location.
# NOTE(review): unlike the manual export above, these calls are not guarded by
# `if data:` - confirm how the helpers behave when `data` is empty.

# Export to JSON
json_file = export_json(data, "amazon_results.json")
print(f"Exported to: {json_file}")

# Export to CSV
csv_file = export_csv(data, "amazon_results.csv")
print(f"Exported to: {csv_file}")

# Or use auto-detect based on extension
# export(data, "results.json")
# export(data, "results.csv")

print(f"\nRecords: {len(data)}")