In [1]:
# 🚀 Complete BrightData Workflow Demo
# This notebook demonstrates the full process: Filter → Search → Monitor → Download → View

import json
import time
import os
import sys
from pathlib import Path

def find_project_root(start, marker="util"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory (``pathlib.Path``) where the search begins.
        marker: Name of the file or directory that identifies the project root.

    Returns:
        The first directory, walking from ``start`` towards the filesystem
        root, that has an entry called ``marker``; ``None`` if there is none.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


# Make `from util import ...` work whether the notebook is started from the
# project root or from any directory nested below it.
current_dir = Path.cwd()
project_root = find_project_root(current_dir)
if project_root is None:
    # Do not guess: leave the working directory and sys.path untouched so the
    # import below fails in the directory the user actually started from.
    print(f"⚠️ No 'util' package found in {current_dir} or any parent directory")
elif project_root == current_dir:
    print(f"📁 Using current directory: {current_dir}")
else:
    # Guard against duplicate entries when the cell is re-run.
    if str(project_root) not in sys.path:
        sys.path.insert(0, str(project_root))
    os.chdir(project_root)
    print(f"📁 Changed to directory: {project_root}")

from util import BrightDataFilter

# Demo banner
print("🎯 BrightData Complete Workflow Demo")
print(50 * "=")

# Step 1: open one BrightDataFilter connection per dataset used below
print("\n📊 Step 1: Initialize Dataset Connections")
print(40 * "-")

# Each connection exposes its filter fields through the `.filter` attribute
amazon_products = BrightDataFilter("amazon_products")
shopee = BrightDataFilter("shopee")

print("✅ Dataset connections created with built-in filter fields")
for dataset_label, dataset in (("Amazon Products", amazon_products), ("Shopee", shopee)):
    print(f"{dataset_label}: {dataset.dataset_id}")

# Report how many filter fields each dataset offers
print("\n📋 Available filter fields:")
for short_label, dataset in (("Amazon", amazon_products), ("Shopee", shopee)):
    field_count = len(dataset.filter.get_field_names())
    print(f"{short_label} fields: {field_count} fields")


📁 Using current directory: /Users/derek/Documents/Projects/walmart insights
🎯 BrightData Complete Workflow Demo

📊 Step 1: Initialize Dataset Connections
----------------------------------------
✅ Dataset connections created with built-in filter fields
Amazon Products: gd_l7q7dkf244hwjntr0
Shopee: gd_lk122xxgf86xf97py

📋 Available filter fields:
Amazon fields: 52 fields
Shopee fields: 36 fields


In [2]:
# Step 2: Create Database Queries (Filters)
print("\n🔍 Step 2: Create Database Queries")
print(40 * "-")

# Short alias for the Amazon dataset's filter-field namespace
AF = amazon_products.filter

# Building blocks of the query: products that sell in volume but appear to be
# running short on stock.
well_rated = AF.rating >= 4.0                    # Good ratings
enough_reviews = AF.reviews_count >= 50          # Sufficient reviews
high_volume = AF.bought_past_month >= 1000       # Strong recent sales
# Low inventory: flagged as unavailable, or an availability value matching one
# of these scarcity words (exact `includes` semantics live in util — confirm).
low_inventory = AF.is_available.is_false() | AF.availability.includes(['only', 'within', 'limited'])
priced_in_usd = AF.currency == "USD"             # USD currency

# Combined in the same left-to-right order as the printed breakdown
high_volumn_low_inventory = (
    well_rated & enough_reviews & high_volume & low_inventory & priced_in_usd
)

# Show the composed query tree
print("\n📋 Query breakdown:")
print(high_volumn_low_inventory)



🔍 Step 2: Create Database Queries
----------------------------------------

📋 Query breakdown:
(
  rating >= 4.0
  AND
  reviews_count >= 50
  AND
  bought_past_month >= 1000
  AND
  (
    is_available = False
    OR
    availability includes ['only', 'within', 'limited']
  )
  AND
  currency = USD
)


In [5]:
# Step 3: Submit Database Query to BrightData
print("\n📤 Step 3: Submit Database Query")
print("-" * 40)

# Single source of truth for the record cap, so the value sent to the API and
# the value reported to the reader cannot drift apart.
RECORDS_LIMIT = 1000
# Placeholder ID that lets the remaining cells run when submission fails.
DEMO_SNAPSHOT_ID = "snap_demo123456789"

# Submit the query to BrightData API
print("🚀 Submitting query to BrightData...")
try:
    result = amazon_products.search_data(high_volumn_low_inventory, records_limit=RECORDS_LIMIT)
    snapshot_id = result['snapshot_id']
except Exception as e:  # deliberately broad: the demo must keep going on any failure
    error_msg = str(e)
    print(f"❌ Query submission failed: {error_msg}")

    # Classify the failure from the message text and pick a matching follow-up hint
    if "401" in error_msg or "Invalid credentials" in error_msg:
        print("🔑 Authentication issue detected")
        print("💡 Make sure your API key is correctly set in secrets.yaml")
        print("📋 Get your API key from: https://brightdata.com/cp/setting/users")
        print("🔧 Example secrets.yaml format:")
        print("   brightdata:")
        print("     api_key: 'your_api_key_here'")
        production_hint = "fix the authentication issue and retry"
    elif "400" in error_msg:
        print("🔍 Query validation issue detected")
        print("💡 Check your filter conditions and field names")
        production_hint = "fix the query validation issue and retry"
    else:
        print("🌐 Network or API issue detected")
        print("💡 Check your internet connection and API status")
        production_hint = "resolve the network or API issue and retry"

    # For demo purposes, fall back to a mock snapshot ID
    snapshot_id = DEMO_SNAPSHOT_ID
    print(f"\n🔄 Using demo snapshot ID: {snapshot_id}")
    print("📝 This allows the demo to continue showing the workflow")
    print(f"💡 In production, {production_hint}")
else:
    # Only reached when the API call returned and contained a snapshot ID
    print("✅ Query submitted successfully!")
    print(f"📋 Snapshot ID: {snapshot_id}")
    print(f"📊 Records limit: {RECORDS_LIMIT}")
    print("💾 Local record saved automatically")



📤 Step 3: Submit Database Query
----------------------------------------
🚀 Submitting query to BrightData...
❌ Query submission failed: API request failed: HTTP 401: Invalid credentials
🔑 Authentication issue detected
💡 Make sure your API key is correctly set in secrets.yaml
📋 Get your API key from: https://brightdata.com/cp/setting/users
🔧 Example secrets.yaml format:
   brightdata:
     api_key: 'your_api_key_here'

🔄 Using demo snapshot ID: snap_demo123456789
📝 This allows the demo to continue showing the workflow
💡 In production, fix the authentication issue and retry


In [None]:
# Step 4: Monitor Snapshot Status
print("\n⏳ Step 4: Monitor Snapshot Status")
print("-" * 40)

# Check snapshot status (a single poll; re-run the cell to refresh)
print(f"🔍 Checking status for snapshot: {snapshot_id}")
try:
    # Get snapshot metadata
    metadata = amazon_products.get_snapshot_metadata(snapshot_id)
    status = metadata.get('status')
    cost = metadata.get('cost')
    # Only prefix the currency sign when a cost is actually reported
    cost_text = "N/A" if cost is None else f"${cost}"

    print(f"📊 Snapshot Status: {metadata.get('status', 'Unknown')}")
    print(f"📈 Progress: {metadata.get('progress', 'N/A')}")
    print(f"💰 Cost: {cost_text}")
    print(f"📅 Created: {metadata.get('created_at', 'N/A')}")
    print(f"📅 Updated: {metadata.get('updated_at', 'N/A')}")

    # Downloading is only possible once the snapshot reports 'ready'
    download_ready = status == 'ready'
    if download_ready:
        print("✅ Snapshot is ready for download!")
    elif status == 'failed':
        # NOTE(review): 'failed' is treated as a terminal status here — confirm
        # the exact status vocabulary against the BrightData snapshot API docs.
        print("❌ Snapshot processing failed; submit the query again")
    else:
        print("⏳ Snapshot is still processing...")

except Exception as e:
    print(f"❌ Error checking status: {e}")
    # A lookup failure is only expected for the placeholder ID used when the
    # submission step could not reach the API.
    if str(snapshot_id).startswith("snap_demo"):
        print("🔄 This is normal for demo purposes")
    else:
        print("💡 Verify the snapshot ID and API key, then re-run this cell")
    download_ready = False

Dataset: Amazon Products
Available fields: 52

📋 Field Reference (showing first 10 fields):
title: Product title
asin: Unique identifier for each product
parent_asin: Parent ASIN of the product
brand: Product brand
description: A brief description of the product
categories: Product categories
initial_price: Initial price
final_price: Final price of the product
final_price_high: Highest value of the final price when it is a range
currency: Currency of the product
... and 42 more fields


In [None]:
# Step 5: Download Snapshot Data
print("\n📥 Step 5: Download Snapshot Data")
print("-" * 40)

if download_ready:
    print("🚀 Attempting to download snapshot data...")
    try:
        # Try direct download first
        # NOTE(review): the response is assumed to be a requests-style streaming
        # response because iter_content() is called on it — confirm in util.
        response = amazon_products.download_snapshot_content(snapshot_id, format="json")
        partial_path = None
        try:
            # Save to downloads directory
            downloads_dir = Path("downloads")
            downloads_dir.mkdir(exist_ok=True)

            file_path = downloads_dir / f"{snapshot_id}.json"
            # Stream into a side file first so an interrupted transfer never
            # leaves a truncated .json (or clobbers an earlier good download).
            partial_path = downloads_dir / f"{snapshot_id}.json.part"

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            # Promote the finished download to its final name
            partial_path.replace(file_path)
        finally:
            # Remove leftovers from a failed transfer (no-op after a successful rename)
            if partial_path is not None and partial_path.exists():
                partial_path.unlink()
            # Release the connection if the response object supports it
            close_response = getattr(response, "close", None)
            if callable(close_response):
                close_response()

        print("✅ Snapshot downloaded successfully!")
        print(f"📁 Saved to: {file_path}")
        print(f"📊 File size: {file_path.stat().st_size:,} bytes")

    except Exception as e:
        print(f"❌ Direct download failed: {e}")
        print("📤 This might require the deliver snapshot method")
        file_path = None
else:
    print("⏳ Snapshot not ready for download yet")
    print("💡 Use the snapshot manager to monitor and download when ready:")
    print("   python snapshot_manager.py")
    file_path = None


In [None]:
# Step 6: View and Analyze Downloaded Data
print("\n📊 Step 6: View and Analyze Downloaded Data")
print(40 * "-")

# True only when the download step produced a path that exists on disk
have_download = bool(file_path) and file_path.exists()

if not have_download:
    print("📁 No downloaded file available to view")
    print("💡 Complete the download step first, or check existing downloads:")

    # Fall back to listing whatever earlier runs left in downloads/
    saved_dir = Path("downloads")
    if not saved_dir.exists():
        print("📂 Downloads directory doesn't exist yet")
    else:
        json_files = list(saved_dir.glob("*.json"))
        if not json_files:
            print("📂 No existing downloads found")
        else:
            print(f"📂 Found {len(json_files)} existing download(s):")
            # Only the first three entries are listed
            for json_file in json_files[:3]:
                print(f"  - {json_file.name}")
else:
    print(f"📖 Reading downloaded data from: {file_path}")
    try:
        # Parse the whole file as JSON
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        print("✅ Data loaded successfully!")

        # Describe the payload according to its top-level JSON type
        if isinstance(payload, list):
            print(f"📊 Total records: {len(payload)}")
            if payload:
                print(f"📋 Sample record fields: {list(payload[0].keys())}")

                # Preview the first record, capped at five fields
                print("\n🔍 Sample record:")
                first_record = payload[0]
                for position, (field, field_value) in enumerate(first_record.items()):
                    if position == 5:
                        break
                    print(f"  {field}: {field_value}")
                hidden_fields = len(first_record) - 5
                if hidden_fields > 0:
                    print(f"  ... and {hidden_fields} more fields")

        elif isinstance(payload, dict):
            print(f"📊 Data structure: Dictionary with {len(payload)} keys")
            print(f"📋 Keys: {list(payload.keys())}")

        else:
            print(f"📊 Data type: {type(payload)}")

    except Exception as read_error:
        print(f"❌ Error reading data: {read_error}")


In [None]:
# Step 7: Snapshot Management Tools
print("\n🛠️ Step 7: Snapshot Management Tools")
print(40 * "-")

print("📋 Available snapshot management commands:")
print(50 * "=")

# (description, command line) pairs, numbered in display order
manager_commands = [
    ("📊 List all snapshots", "python snapshot_manager.py"),
    ("🔍 Check specific snapshot status", "python snapshot_manager.py -s <snapshot_id>"),
    ("📥 Download ready snapshots", "python snapshot_manager.py -d"),
    ("👀 View downloaded data", "python snapshot_manager.py -v <snapshot_id>"),
    ("🗑️ Clean up old snapshots", "python snapshot_manager.py -c"),
]
for number, (description, command) in enumerate(manager_commands, start=1):
    print(f"{number}. {description}:")
    print(f"   {command}")
    print()

# Practical notes, one bullet per line
pro_tips = [
    "Snapshots can take 30+ minutes to process",
    "Use the snapshot manager to monitor progress",
    "Local records are automatically saved for each submission",
    "Download URLs may require delivery job initiation",
    "Check the downloads/ folder for your data files",
]
print("💡 Pro Tips:")
for tip in pro_tips:
    print(f"• {tip}")


In [None]:
# 🎉 Workflow Summary
print("\n🎯 Complete Workflow Summary")
print(50 * "=")

# The seven workflow stages, printed as a numbered list
accomplishments = [
    "📊 Initialized dataset connections with built-in filter fields",
    "🔍 Created complex database queries using intuitive syntax",
    "📤 Submitted queries to BrightData API",
    "⏳ Monitored snapshot processing status",
    "📥 Downloaded snapshot data (when ready)",
    "📊 Analyzed and viewed downloaded data",
    "🛠️ Learned about snapshot management tools",
]
print("✅ What we accomplished:")
for step_number, accomplishment in enumerate(accomplishments, start=1):
    print(f"{step_number}. {accomplishment}")
print()

# Remaining sections share one layout: heading, bullet list, blank line
bullet_sections = [
    (
        "🚀 Key Benefits of the BrightData System:",
        [
            "Unified API: dataset.filter.field syntax",
            "Automatic API key loading from secrets.yaml",
            "Local record management for all submissions",
            "Multiple download methods (direct + deliver)",
            "Comprehensive snapshot monitoring",
            "Type-safe field validation",
            "Support for multiple datasets",
        ],
    ),
    (
        "📚 Next Steps:",
        [
            "Try different filter combinations",
            "Experiment with other datasets (Shopee, Amazon-Walmart)",
            "Use the snapshot manager for production workflows",
            "Check the README.md for advanced features",
        ],
    ),
]
for heading, bullets in bullet_sections:
    print(heading)
    for bullet in bullets:
        print(f"• {bullet}")
    print()

print("🎊 Happy data querying with BrightData!")


In [None]:
# 🔧 API Key Setup Guide
# Prints static setup instructions; nothing here reads or validates a key.
print("\n🔑 API Key Setup Guide")
print("=" * 50)

print("📋 To use the BrightData API, you need to set up your API key:")
print()

print("1. 🌐 Get your API key from BrightData:")
print("   https://brightdata.com/cp/setting/users")
print()

print("2. 📁 Create a secrets.yaml file in your project root:")
print("   brightdata:")
print("     api_key: 'your_actual_api_key_here'")
print()

print("3. 🔒 Keep your API key secure:")
print("   • Never commit secrets.yaml to version control")
print("   • Add secrets.yaml to your .gitignore file")
print("   • Use environment variables in production")
print()

print("4. ✅ Test your setup:")
print("   from util import get_brightdata_api_key")
print("   api_key = get_brightdata_api_key()")
# The suggested check reports only whether a key was loaded: echoing any part
# of the key would store secret material in the saved notebook output.
print("   print('API key loaded:', bool(api_key))")
print()

print("💡 Once your API key is set up, re-run the demo cells above!")
print("🎯 The demo will work with real data instead of mock data.")
