In [1]:
# 🚀 Complete BrightData Workflow Demo
# This notebook demonstrates the full process: Filter → Search → Monitor → Download → View

import json
import time
import os
import sys
from pathlib import Path

# Ensure we're in the correct directory for imports
current_dir = Path.cwd()
if not (current_dir / 'util').exists():
    # If running from a subdirectory, add parent to path
    parent_dir = current_dir.parent
    sys.path.insert(0, str(parent_dir))
    os.chdir(parent_dir)
    print(f"📁 Changed to directory: {parent_dir}")
else:
    print(f"📁 Using current directory: {current_dir}")

from util import BrightDataFilter

print("🎯 BrightData Complete Workflow Demo")
print("=" * 50)

# Step 1: Initialize dataset connections
print("\n📊 Step 1: Initialize Dataset Connections")
print("-" * 40)

# Create dataset connections with built-in filter fields
amazon_products = BrightDataFilter("amazon_products")
shopee = BrightDataFilter("shopee")

print("✅ Dataset connections created with built-in filter fields")
print(f"Amazon Products: {amazon_products.dataset_id}")
print(f"Shopee: {shopee.dataset_id}")

# Show available fields
print(f"\n📋 Available filter fields:")
print(f"Amazon fields: {len(amazon_products.filter.get_field_names())} fields")
print(f"Shopee fields: {len(shopee.filter.get_field_names())} fields")


📁 Using current directory: /Users/derek/Documents/Projects/walmart insights
🎯 BrightData Complete Workflow Demo

📊 Step 1: Initialize Dataset Connections
----------------------------------------
✅ Dataset connections created with built-in filter fields
Amazon Products: gd_l7q7dkf244hwjntr0
Shopee: gd_lk122xxgf86xf97py

📋 Available filter fields:
Amazon fields: 52 fields
Shopee fields: 36 fields


In [2]:
# Step 2: Create Database Queries (Filters)
print("\n🔍 Step 2: Create Database Queries")
print("-" * 40)
AF = amazon_products.filter
# Create a complex filter for high-quality, affordable products
high_volumn_low_inventory = (
    (AF.rating >= 4.0) &           # Good ratings
    (AF.reviews_count >= 50) &     # Sufficient reviews
    (AF.bought_past_month >= 1000) &
        (
        AF.is_available.is_false() |  
        (AF.availability.includes(['only', 'within', 'limited']))
    ) &      # Exclude products with FREE delivery
    (AF.currency == "USD")         # USD currency
)

# Show individual conditions
print(f"\n📋 Query breakdown:")
print(high_volumn_low_inventory)



🔍 Step 2: Create Database Queries
----------------------------------------

📋 Query breakdown:
(
  rating >= 4.0
  AND
  reviews_count >= 50
  AND
  bought_past_month >= 1000
  AND
  (
    is_available = False
    OR
    availability includes ['only', 'within', 'limited']
  )
  AND
  currency = USD
)


In [3]:
# Step 3: Submit Database Query to BrightData
print("\n📤 Step 3: Submit Database Query")
print("-" * 40)

# Submit the query to BrightData API
print("🚀 Submitting query to BrightData...")
try:
    result = amazon_products.search_data(high_volumn_low_inventory, records_limit=1000)
    snapshot_id = result['snapshot_id']
    print(f"✅ Query submitted successfully!")
    print(f"📋 Snapshot ID: {snapshot_id}")
    print(f"📊 Records limit: 1000")
    print(f"💾 Local record saved automatically")
    
except Exception as e:
    error_msg = str(e)
    print(f"❌ Query submission failed: {error_msg}")
    
    # Handle different types of errors
    if "401" in error_msg or "Invalid credentials" in error_msg:
        print("🔑 Authentication issue detected")
        print("💡 Make sure your API key is correctly set in secrets.yaml")
        print("📋 Get your API key from: https://brightdata.com/cp/setting/users")
        print("🔧 Example secrets.yaml format:")
        print("   brightdata:")
        print("     api_key: 'your_api_key_here'")
    elif "400" in error_msg:
        print("🔍 Query validation issue detected")
        print("💡 Check your filter conditions and field names")
    else:
        print("🌐 Network or API issue detected")
        print("💡 Check your internet connection and API status")
    
    # For demo purposes, use a mock snapshot ID
    snapshot_id = "snap_demo123456789"
    print(f"\n🔄 Using demo snapshot ID: {snapshot_id}")
    print("📝 This allows the demo to continue showing the workflow")
    print("💡 In production, fix the authentication issue and retry")



📤 Step 3: Submit Database Query
----------------------------------------
🚀 Submitting query to BrightData...
🔄 Found existing snapshot with same conditions: snap_mfloqx442j7c6rppa8
📊 Status: submitted
💰 Cost: $None
📅 Created: 2025-09-15T15:18:21.590629
✅ Query submitted successfully!
📋 Snapshot ID: snap_mfloqx442j7c6rppa8
📊 Records limit: 1000
💾 Local record saved automatically


In [4]:
# Step 4: Monitor Snapshot Status
print("\n⏳ Step 4: Monitor Snapshot Status")
print("-" * 40)

# Check snapshot status
print(f"🔍 Checking status for snapshot: {snapshot_id}")
try:
    # Get snapshot metadata
    metadata = amazon_products.get_snapshot_metadata(snapshot_id)
    
    print(f"📊 Snapshot Status: {metadata.get('status', 'Unknown')}")
    print(f"📈 Progress: {metadata.get('progress', 'N/A')}")
    print(f"💰 Cost: ${metadata.get('cost', 'N/A')}")
    print(f"📅 Created: {metadata.get('created_at', 'N/A')}")
    print(f"📅 Updated: {metadata.get('updated_at', 'N/A')}")
    
    # Check if ready for download
    if metadata.get('status') == 'ready':
        print("✅ Snapshot is ready for download!")
        download_ready = True
    else:
        print("⏳ Snapshot is still processing...")
        download_ready = False
        
except Exception as e:
    print(f"❌ Error checking status: {e}")
    print("🔄 This is normal for demo purposes")
    download_ready = False


⏳ Step 4: Monitor Snapshot Status
----------------------------------------
🔍 Checking status for snapshot: snap_mfloqx442j7c6rppa8
📊 Snapshot Status: scheduled
📈 Progress: N/A
💰 Cost: $0
📅 Created: N/A
📅 Updated: N/A
⏳ Snapshot is still processing...


In [None]:
# Step 5: Download Snapshot Data
print("\n📥 Step 5: Download Snapshot Data")
print("-" * 40)

if download_ready:
    print("🚀 Attempting to download snapshot data...")
    try:
        # Try direct download first
        response = amazon_products.download_snapshot_content(snapshot_id, format="json")
        
        # Save to downloads directory
        downloads_dir = Path("data/downloads")
        downloads_dir.mkdir(exist_ok=True)
        
        file_path = downloads_dir / f"{snapshot_id}.json"
        
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        print(f"✅ Snapshot downloaded successfully!")
        print(f"📁 Saved to: {file_path}")
        print(f"📊 File size: {file_path.stat().st_size:,} bytes")
        
    except Exception as e:
        print(f"❌ Direct download failed: {e}")
        print("📤 This might require the deliver snapshot method")
        file_path = None
else:
    print("⏳ Snapshot not ready for download yet")
    print("💡 Use the snapshot manager to monitor and download when ready:")
    print("   python snapshot_manager.py")
    file_path = None



📥 Step 5: Download Snapshot Data
----------------------------------------
⏳ Snapshot not ready for download yet
💡 Use the snapshot manager to monitor and download when ready:
   python snapshot_manager.py


In [None]:
# Step 6: View and Analyze Downloaded Data
print("\n📊 Step 6: View and Analyze Downloaded Data")
print("-" * 40)
snap_id = "snap_mflgxpdw1xqp0hpiav"
file_path = Path("data/downloads") / f"{snap_id}.json"
import pandas as pd
snapshot = pd.read_json(file_path)
snapshot



📊 Step 6: View and Analyze Downloaded Data
----------------------------------------


Unnamed: 0,about_the_author,amazon_choice,amazon_prime,answered_questions,asin,availability,badge,bought_past_month,brand,bs_category,...,title,top_review,upc,url,variations,variations_values,video,video_count,videos,zipcode
0,,0.0,0.0,0.0,B0CYGSXP5S,Only 5 left in stock.,Lowest price in 30 days,,Xinafan,Coffee Scoops,...,Xinafan Smart Coffee Scoop for Ninja Coffee Ma...,Pile poile,,https://www.amazon.ca/Xinafan-Measuring-Replac...,,,True,1,[https://www.amazon.ca/vdp/0aa80c3ee43746fd897...,T3H 4G
1,,0.0,,0.0,B093WBKB5K,In Stock,,,OWENIE,Place Mats,...,OWENIE Thanksgiving Harvest Pumpkin Placemats ...,so mant uses and the children love then fot th...,761588156465,https://www.amazon.com/Thanksgiving-Placemats-...,"[{'asin': 'B08BP5QQCS', 'color': 'White', 'cur...","[{'values': ['Ivory', 'Rust', 'White'], 'varia...",True,1,[https://www.amazon.com/vdp/036a05a482a348df93...,11001
2,,0.0,,0.0,B0BL3MHFXH,Only 4 left in stock.,,,Marlon,Women's Nightdresses & Nightshirts,...,Marlon Women's Cherie Satin & Lace Classic Nig...,ideal this warm summer worth the money,,https://www.amazon.co.uk/Marlon-Womens-Cherie-...,"[{'asin': 'B0BL3NTLH1', 'color': 'Pink', 'curr...","[{'values': ['8-10', '12-14', '20-22'], 'varia...",False,0,[https://www.amazon.co.uk/vdp/1c05efbdf4cd4ec9...,
3,,0.0,,0.0,3831332312,Out of Print--Limited Availability.,,,Wulf Mämpel,,...,(Delikat)Essen: Menschen und Geschichten aus E...,,,https://www.amazon.com/Delikat-Essen/dp/383133...,,,False,0,,11001
4,,0.0,,0.0,1429676183,Temporarily out of stock. Order now and we'll ...,,,Sally Lee,Children's Diet & Nutrition Books (Books),...,"Food Safety (First Facts, Staying Safe)",Just as described,,https://www.amazon.com/Food-Safety-Staying-Saf...,,,False,0,,11001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,0.0,,0.0,B00WDFB7VQ,Currently unavailable. We don't know when or i...,,,Near and Deer,Collectible Figurines,...,"Faux Taxidermy Ram Head Mount, Silver, R1010",Really pretty piece. I purchased the ram in al...,,https://www.amazon.com/Near-Deer-R1010-Taxider...,,,False,0,,11001
96,,1.0,1.0,0.0,B0BR64RM8Z,In Stock,Amazon's Choice,,EcoBlossom,Menstrual Cups,...,"EcoBlossom Menstrual Cup Kit - Tampon, Pad, an...",what a fantastic product. Comes in its own ca...,,https://www.amazon.com/EcoBlossom-Menstrual-Cu...,"[{'asin': 'B0BR64RM8Z', 'color': 'Midnight Bla...","[{'values': ['Midnight Black (Regular)', 'Midn...",True,1,[https://www.amazon.com/vdp/0392d797d4d44639ac...,94107
97,,0.0,,0.0,B01HNIR54K,Currently unavailable. We don't know when or i...,,,"Hi-Tec Sports USA, Inc",,...,Hi-Tec Men's V-Lite Wildlife Low I Hiking Shoe,I liked the shoe. Too bad its is too tight ar...,,https://www.amazon.com/Hi-Tec-V-Lite-Wildlife-...,,,False,0,,11001
98,,0.0,,0.0,B01LKCW8XI,Currently unavailable. We don't know when or i...,,,ZHANZZK,Shower Curtains,...,ZHANZZK Family Rules Educational Waterproof Ba...,i like this shower curtain a lot. the black an...,,https://www.amazon.com/Educational-Waterproof-...,,,False,0,,11001


In [16]:
# Statistical Analysis
print("📈 Statistical Analysis")
print("-" * 30)

if 'df' in locals() and not df.empty:
    # Basic statistics for numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0:
        print("📊 Numeric Columns Statistics:")
        display(df[numeric_cols].describe())
        
        # Correlation matrix for numeric columns (if more than 1)
        if len(numeric_cols) > 1:
            print("\n🔗 Correlation Matrix:")
            corr_matrix = df[numeric_cols].corr()
            display(corr_matrix)
    
    # Categorical analysis
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\n📋 Categorical Columns Analysis:")
        for col in categorical_cols[:5]:  # Show top 5 categorical columns
            print(f"\n{col} - Value Counts:")
            value_counts = df[col].value_counts()
            print(f"  Unique values: {df[col].nunique()}")
            print(f"  Most common: {value_counts.head(3).to_dict()}")
    
    # Missing data analysis
    print(f"\n❓ Missing Data Analysis:")
    missing_data = df.isnull().sum()
    missing_percent = (missing_data / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing_data,
        'Missing %': missing_percent
    }).sort_values('Missing Count', ascending=False)
    
    # Only show columns with missing data
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    if not missing_df.empty:
        display(missing_df)
    else:
        print("✅ No missing data found!")
    
    # Data quality insights
    print(f"\n🔍 Data Quality Insights:")
    print(f"  • Total records: {len(df):,}")
    print(f"  • Total columns: {len(df.columns)}")
    print(f"  • Numeric columns: {len(numeric_cols)}")
    print(f"  • Categorical columns: {len(categorical_cols)}")
    
    # Check for duplicates (handle unhashable types)
    try:
        duplicate_count = df.duplicated().sum()
        print(f"  • Duplicate rows: {duplicate_count}")
    except TypeError:
        print(f"  • Duplicate rows: Cannot check (unhashable data types present)")
        
    # Check for columns with complex data types
    complex_cols = []
    for col in df.columns:
        if df[col].dtype == 'object':
            # Check if column contains dictionaries or lists
            sample_values = df[col].dropna().head(10)
            if len(sample_values) > 0:
                first_val = sample_values.iloc[0]
                if isinstance(first_val, (dict, list)):
                    complex_cols.append(col)
    
    if complex_cols:
        print(f"  • Complex data columns: {len(complex_cols)} ({', '.join(complex_cols[:3])}{'...' if len(complex_cols) > 3 else ''})")
    
else:
    print("❌ No data available for analysis")
    print("💡 Make sure to load the data in the previous cell first")


📈 Statistical Analysis
------------------------------
📊 Numeric Columns Statistics:


Unnamed: 0,amazon_choice,amazon_prime,answered_questions,bought_past_month,bs_rank,climate_pledge_friendly,editorial_reviews,final_price,final_price_high,images_count,initial_price,is_available,max_quantity_available,number_of_sellers,premium_brand,rating,reviews_count,root_bs_rank,sponsered,video_count
count,84.0,13.0,86.0,2.0,51.0,84.0,0.0,44.0,1.0,100.0,37.0,84.0,42.0,87.0,13.0,100.0,100.0,56.0,84.0,100.0
mean,0.02381,0.461538,0.034884,75.0,28274520000.0,0.0,,39.316136,24.99,4.14,36.177027,0.488095,23.880952,1.206897,0.0,4.556,63.04,1697228000.0,0.25,0.21
std,0.153371,0.518875,0.323498,35.355339,201919800000.0,0.0,,54.412697,,2.824925,38.009451,0.50286,20.217358,1.487451,0.0,0.337645,107.77398,12688480000.0,0.435613,0.555869
min,0.0,0.0,0.0,50.0,4.0,0.0,,5.99,24.99,1.0,5.99,0.0,1.0,1.0,0.0,4.0,1.0,560.0,0.0,0.0
25%,0.0,0.0,0.0,62.5,574.0,0.0,,12.99,24.99,1.0,12.99,0.0,10.0,1.0,0.0,4.3,2.75,160722.5,0.0,0.0
50%,0.0,0.0,0.0,75.0,2067.0,0.0,,19.99,24.99,3.5,19.99,0.0,30.0,1.0,0.0,4.5,11.5,299708.0,0.0,0.0
75%,0.0,1.0,0.0,87.5,13123.0,0.0,,44.7625,24.99,7.0,48.04,1.0,30.0,1.0,0.0,5.0,67.5,2198956.0,0.25,0.0
max,1.0,1.0,3.0,100.0,1441996000000.0,0.0,,329.0,24.99,10.0,209.99,1.0,100.0,14.0,0.0,5.0,492.0,94953520000.0,1.0,3.0



🔗 Correlation Matrix:


Unnamed: 0,amazon_choice,amazon_prime,answered_questions,bought_past_month,bs_rank,climate_pledge_friendly,editorial_reviews,final_price,final_price_high,images_count,initial_price,is_available,max_quantity_available,number_of_sellers,premium_brand,rating,reviews_count,root_bs_rank,sponsered,video_count
amazon_choice,1.0,0.032898,,-1.0,-0.028572,,,-0.177181,,0.199127,-0.160589,0.003719,-0.043444,,,-0.110559,0.272756,-0.029184,0.090167,0.35284
amazon_prime,0.032898,1.0,,,0.443642,,,-0.03799,,-0.134093,-0.047056,0.507093,-0.292344,,,-0.365148,-0.120453,0.354684,0.051434,-0.404651
answered_questions,,,1.0,,,,,-0.057625,,0.026409,,,,1.0,,-0.077453,-0.01034,-0.019623,,-0.042925
bought_past_month,-1.0,,,1.0,1.0,,,1.0,,-1.0,1.0,1.0,1.0,,,-1.0,-1.0,-1.0,-1.0,-1.0
bs_rank,-0.028572,0.443642,,1.0,1.0,,,0.009333,,-0.044186,-0.001468,-0.229907,-0.183048,,,0.140657,-0.106976,-0.025156,-0.1,-0.073302
climate_pledge_friendly,,,,,,,,,,,,,,,,,,,,
editorial_reviews,,,,,,,,,,,,,,,,,,,,
final_price,-0.177181,-0.03799,-0.057625,1.0,0.009333,,,1.0,,-0.122698,0.991462,0.125312,-0.085527,-0.079827,,0.089895,-0.153105,-0.078967,0.241006,0.220494
final_price_high,,,,,,,,,,,,,,,,,,,,
images_count,0.199127,-0.134093,0.026409,-1.0,-0.044186,,,-0.122698,,1.0,0.086738,0.085579,-0.064537,0.007705,,-0.305883,0.330131,0.193071,0.32505,0.367043



📋 Categorical Columns Analysis:

about_the_author - Value Counts:
  Unique values: 1
  Most common: {'Jennifer Leitzman is a full time author and mother. She and her husband make their home in Virginia with their daughter and a dog. Before she decided to try her hand at writing, her love of stories led her to a Bachelors in history. For updates on her newest projects, visit her website JenniferLeitzman.com.': 1}

asin - Value Counts:
  Unique values: 100
  Most common: {'B0CYGSXP5S': 1, 'B01EZFJAT2': 1, 'B0735CJH1X': 1}

availability - Value Counts:
  Unique values: 21
  Most common: {"Currently unavailable. We don't know when or if this item will be back in stock.": 40, 'In Stock': 24, 'Only 1 left in stock - order soon.': 4}

badge - Value Counts:
  Unique values: 2
  Most common: {"Amazon's  Choice": 2, 'Lowest price in 30 days': 1}

brand - Value Counts:
  Unique values: 96
  Most common: {'inktastic': 2, 'Unique Loom': 2, 'SLEEKTRENDS': 1}

❓ Missing Data Analysis:


Unnamed: 0,Missing Count,Missing %
editorial_reviews,100,100.0
about_the_author,99,99.0
final_price_high,99,99.0
origin_url,99,99.0
input_asin,99,99.0
...,...,...
number_of_sellers,13,13.0
availability,11,11.0
seller_id,8,8.0
categories,7,7.0



🔍 Data Quality Insights:
  • Total records: 100
  • Total columns: 76
  • Numeric columns: 20
  • Categorical columns: 54
  • Duplicate rows: Cannot check (unhashable data types present)
  • Complex data columns: 19 (buybox_prices, categories, customers_say...)


# Step 7: Snapshot Management Tools

## 📋 Available snapshot management commands:

### 1. 📊 List all snapshots:
```bash
python snapshot_manager.py
```

### 2. 🔍 Check specific snapshot status:
```bash
python snapshot_manager.py -s <snapshot_id>
```

### 3. 📥 Download ready snapshots:
```bash
python snapshot_manager.py -d
```

### 4. 👀 View downloaded data:
```bash
python snapshot_manager.py -v <snapshot_id>
```

### 5. 🗑️ Clean up old snapshots:
```bash
python snapshot_manager.py -c
```

## 💡 Pro Tips:
- Snapshots can take 30+ minutes to process
- Use the snapshot manager to monitor progress
- Local records are automatically saved for each submission
- Download URLs may require delivery job initiation
- Check the data/downloads/ folder for your data files


# 🎉 Workflow Summary

## 🎯 Complete Workflow Summary

### ✅ What we accomplished:
1. 📊 Initialized dataset connections with built-in filter fields
2. 🔍 Created complex database queries using intuitive syntax
3. 📤 Submitted queries to BrightData API
4. ⏳ Monitored snapshot processing status
5. 📥 Downloaded snapshot data (when ready)
6. 📊 Analyzed and viewed downloaded data
7. 🛠️ Learned about snapshot management tools

## 🚀 Key Benefits of the BrightData System:
- **Unified API**: `dataset.filter.field` syntax
- **Automatic API key loading** from `secrets.yaml`
- **Local record management** for all submissions
- **Multiple download methods** (direct + deliver)
- **Comprehensive snapshot monitoring**
- **Type-safe field validation**
- **Support for multiple datasets**

## 📚 Next Steps:
- Try different filter combinations
- Experiment with other datasets (Shopee, Amazon-Walmart)
- Use the snapshot manager for production workflows
- Check the README.md for advanced features

## 🎊 Happy data querying with BrightData!
