In [1]:
# %%
import requests
import json
import os
from dotenv import load_dotenv
import time
from typing import Dict, Any, Optional, Literal
import pandas as pd
from urllib.parse import urlparse

# Load environment variables
load_dotenv()

brightdata_api_key = os.getenv("BRIGHTDATA_API_KEY")
if not brightdata_api_key:
    raise ValueError("BRIGHTDATA_API_KEY environment variable is required. Please check your .env file.")


def get_snapshot_output(snapshot_id: str, api_key: str, max_retries: int = 40, wait_time: int = 5) -> dict:
    """
    Retrieves snapshot output from Bright Data API with automatic retry logic.
    
    Args:
        snapshot_id: The snapshot ID returned from triggering the dataset
        api_key: Bright Data API key
        max_retries: Maximum number of retry attempts (default: 5)
        wait_time: Wait time in seconds between retries (default: 3)

    Returns:
        dict: The extracted job data once ready
        
    Raises:
        TimeoutError: If snapshot is not ready after max_retries
        requests.RequestException: If API request fails
    """
    url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
    headers = {
        "Authorization": f"Bearer {api_key}"
    }
    params = {
        "format": "json"
    }
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}/{max_retries}: Checking snapshot status...")
            response = requests.get(url, headers=headers, params=params)
            response.raise_for_status()  # Raise exception for HTTP errors
            
            data = response.json()

            # Status values that indicate the snapshot is still processing
            processing_statuses = {"building", "running", "pending", "queued", "STATUS"}
            
            # Check if snapshot is still running
            if isinstance(data, dict) and data.get("status") in processing_statuses:
                print(f"Snapshot still processing. Waiting {wait_time} seconds...")
                if attempt < max_retries - 1:  # Don't sleep on the last attempt
                    time.sleep(wait_time)
                continue
            
            # Data is ready - return the parsed response
            print("Snapshot ready! Data retrieved successfully.")
            return json.dumps(data, indent=2, ensure_ascii=False)

        except requests.RequestException as e:
            print(f"API request failed on attempt {attempt + 1}: {e}")
            if attempt < max_retries - 1:
                time.sleep(wait_time)
                continue
            raise
    
    # If we've exhausted all retries
    raise TimeoutError(f"Snapshot {snapshot_id} was not ready after {max_retries} attempts ({max_retries * wait_time} seconds)")

get_snapshot_output("s_mec8sma316swdsaaoh", brightdata_api_key)

Attempt 1/40: Checking snapshot status...
API request failed on attempt 1: HTTPSConnectionPool(host='api.brightdata.com', port=443): Max retries exceeded with url: /datasets/v3/snapshot/s_mec8sma316swdsaaoh?format=json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002CADC29EF90>, 'Connection to api.brightdata.com timed out. (connect timeout=None)'))
Attempt 2/40: Checking snapshot status...


KeyboardInterrupt: 