From feafc3f90c0f2ba3d03dc291e142f9a65d868199 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:12:22 +0200
Subject: [PATCH 01/70] Create client.ts
---
src/client.ts | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/client.ts
diff --git a/src/client.ts b/src/client.ts
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/client.ts
@@ -0,0 +1 @@
+
From 292df2dfaa889bf73eec7857593638de5653e589 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:13:46 +0200
Subject: [PATCH 02/70] Add files via upload
---
src/__init__.py | 82 +++++
src/client.py | 897 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 979 insertions(+)
create mode 100644 src/__init__.py
create mode 100644 src/client.py
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..c815a6c
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,82 @@
+"""
+## Bright Data SDK for Python
+
+A comprehensive SDK for Bright Data's Web Scraping and SERP APIs, providing
+easy-to-use methods for web scraping, search engine result parsing, and data management.
+First import the package and create a client:
+```python
+from brightdata import bdclient
+client = bdclient("your-api-key")
+```
+## Functions:
+Then use the client to call the desired functions:
+#### scrape()
+- Scrapes a website using Bright Data Web Unlocker API with proxy support (or multiple websites in a single call)
+- syntax: `results = client.scrape(url, country, max_workers, ...)`
+#### scrape_linkedin
+- Scrapes LinkedIn data including posts, jobs, companies, and profiles, and returns structured data
+- syntax: `results = client.scrape_linkedin.posts()/jobs()/companies()/profiles() # insert parameters per function`
+#### search()
+- Performs web searches using Bright Data SERP API with customizable search engines (or multiple search queries in a single call)
+- syntax: `results = client.search(query, search_engine, country, ...)`
+#### search_linkedin
+- Searches LinkedIn for specific posts, jobs, and profiles, and returns the relevant data
+- syntax: `results = client.search_linkedin.posts()/jobs()/profiles() # insert parameters per function`
+#### search_chatGPT()
+- Interact with ChatGPT using Bright Data's ChatGPT API, sending prompts and receiving responses
+- syntax: `results = client.search_chatGPT(prompt, additional_prompt, max_workers, ...)`
+#### download_content() / download_snapshot()
+- Saves scraped content (`download_content`) or a completed snapshot (`download_snapshot`) to local files in various formats (JSON, CSV, etc.)
+- syntax: `client.download_content(results)`
+- syntax: `client.download_snapshot(snapshot_id)`
+#### connect_browser()
+- Get WebSocket endpoint for connecting to Bright Data's scraping browser with Playwright/Selenium
+- syntax: `endpoint_url = client.connect_browser()` then use with browser automation tools
+#### crawl()
+- Crawl websites to discover and scrape multiple pages using Bright Data's Web Crawl API
+- syntax: `result = client.crawl(url, filter, exclude_filter, depth, ...)`
+#### parse_content()
+- Parse and extract useful information from API responses (JSON or HTML)
+- syntax: `parsed = client.parse_content(data, extract_text=True, extract_links=True)`
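+
+For example, a minimal end-to-end sketch (the URL is illustrative and assumes a valid API token):
+```python
+results = client.scrape("https://example.com")
+parsed = client.parse_content(results, extract_text=True)
+client.download_content(results)
+```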
+
+### Features:
+- Web Scraping: Scrape websites using Bright Data Web Unlocker API with proxy support
+- Search Engine Results: Perform web searches using Bright Data SERP API
+- Web Crawling: Discover and scrape multiple pages from websites with advanced filtering
+- Content Parsing: Extract text, links, images, and structured data from API responses
+- Browser Automation: Simple authentication for Bright Data's scraping browser with Playwright/Selenium
+- Multiple Search Engines: Support for Google, Bing, and Yandex
+- Parallel Processing: Concurrent processing for multiple URLs or queries
+- Robust Error Handling: Comprehensive error handling with retry logic
+- Input Validation: Automatic validation of URLs, zone names, and parameters
+- Zone Management: Automatic zone creation and management
+- Multiple Output Formats: JSON, raw HTML, markdown, and more
+"""
+
+from .client import bdclient
+from .exceptions import (
+ BrightDataError,
+ ValidationError,
+ AuthenticationError,
+ ZoneError,
+ NetworkError,
+ APIError
+)
+from .utils import parse_content, parse_multiple, extract_structured_data
+
+__version__ = "1.1.3"
+__author__ = "Bright Data"
+__email__ = "support@brightdata.com"
+
+__all__ = [
+ 'bdclient',
+ 'BrightDataError',
+ 'ValidationError',
+ 'AuthenticationError',
+ 'ZoneError',
+ 'NetworkError',
+ 'APIError',
+ 'parse_content',
+ 'parse_multiple',
+ 'extract_structured_data'
+]
\ No newline at end of file
diff --git a/src/client.py b/src/client.py
new file mode 100644
index 0000000..b148792
--- /dev/null
+++ b/src/client.py
@@ -0,0 +1,897 @@
+import os
+import json
+import requests
+from datetime import datetime
+from typing import Union, Dict, Any, List
+
+from .api import WebScraper, SearchAPI
+from .api.chatgpt import ChatGPTAPI
+from .api.linkedin import LinkedInAPI, LinkedInScraper, LinkedInSearcher
+from .api.download import DownloadAPI
+from .api.crawl import CrawlAPI
+from .api.extract import ExtractAPI
+from .utils import ZoneManager, setup_logging, get_logger, parse_content
+from .exceptions import ValidationError, AuthenticationError, APIError
+
+def _get_version():
+ """Get version from __init__.py, cached at module import time."""
+ try:
+ import os
+ init_file = os.path.join(os.path.dirname(__file__), '__init__.py')
+ with open(init_file, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.startswith('__version__'):
+ return line.split('"')[1]
+ except (OSError, IndexError):
+ pass
+ return "unknown"
+
+__version__ = _get_version()
+
+logger = get_logger('client')
+
+
+class bdclient:
+ """Main client for the Bright Data SDK"""
+
+ DEFAULT_MAX_WORKERS = 10
+ DEFAULT_TIMEOUT = 65
+ CONNECTION_POOL_SIZE = 20
+ MAX_RETRIES = 3
+ RETRY_BACKOFF_FACTOR = 1.5
+ RETRY_STATUSES = {429, 500, 502, 503, 504}
+
+ def __init__(
+ self,
+ api_token: str = None,
+ auto_create_zones: bool = True,
+ web_unlocker_zone: str = None,
+ serp_zone: str = None,
+ browser_zone: str = None,
+ browser_username: str = None,
+ browser_password: str = None,
+ browser_type: str = "playwright",
+ log_level: str = "INFO",
+ structured_logging: bool = True,
+ verbose: bool = None
+ ):
+ """
+ Initialize the Bright Data client with your API token
+
+ Create an account at https://brightdata.com/ to get your API token.
+ Go to Settings > API keys and verify that your API key has "Admin" permissions.
+
+ Args:
+ api_token: Your Bright Data API token (can also be set via BRIGHTDATA_API_TOKEN env var)
+ auto_create_zones: Automatically create required zones if they don't exist (default: True)
+ web_unlocker_zone: Custom zone name for web unlocker (default: from env or 'sdk_unlocker')
+ serp_zone: Custom zone name for SERP API (default: from env or 'sdk_serp')
+ browser_zone: Custom zone name for Browser API (default: from env or 'sdk_browser')
+ browser_username: Username for Browser API in format "username-zone-{zone_name}" (can also be set via BRIGHTDATA_BROWSER_USERNAME env var)
+ browser_password: Password for Browser API authentication (can also be set via BRIGHTDATA_BROWSER_PASSWORD env var)
+ browser_type: Browser automation tool type - "playwright", "puppeteer", or "selenium" (default: "playwright")
+ log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+ structured_logging: Whether to use structured JSON logging (default: True)
+ verbose: Enable verbose logging (default: False). Can also be set via BRIGHTDATA_VERBOSE env var.
+ When False, only shows WARNING and above. When True, shows all logs per log_level.
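+
+ Example (a minimal sketch; the token value is a placeholder):
+ ```python
+ client = bdclient(api_token="your_api_token")
+ # or rely on the BRIGHTDATA_API_TOKEN environment variable
+ client = bdclient()
+ ```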
+ """
+ try:
+ from dotenv import load_dotenv
+ load_dotenv()
+ except ImportError:
+ pass
+
+ if verbose is None:
+ env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower()
+ verbose = env_verbose in ('true', '1', 'yes', 'on')
+
+ setup_logging(log_level, structured_logging, verbose)
+ logger.info("Initializing Bright Data SDK client")
+
+ self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN')
+ if not self.api_token:
+ logger.error("API token not provided")
+ raise ValidationError("API token is required. Provide it as parameter or set BRIGHTDATA_API_TOKEN environment variable")
+
+ if not isinstance(self.api_token, str):
+ logger.error("API token must be a string")
+ raise ValidationError("API token must be a string")
+
+ if len(self.api_token.strip()) < 10:
+ logger.error("API token appears to be invalid (too short)")
+ raise ValidationError("API token appears to be invalid")
+
+ token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" if len(self.api_token) > 8 else "***"
+ logger.info(f"API token validated successfully: {token_preview}")
+
+ self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker')
+ self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp')
+ self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser')
+ self.auto_create_zones = auto_create_zones
+
+ self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME')
+ self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD')
+
+
+
+ valid_browser_types = ["playwright", "puppeteer", "selenium"]
+ if browser_type not in valid_browser_types:
+ raise ValidationError(f"Invalid browser_type '{browser_type}'. Must be one of: {valid_browser_types}")
+ self.browser_type = browser_type
+
+ if self.browser_username and self.browser_password:
+ browser_preview = f"{self.browser_username[:3]}***"
+ logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})")
+ elif self.browser_username or self.browser_password:
+ logger.warning("Incomplete browser credentials: both username and password are required for browser API")
+ else:
+ logger.debug("No browser credentials provided - browser API will not be available")
+
+ self.session = requests.Session()
+
+ auth_header = f'Bearer {self.api_token}'
+ self.session.headers.update({
+ 'Authorization': auth_header,
+ 'Content-Type': 'application/json',
+ 'User-Agent': f'brightdata-sdk/{__version__}'
+ })
+
+ logger.info("HTTP session configured with secure headers")
+
+ adapter = requests.adapters.HTTPAdapter(
+ pool_connections=self.CONNECTION_POOL_SIZE,
+ pool_maxsize=self.CONNECTION_POOL_SIZE,
+ max_retries=0
+ )
+ self.session.mount('https://', adapter)
+ self.session.mount('http://', adapter)
+
+ self.zone_manager = ZoneManager(self.session)
+ self.web_scraper = WebScraper(
+ self.session,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+ self.search_api = SearchAPI(
+ self.session,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+ self.chatgpt_api = ChatGPTAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+ self.linkedin_api = LinkedInAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+ self.download_api = DownloadAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT
+ )
+ self.crawl_api = CrawlAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+ self.extract_api = ExtractAPI(self)
+
+ if self.auto_create_zones:
+ self.zone_manager.ensure_required_zones(
+ self.web_unlocker_zone,
+ self.serp_zone
+ )
+
+ def scrape(
+ self,
+ url: Union[str, List[str]],
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "html",
+ async_request: bool = False,
+ max_workers: int = None,
+ timeout: int = None
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ ## Unlock and scrape websites using Bright Data Web Unlocker API
+
+ Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single URL string or list of URLs to scrape
+ - `zone` (str, optional): Zone identifier (default: auto-configured web_unlocker_zone)
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+ - `country` (str, optional): Two-letter ISO country code for proxy location (defaults to fastest connection)
+ - `data_format` (str, optional): Additional format transformation (default: `"html"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `65`)
+
+ ### Returns:
+ - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL
+
+ ### Example Usage:
+ ```python
+ # Single URL scraping
+ result = client.scrape(
+ url="https://example.com",
+ response_format="json"
+ )
+
+ # Multiple URLs scraping
+ urls = ["https://site1.com", "https://site2.com"]
+ results = client.scrape(
+ url=urls,
+ response_format="raw",
+ max_workers=5
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid URL format or empty URL list
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+ zone = zone or self.web_unlocker_zone
+ max_workers = max_workers or self.DEFAULT_MAX_WORKERS
+
+ return self.web_scraper.scrape(
+ url, zone, response_format, method, country, data_format,
+ async_request, max_workers, timeout
+ )
+
+ def search(
+ self,
+ query: Union[str, List[str]],
+ search_engine: str = "google",
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "html",
+ async_request: bool = False,
+ max_workers: int = None,
+ timeout: int = None,
+ parse: bool = False
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ ## Search the web using Bright Data SERP API
+
+ Performs web searches through major search engines using Bright Data's proxy network
+ for reliable, bot-detection-free results.
+
+ ### Parameters:
+ - `query` (str | List[str]): Search query string or list of search queries
+ - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`)
+ - `zone` (str, optional): Zone identifier (default: auto-configured serp_zone)
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+ - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`)
+ - `data_format` (str, optional): Additional format transformation (default: `"html"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `65`)
+ - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`)
+
+ ### Returns:
+ - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query
+
+ ### Example Usage:
+ ```python
+ # Single search query
+ result = client.search(
+ query="best laptops 2024",
+ search_engine="google",
+ response_format="json"
+ )
+
+ # Multiple search queries
+ queries = ["python tutorials", "machine learning courses", "web development"]
+ results = client.search(
+ query=queries,
+ search_engine="bing",
+ max_workers=3
+ )
+ ```
+
+ ### Supported Search Engines:
+ - `"google"` - Google Search
+ - `"bing"` - Microsoft Bing
+ - `"yandex"` - Yandex Search
+
+ ### Raises:
+ - `ValidationError`: Invalid search engine, empty query, or validation errors
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+ zone = zone or self.serp_zone
+ max_workers = max_workers or self.DEFAULT_MAX_WORKERS
+
+ return self.search_api.search(
+ query, search_engine, zone, response_format, method, country,
+ data_format, async_request, max_workers, timeout, parse
+ )
+
+ def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str:
+ """
+ ## Download content to a file based on its format
+
+ ### Args:
+ content: The content to download (dict for JSON, string for other formats)
+ filename: Optional filename. If not provided, generates one with timestamp
+ format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt")
+ parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False)
+
+ ### Returns:
+ Path to the downloaded file
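+
+ ### Example Usage:
+ A minimal sketch (assumes `results` came from a prior `client.scrape()` call):
+ ```python
+ results = client.scrape("https://example.com", response_format="json")
+ path = client.download_content(results, filename="results.json", format="json")
+ print(f"Saved to {path}")
+ ```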
+ """
+ return self.download_api.download_content(content, filename, format, parse)
+
+
+ def search_chatGPT(
+ self,
+ prompt: Union[str, List[str]],
+ country: Union[str, List[str]] = "",
+ additional_prompt: Union[str, List[str]] = "",
+ web_search: Union[bool, List[bool]] = False,
+ sync: bool = True
+ ) -> Dict[str, Any]:
+ """
+ ## Search ChatGPT responses using Bright Data's ChatGPT dataset API
+
+ Sends one or multiple prompts to ChatGPT through Bright Data's proxy network
+ with support for both synchronous and asynchronous processing.
+
+ ### Parameters:
+ - `prompt` (str | List[str]): Single prompt string or list of prompts to send to ChatGPT
+ - `country` (str | List[str], optional): Two-letter ISO country code(s) for proxy location (default: "")
+ - `additional_prompt` (str | List[str], optional): Follow-up prompt(s) after receiving the first answer (default: "")
+ - `web_search` (bool | List[bool], optional): Whether to click the web search button in ChatGPT (default: False)
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns ChatGPT response data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Single prompt (synchronous - returns data immediately)
+ result = client.search_chatGPT(prompt="Top hotels in New York")
+
+ # Multiple prompts (synchronous - returns data immediately)
+ result = client.search_chatGPT(
+ prompt=["Top hotels in New York", "Best restaurants in Paris", "Tourist attractions in Tokyo"],
+ additional_prompt=["Are you sure?", "", "What about hidden gems?"]
+ )
+
+ # Asynchronous with web search enabled (returns snapshot_id)
+ result = client.search_chatGPT(
+ prompt="Latest AI developments",
+ web_search=True,
+ sync=False
+ )
+ # Snapshot ID is automatically printed for async requests
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid prompt or parameters
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+ if isinstance(prompt, str):
+ prompts = [prompt]
+ else:
+ prompts = prompt
+
+ if not prompts or len(prompts) == 0:
+ raise ValidationError("At least one prompt is required")
+
+ for p in prompts:
+ if not p or not isinstance(p, str):
+ raise ValidationError("All prompts must be non-empty strings")
+
+ def normalize_param(param, param_name):
+ if isinstance(param, list):
+ if len(param) != len(prompts):
+ raise ValidationError(f"{param_name} list must have same length as prompts list")
+ return param
+ else:
+ return [param] * len(prompts)
+
+ countries = normalize_param(country, "country")
+ additional_prompts = normalize_param(additional_prompt, "additional_prompt")
+ web_searches = normalize_param(web_search, "web_search")
+
+ for c in countries:
+ if not isinstance(c, str):
+ raise ValidationError("All countries must be strings")
+
+ for ap in additional_prompts:
+ if not isinstance(ap, str):
+ raise ValidationError("All additional_prompts must be strings")
+
+ for ws in web_searches:
+ if not isinstance(ws, bool):
+ raise ValidationError("All web_search values must be booleans")
+
+ return self.chatgpt_api.scrape_chatgpt(
+ prompts,
+ countries,
+ additional_prompts,
+ web_searches,
+ sync,
+ self.DEFAULT_TIMEOUT
+ )
+
+ @property
+ def scrape_linkedin(self):
+ """
+ ## LinkedIn Data Scraping Interface
+
+ Provides specialized methods for scraping different types of LinkedIn data
+ using Bright Data's collect API with pre-configured dataset IDs.
+
+ ### Available Methods:
+ - `profiles(url)` - Scrape LinkedIn profile data
+ - `companies(url)` - Scrape LinkedIn company data
+ - `jobs(url)` - Scrape LinkedIn job listing data
+ - `posts(url)` - Scrape LinkedIn post content
+
+ ### Example Usage:
+ ```python
+ # Scrape LinkedIn profiles
+ result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/username/")
+
+ # Scrape multiple companies
+ companies = [
+ "https://www.linkedin.com/company/ibm",
+ "https://www.linkedin.com/company/bright-data"
+ ]
+ result = client.scrape_linkedin.companies(companies)
+
+ # Scrape job listings
+ result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/123456/")
+
+ # Scrape posts
+ result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/")
+ ```
+
+ ### Returns:
+ Each method returns a `Dict[str, Any]` containing snapshot_id and metadata for tracking the request.
+ Use the snapshot_id with `download_snapshot()` to retrieve the collected data.
+ """
+ if not hasattr(self, '_linkedin_scraper'):
+ self._linkedin_scraper = LinkedInScraper(self.linkedin_api)
+ return self._linkedin_scraper
+
+ @property
+ def search_linkedin(self):
+ """
+ ## LinkedIn Data Search Interface
+
+ Provides specialized methods for discovering new LinkedIn data by various search criteria
+ using Bright Data's collect API with pre-configured dataset IDs.
+
+ ### Available Methods:
+ - `profiles(first_name, last_name)` - Search LinkedIn profiles by name
+ - `jobs(url=..., location=...)` - Search LinkedIn jobs by URL or keyword criteria
+ - `posts(profile_url=..., company_url=..., url=...)` - Search LinkedIn posts by various methods
+
+ ### Example Usage:
+ ```python
+ # Search profiles by name
+ result = client.search_linkedin.profiles("James", "Smith")
+
+ # Search jobs by location and keywords
+ result = client.search_linkedin.jobs(
+ location="Paris",
+ keyword="product manager",
+ country="FR"
+ )
+
+ # Search posts by profile URL with date range
+ result = client.search_linkedin.posts(
+ profile_url="https://www.linkedin.com/in/username",
+ start_date="2018-04-25T00:00:00.000Z",
+ end_date="2021-05-25T00:00:00.000Z"
+ )
+ ```
+
+ ### Returns:
+ Each method returns a `Dict[str, Any]` containing snapshot_id (async) or direct data (sync) for tracking the request.
+ Use the snapshot_id with `download_snapshot()` to retrieve the collected data.
+ """
+ if not hasattr(self, '_linkedin_searcher'):
+ self._linkedin_searcher = LinkedInSearcher(self.linkedin_api)
+ return self._linkedin_searcher
+
+ def download_snapshot(
+ self,
+ snapshot_id: str,
+ format: str = "json",
+ compress: bool = False,
+ batch_size: int = None,
+ part: int = None
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]:
+ """
+ ## Download snapshot content from Bright Data dataset API
+
+ Downloads the snapshot content using the snapshot ID returned from search_chatGPT()
+ or other dataset collection triggers.
+
+ ### Parameters:
+ - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required)
+ - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json")
+ - `compress` (bool, optional): Whether the result should be compressed (default: False)
+ - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000)
+ - `part` (int, optional): If batch_size provided, specify which part to download
+
+ ### Returns:
+ - `Union[Dict, List, str]`: Snapshot data in the requested format, OR
+ - `Dict`: Status response if snapshot is not ready yet (status="not_ready")
+
+ ### Example Usage:
+ ```python
+ # Download complete snapshot
+ result = client.download_snapshot("s_m4x7enmven8djfqak")
+
+ # Check if snapshot is ready
+ if isinstance(result, dict) and result.get('status') == 'not_ready':
+ print(f"Not ready: {result['message']}")
+ # Try again later
+ else:
+ # Snapshot data is ready
+ data = result
+
+ # Download as CSV format
+ csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv")
+ ```
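+
+ A simple polling sketch (the 30-second interval and the snapshot ID are illustrative):
+ ```python
+ import time
+
+ result = client.download_snapshot("s_m4x7enmven8djfqak")
+ while isinstance(result, dict) and result.get('status') == 'not_ready':
+     time.sleep(30)
+     result = client.download_snapshot("s_m4x7enmven8djfqak")
+ ```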
+
+ ### Raises:
+ - `ValidationError`: Invalid parameters or snapshot_id format
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed, snapshot not found, or server error
+ """
+ return self.download_api.download_snapshot(snapshot_id, format, compress, batch_size, part)
+
+
+ def list_zones(self) -> List[Dict[str, Any]]:
+ """
+ ## List all active zones in your Bright Data account
+
+ ### Returns:
+ List of zone dictionaries with their configurations
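+
+ ### Example Usage:
+ A short sketch (the printed fields depend on your account's zone configuration):
+ ```python
+ for zone in client.list_zones():
+     print(zone)
+ ```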
+ """
+ return self.zone_manager.list_zones()
+
+ def connect_browser(self) -> str:
+ """
+ ## Get WebSocket endpoint URL for connecting to Bright Data's scraping browser
+
+ Returns the WebSocket endpoint URL that can be used with Playwright or Selenium
+ to connect to Bright Data's scraping browser service.
+
+ ### Returns:
+ WebSocket endpoint URL string for browser connection
+
+ ### Example Usage:
+ ```python
+ # For Playwright (default)
+ client = bdclient(
+ api_token="your_token",
+ browser_username="username-zone-browser_zone1",
+ browser_password="your_password",
+ browser_type="playwright"  # default; also used for Puppeteer
+ )
+ endpoint_url = client.connect_browser() # Returns: wss://...@brd.superproxy.io:9222
+
+ # For Selenium
+ client = bdclient(
+ api_token="your_token",
+ browser_username="username-zone-browser_zone1",
+ browser_password="your_password",
+ browser_type="selenium"
+ )
+ endpoint_url = client.connect_browser() # Returns: https://...@brd.superproxy.io:9515
+ ```
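+
+ A connection sketch (assumes Playwright is installed; `connect_over_cdp` is Playwright's standard CDP entry point):
+ ```python
+ from playwright.sync_api import sync_playwright
+
+ endpoint_url = client.connect_browser()
+ with sync_playwright() as p:
+     browser = p.chromium.connect_over_cdp(endpoint_url)
+     page = browser.new_page()
+     page.goto("https://example.com")
+     print(page.title())
+ ```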
+
+ ### Raises:
+ - `ValidationError`: Browser credentials not provided or invalid
+ - `AuthenticationError`: Invalid browser credentials
+ """
+ if not self.browser_username or not self.browser_password:
+ logger.error("Browser credentials not configured")
+ raise ValidationError(
+ "Browser credentials are required. Provide browser_username and browser_password "
+ "parameters or set BRIGHTDATA_BROWSER_USERNAME and BRIGHTDATA_BROWSER_PASSWORD "
+ "environment variables."
+ )
+
+ if not isinstance(self.browser_username, str) or not isinstance(self.browser_password, str):
+ logger.error("Browser credentials must be strings")
+ raise ValidationError("Browser username and password must be strings")
+
+ if len(self.browser_username.strip()) == 0 or len(self.browser_password.strip()) == 0:
+ logger.error("Browser credentials cannot be empty")
+ raise ValidationError("Browser username and password cannot be empty")
+
+ auth_string = f"{self.browser_username}:{self.browser_password}"
+
+ if self.browser_type == "selenium":
+ endpoint_url = f"https://{auth_string}@brd.superproxy.io:9515"
+ logger.debug(f"Browser endpoint URL: https://***:***@brd.superproxy.io:9515")
+ else:
+ endpoint_url = f"wss://{auth_string}@brd.superproxy.io:9222"
+ logger.debug(f"Browser endpoint URL: wss://***:***@brd.superproxy.io:9222")
+
+ logger.info(f"Generated {self.browser_type} connection endpoint for user: {self.browser_username[:3]}***")
+
+ return endpoint_url
+
+ def crawl(
+ self,
+ url: Union[str, List[str]],
+ ignore_sitemap: bool = None,
+ depth: int = None,
+ filter: str = None,
+ exclude_filter: str = None,
+ custom_output_fields: List[str] = None,
+ include_errors: bool = True
+ ) -> Dict[str, Any]:
+ """
+ ## Crawl websites using Bright Data's Web Crawl API
+
+ Performs web crawling to discover and scrape multiple pages from a website
+ starting from the specified URL(s). Returns a snapshot_id for tracking the crawl progress.
+
+ ### Parameters:
+ - `url` (str | List[str]): Domain URL(s) to crawl (required)
+ - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling
+ - `depth` (int, optional): Maximum depth to crawl relative to the entered URL
+ - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/")
+ - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/")
+ - `custom_output_fields` (List[str], optional): Custom output schema fields to include
+ - `include_errors` (bool, optional): Include errors in response (default: True)
+
+ ### Returns:
+ - `Dict[str, Any]`: Crawl response with snapshot_id for tracking
+
+ ### Example Usage:
+ ```python
+ # Single URL crawl
+ result = client.crawl("https://example.com/")
+ snapshot_id = result['snapshot_id']
+
+ # Multiple URLs with filters
+ urls = ["https://example.com/", "https://example2.com/"]
+ result = client.crawl(
+ url=urls,
+ filter="/product/",
+ exclude_filter="/ads/",
+ depth=2,
+ ignore_sitemap=True
+ )
+
+ # Custom output schema
+ result = client.crawl(
+ url="https://example.com/",
+ custom_output_fields=["markdown", "url", "page_title"]
+ )
+
+ # Download results using snapshot_id
+ data = client.download_snapshot(result['snapshot_id'])
+ ```
+
+ ### Available Output Fields:
+ - `markdown` - Page content in markdown format
+ - `url` - Page URL
+ - `html2text` - Page content as plain text
+ - `page_html` - Raw HTML content
+ - `ld_json` - Structured data (JSON-LD)
+ - `page_title` - Page title
+ - `timestamp` - Crawl timestamp
+ - `input` - Input parameters used
+ - `discovery_input` - Discovery parameters
+ - `error` - Error information (if any)
+ - `error_code` - Error code (if any)
+ - `warning` - Warning information (if any)
+ - `warning_code` - Warning code (if any)
+
+ ### Raises:
+ - `ValidationError`: Invalid URL or parameters
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+ return self.crawl_api.crawl(
+ url=url,
+ ignore_sitemap=ignore_sitemap,
+ depth=depth,
+ filter=filter,
+ exclude_filter=exclude_filter,
+ custom_output_fields=custom_output_fields,
+ include_errors=include_errors
+ )
+
+ def parse_content(
+ self,
+ data: Union[str, Dict, List],
+ extract_text: bool = True,
+ extract_links: bool = False,
+ extract_images: bool = False
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+ """
+ ## Parse content from API responses
+
+ Extract and parse useful information from scraping, search, or crawling results.
+ Automatically detects and handles both single and multiple results from batch operations.
+
+ ### Parameters:
+ - `data` (str | Dict | List): Response data from scrape(), search(), or crawl() methods
+ - `extract_text` (bool, optional): Extract clean text content (default: True)
+ - `extract_links` (bool, optional): Extract all links from content (default: False)
+ - `extract_images` (bool, optional): Extract image URLs from content (default: False)
+
+ ### Returns:
+ - `Dict[str, Any]`: Parsed content for single results
+ - `List[Dict[str, Any]]`: List of parsed content for multiple results (auto-detected)
+
+ ### Example Usage:
+ ```python
+ # Parse single URL results
+ scraped_data = client.scrape("https://example.com")
+ parsed = client.parse_content(scraped_data, extract_text=True, extract_links=True)
+ print(f"Title: {parsed['title']}")
+
+ # Parse multiple URL results (auto-detected)
+ scraped_data = client.scrape(["https://example1.com", "https://example2.com"])
+ parsed_list = client.parse_content(scraped_data, extract_text=True)
+ for result in parsed_list:
+ print(f"Title: {result['title']}")
+ ```
+
+ ### Available Fields in Each Result:
+ - `type`: 'json' or 'html' - indicates the source data type
+ - `text`: Cleaned text content (if extract_text=True)
+ - `links`: List of {'url': str, 'text': str} objects (if extract_links=True)
+ - `images`: List of {'url': str, 'alt': str} objects (if extract_images=True)
+ - `title`: Page title (if available)
+ - `raw_length`: Length of original content
+ - `structured_data`: Original JSON data (if type='json')
+ """
+ return parse_content(
+ data=data,
+ extract_text=extract_text,
+ extract_links=extract_links,
+ extract_images=extract_images
+ )
+
+ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> str:
+ """
+ ## Extract specific information from websites using AI
+
+ Combines web scraping with OpenAI's language models to extract targeted information
+ from web pages based on natural language queries. Automatically parses URLs and
+ optimizes content for efficient LLM processing.
+
+ ### Parameters:
+ - `query` (str): Natural language query describing what to extract. If `url` parameter is provided,
+ this becomes the pure extraction query. If `url` is not provided, this should include
+ the URL (e.g. "extract the most recent news from cnn.com")
+ - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction
+ from query and sends these URLs to the web unlocker API
+ - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response.
+ Uses OpenAI's Structured Outputs for reliable type-safe responses.
+ Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"]}
+ - `llm_key` (str, optional): OpenAI API key. If not provided, uses OPENAI_API_KEY env variable
+
+ ### Returns:
+ - `str`: Extracted content (also provides access to metadata via attributes)
+
+ ### Example Usage:
+ ```python
+ # Using URL parameter with structured output (new)
+ result = client.extract(
+ query="extract the most recent news headlines",
+ url="https://cnn.com",
+ output_scheme={
+ "type": "object",
+ "properties": {
+ "headlines": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "title": {"type": "string"},
+ "date": {"type": "string"}
+ },
+ "required": ["title", "date"]
+ }
+ }
+ },
+ "required": ["headlines"]
+ }
+ )
+ print(result) # Prints the extracted news content
+
+ # Using URL in query (original behavior)
+ result = client.extract("extract the most recent news from cnn.com")
+
+ # Multiple URLs with structured schema
+ result = client.extract(
+ query="extract main headlines",
+ url=["https://cnn.com", "https://bbc.com"],
+ output_scheme={
+ "type": "object",
+ "properties": {
+ "sources": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "source_name": {"type": "string"},
+ "headlines": {"type": "array", "items": {"type": "string"}}
+ },
+ "required": ["source_name", "headlines"]
+ }
+ }
+ },
+ "required": ["sources"]
+ }
+ )
+
+ # Access metadata attributes
+ print(f"Source: {result.url}")
+ print(f"Title: {result.source_title}")
+ print(f"Tokens used: {result.token_usage['total_tokens']}")
+
+ # Use with custom OpenAI key
+ result = client.extract(
+ query="get the price and description",
+ url="https://amazon.com/dp/B079QHML21",
+ llm_key="your-openai-api-key"
+ )
+ ```
+
+ ### Environment Variable Setup:
+ ```bash
+ # Set in .env file
+ OPENAI_API_KEY=your-openai-api-key
+ ```
+
+ ### Available Attributes:
+ ```python
+ result = client.extract("extract news from cnn.com")
+
+ # String value (default behavior)
+ str(result) # Extracted content
+
+ # Metadata attributes
+ result.query # 'extract news'
+ result.url # 'https://www.cnn.com'
+ result.source_title # 'CNN - Breaking News...'
+ result.content_length # 1234
+ result.token_usage # {'total_tokens': 2998, ...}
+ result.success # True
+ result.metadata # Full metadata dictionary
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid query format, missing URL, or invalid LLM key
+ - `APIError`: Web scraping failed or LLM processing error
+ """
+ return self.extract_api.extract(query, url, output_scheme, llm_key)
\ No newline at end of file
From cc78cc0c18fde28e3c2222f7702ca791bb03913b Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:14:03 +0200
Subject: [PATCH 03/70] Delete src/client.ts
---
src/client.ts | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 src/client.ts
diff --git a/src/client.ts b/src/client.ts
deleted file mode 100644
index 8b13789..0000000
--- a/src/client.ts
+++ /dev/null
@@ -1 +0,0 @@
-
From 94cce094c221c5168796ca3890d1c52e95ee5bfc Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:14:58 +0200
Subject: [PATCH 04/70] Create tst
---
src/api/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/api/tst
diff --git a/src/api/tst b/src/api/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/api/tst
@@ -0,0 +1 @@
+
From db06632ab8569fb6860837d17d41265be3c3dce2 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:15:23 +0200
Subject: [PATCH 05/70] Add files via upload
---
src/api/__init__.py | 13 +
src/api/chatgpt.py | 126 +++++++
src/api/crawl.py | 175 ++++++++++
src/api/download.py | 265 +++++++++++++++
src/api/extract.py | 419 +++++++++++++++++++++++
src/api/linkedin.py | 803 ++++++++++++++++++++++++++++++++++++++++++++
src/api/scraper.py | 205 +++++++++++
src/api/search.py | 212 ++++++++++++
8 files changed, 2218 insertions(+)
create mode 100644 src/api/__init__.py
create mode 100644 src/api/chatgpt.py
create mode 100644 src/api/crawl.py
create mode 100644 src/api/download.py
create mode 100644 src/api/extract.py
create mode 100644 src/api/linkedin.py
create mode 100644 src/api/scraper.py
create mode 100644 src/api/search.py
diff --git a/src/api/__init__.py b/src/api/__init__.py
new file mode 100644
index 0000000..a79c0fd
--- /dev/null
+++ b/src/api/__init__.py
@@ -0,0 +1,13 @@
+from .scraper import WebScraper
+from .search import SearchAPI
+from .chatgpt import ChatGPTAPI
+from .linkedin import LinkedInAPI
+from .crawl import CrawlAPI
+
+__all__ = [
+ 'WebScraper',
+ 'SearchAPI',
+ 'ChatGPTAPI',
+ 'LinkedInAPI',
+ 'CrawlAPI'
+]
\ No newline at end of file
diff --git a/src/api/chatgpt.py b/src/api/chatgpt.py
new file mode 100644
index 0000000..e9edb90
--- /dev/null
+++ b/src/api/chatgpt.py
@@ -0,0 +1,126 @@
+import json
+import requests
+from typing import Union, Dict, Any, List
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.chatgpt')
+
+
+class ChatGPTAPI:
+ """Handles ChatGPT scraping operations using Bright Data's ChatGPT dataset API"""
+
+ def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def scrape_chatgpt(
+ self,
+ prompts: List[str],
+ countries: List[str],
+ additional_prompts: List[str],
+ web_searches: List[bool],
+ sync: bool = True,
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ Internal method to handle ChatGPT scraping API requests
+
+ Parameters:
+ - prompts: List of prompts to send to ChatGPT
+ - countries: List of country codes matching prompts
+ - additional_prompts: List of follow-up prompts matching prompts
+ - web_searches: List of web_search flags matching prompts
+ - sync: If True, uses synchronous API for immediate results
+ - timeout: Request timeout in seconds
+
+ Returns:
+ - Dict containing response with snapshot_id or direct data (if sync=True)
+ """
+ url = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger"
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": "gd_m7aof0k82r803d5bjm",
+ "include_errors": "true"
+ }
+
+ data = [
+ {
+ "url": "https://chatgpt.com/",
+ "prompt": prompts[i],
+ "country": countries[i],
+ "additional_prompt": additional_prompts[i],
+ "web_search": web_searches[i]
+ }
+ for i in range(len(prompts))
+ ]
+
+ try:
+ response = self.session.post(
+ url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or (65 if sync else self.default_timeout)
+ )
+
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code != 200:
+ raise APIError(f"ChatGPT scraping request failed with status {response.status_code}: {response.text}")
+
+ if sync:
+ response_text = response.text
+ if '\n{' in response_text and response_text.strip().startswith('{'):
+ json_objects = []
+ for line in response_text.strip().split('\n'):
+ if line.strip():
+ try:
+ json_objects.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ result = json_objects
+ else:
+ try:
+ result = response.json()
+ except json.JSONDecodeError:
+ result = response_text
+
+ logger.info(f"ChatGPT data retrieved synchronously for {len(prompts)} prompt(s)")
+ print(f"Retrieved {len(result) if isinstance(result, list) else 1} ChatGPT response(s)")
+ else:
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ if snapshot_id:
+ logger.info(f"ChatGPT scraping job initiated successfully for {len(prompts)} prompt(s)")
+ print("")
+ print("Snapshot ID:")
+ print(snapshot_id)
+ print("")
+
+ return result
+
+ except requests.exceptions.Timeout:
+ raise APIError("Timeout while initiating ChatGPT scraping")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during ChatGPT scraping: {str(e)}")
+ except json.JSONDecodeError as e:
+ raise APIError(f"Failed to parse ChatGPT scraping response: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during ChatGPT scraping: {str(e)}")
\ No newline at end of file
diff --git a/src/api/crawl.py b/src/api/crawl.py
new file mode 100644
index 0000000..4fe047a
--- /dev/null
+++ b/src/api/crawl.py
@@ -0,0 +1,175 @@
+import json
+from typing import Union, Dict, Any, List, Optional
+from ..utils import get_logger, validate_url
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.crawl')
+
+
+class CrawlAPI:
+ """Handles crawl operations using Bright Data's Web Crawl API"""
+
+ CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc"
+
+ AVAILABLE_OUTPUT_FIELDS = [
+ "markdown", "url", "html2text", "page_html", "ld_json",
+ "page_title", "timestamp", "input", "discovery_input",
+ "error", "error_code", "warning", "warning_code"
+ ]
+
+ def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def crawl(
+ self,
+ url: Union[str, List[str]],
+ ignore_sitemap: Optional[bool] = None,
+ depth: Optional[int] = None,
+ filter: Optional[str] = None,
+ exclude_filter: Optional[str] = None,
+ custom_output_fields: Optional[List[str]] = None,
+ include_errors: bool = True
+ ) -> Dict[str, Any]:
+ """
+ ## Crawl websites using Bright Data's Web Crawl API
+
+ Performs web crawling to discover and scrape multiple pages from a website
+ starting from the specified URL(s).
+
+ ### Parameters:
+ - `url` (str | List[str]): Domain URL(s) to crawl (required)
+ - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling
+ - `depth` (int, optional): Maximum depth to crawl relative to the entered URL
+ - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/")
+ - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/")
+ - `custom_output_fields` (List[str], optional): Custom output schema fields to include
+ - `include_errors` (bool, optional): Include errors in response (default: True)
+
+ ### Returns:
+ - `Dict[str, Any]`: Crawl response with snapshot_id for tracking
+
+ ### Example Usage:
+ ```python
+ # Single URL crawl
+ result = client.crawl("https://example.com/")
+
+ # Multiple URLs with filters
+ urls = ["https://example.com/", "https://example2.com/"]
+ result = client.crawl(
+ url=urls,
+ filter="/product/",
+ exclude_filter="/ads/",
+ depth=2,
+ ignore_sitemap=True
+ )
+
+ # Custom output schema
+ result = client.crawl(
+ url="https://example.com/",
+ custom_output_fields=["markdown", "url", "page_title"]
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid URL or parameters
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+ if isinstance(url, str):
+ urls = [url]
+ elif isinstance(url, list):
+ urls = url
+ else:
+ raise ValidationError("URL must be a string or list of strings")
+
+ if not urls:
+ raise ValidationError("At least one URL is required")
+
+ for u in urls:
+ if not isinstance(u, str) or not u.strip():
+ raise ValidationError("All URLs must be non-empty strings")
+ validate_url(u)
+
+ if custom_output_fields is not None:
+ if not isinstance(custom_output_fields, list):
+ raise ValidationError("custom_output_fields must be a list")
+
+ invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS]
+ if invalid_fields:
+ raise ValidationError(f"Invalid output fields: {invalid_fields}. Available fields: {self.AVAILABLE_OUTPUT_FIELDS}")
+
+ crawl_inputs = []
+ for u in urls:
+ crawl_input = {"url": u}
+
+ if ignore_sitemap is not None:
+ crawl_input["ignore_sitemap"] = ignore_sitemap
+ if depth is not None:
+ crawl_input["depth"] = depth
+ if filter is not None:
+ crawl_input["filter"] = filter
+ if exclude_filter is not None:
+ crawl_input["exclude_filter"] = exclude_filter
+
+ crawl_inputs.append(crawl_input)
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ params = {
+ "dataset_id": self.CRAWL_DATASET_ID,
+ "include_errors": str(include_errors).lower(),
+ "type": "discover_new",
+ "discover_by": "domain_url"
+ }
+
+ if custom_output_fields:
+ payload = {
+ "input": crawl_inputs,
+ "custom_output_fields": custom_output_fields
+ }
+ else:
+ payload = crawl_inputs
+
+ logger.info(f"Starting crawl for {len(urls)} URL(s)")
+ logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}")
+
+ try:
+ response = self.session.post(
+ api_url,
+ params=params,
+ json=payload,
+ timeout=self.default_timeout
+ )
+
+ if response.status_code == 200:
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}")
+ return result
+
+ elif response.status_code == 401:
+ logger.error("Unauthorized (401): Check API token")
+ raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+ elif response.status_code == 403:
+ logger.error("Forbidden (403): Insufficient permissions")
+ raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+ elif response.status_code == 400:
+ logger.error(f"Bad request (400): {response.text}")
+ raise APIError(f"Bad request (400): {response.text}")
+ else:
+ logger.error(f"Crawl request failed ({response.status_code}): {response.text}")
+ raise APIError(
+ f"Crawl request failed ({response.status_code}): {response.text}",
+ status_code=response.status_code,
+ response_text=response.text
+ )
+
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ logger.error(f"Unexpected error during crawl: {e}")
+ raise APIError(f"Unexpected error during crawl: {str(e)}")
\ No newline at end of file
diff --git a/src/api/download.py b/src/api/download.py
new file mode 100644
index 0000000..4bccdc0
--- /dev/null
+++ b/src/api/download.py
@@ -0,0 +1,265 @@
+import json
+import requests
+from datetime import datetime
+from typing import Union, Dict, Any, List
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.download')
+
+
+class DownloadAPI:
+ """Handles snapshot and content download operations using Bright Data's download API"""
+
+ def __init__(self, session, api_token, default_timeout=30):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+
+ def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str:
+ """
+ ## Download content to a file based on its format
+
+ ### Args:
+ content: The content to download (dict for JSON, string for other formats)
+ filename: Optional filename. If not provided, generates one with timestamp
+ format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt")
+ parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False)
+
+ ### Returns:
+ Path to the downloaded file
+ """
+
+ if not filename:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ filename = f"brightdata_results_{timestamp}.{format}"
+
+ if not filename.endswith(f".{format}"):
+ filename = f"{filename}.{format}"
+
+ if parse and isinstance(content, (list, dict)):
+ content = self._parse_body_json(content)
+
+ try:
+ if format == "json":
+ with open(filename, 'w', encoding='utf-8') as f:
+ if isinstance(content, dict) or isinstance(content, list):
+ json.dump(content, f, indent=2, ensure_ascii=False)
+ else:
+ f.write(str(content))
+ else:
+ with open(filename, 'w', encoding='utf-8') as f:
+ f.write(str(content))
+
+ logger.info(f"Content downloaded to: {filename}")
+ return filename
+
+ except IOError as e:
+ raise APIError(f"Failed to write file {filename}: {str(e)}")
+ except Exception as e:
+ raise APIError(f"Failed to download content: {str(e)}")
+
+ def download_snapshot(
+ self,
+ snapshot_id: str,
+ format: str = "json",
+ compress: bool = False,
+ batch_size: int = None,
+ part: int = None
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]:
+ """
+ ## Download snapshot content from Bright Data dataset API
+
+ Downloads the snapshot content using the snapshot ID returned from search_chatGPT()
+ or other dataset collection triggers.
+
+ ### Parameters:
+ - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required)
+ - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json")
+ - `compress` (bool, optional): Whether the result should be compressed (default: False)
+ - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000)
+ - `part` (int, optional): If batch_size provided, specify which part to download
+
+ ### Returns:
+ - `Union[Dict, List, str]`: Snapshot data in the requested format
+
+ ### Example Usage:
+ ```python
+ # Download complete snapshot
+ data = client.download_snapshot("s_m4x7enmven8djfqak")
+
+ # Download as CSV format
+ csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv")
+
+ # Download in batches
+ batch_data = client.download_snapshot(
+ "s_m4x7enmven8djfqak",
+ batch_size=1000,
+ part=1
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid parameters or snapshot_id format
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed, snapshot not found, or server error
+ """
+ if not snapshot_id or not isinstance(snapshot_id, str):
+ raise ValidationError("Snapshot ID is required and must be a non-empty string")
+
+ if format not in ["json", "ndjson", "jsonl", "csv"]:
+ raise ValidationError("Format must be one of: json, ndjson, jsonl, csv")
+
+ if not isinstance(compress, bool):
+ raise ValidationError("Compress must be a boolean")
+
+ if batch_size is not None:
+ if not isinstance(batch_size, int) or batch_size < 1000:
+ raise ValidationError("Batch size must be an integer >= 1000")
+
+ if part is not None:
+ if not isinstance(part, int) or part < 1:
+ raise ValidationError("Part must be a positive integer")
+ if batch_size is None:
+ raise ValidationError("Part parameter requires batch_size to be specified")
+
+ url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.api_token}",
+ "Accept": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "format": format
+ }
+
+ if compress:
+ params["compress"] = "true"
+
+ if batch_size is not None:
+ params["batch_size"] = batch_size
+
+ if part is not None:
+ params["part"] = part
+
+ try:
+ logger.info(f"Downloading snapshot {snapshot_id} in {format} format")
+
+ response = self.session.get(
+ url,
+ headers=headers,
+ params=params,
+ timeout=self.default_timeout
+ )
+
+ if response.status_code == 200:
+ pass
+ elif response.status_code == 202:
+ try:
+ response_data = response.json()
+ message = response_data.get('message', 'Snapshot is not ready yet')
+ print("Snapshot is not ready yet, try again soon")
+ return {"status": "not_ready", "message": message, "snapshot_id": snapshot_id}
+ except json.JSONDecodeError:
+ print("Snapshot is not ready yet, try again soon")
+ return {"status": "not_ready", "message": "Snapshot is not ready yet, check again soon", "snapshot_id": snapshot_id}
+ elif response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code == 404:
+ raise APIError(f"Snapshot '{snapshot_id}' not found")
+ else:
+ raise APIError(f"Download request failed with status {response.status_code}: {response.text}")
+
+ if format == "csv":
+ data = response.text
+ save_data = data
+ else:
+ response_text = response.text
+ if '\n{' in response_text and response_text.strip().startswith('{'):
+ json_objects = []
+ for line in response_text.strip().split('\n'):
+ if line.strip():
+ try:
+ json_objects.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ data = json_objects
+ save_data = json_objects
+ else:
+ try:
+ data = response.json()
+ save_data = data
+ except json.JSONDecodeError:
+ data = response_text
+ save_data = response_text
+
+ try:
+ output_file = f"snapshot_{snapshot_id}.{format}"
+ if format == "csv" or isinstance(save_data, str):
+ with open(output_file, 'w', encoding='utf-8') as f:
+ f.write(str(save_data))
+ else:
+ with open(output_file, 'w', encoding='utf-8') as f:
+ json.dump(save_data, f, indent=2, ensure_ascii=False)
+ logger.info(f"Data saved to: {output_file}")
+ except Exception:
+ pass
+
+ logger.info(f"Successfully downloaded snapshot {snapshot_id}")
+ return data
+
+ except requests.exceptions.Timeout:
+ raise APIError("Timeout while downloading snapshot")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during snapshot download: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during snapshot download: {str(e)}")
+
+ def _parse_body_json(self, content: Union[Dict, List]) -> Union[Dict, List]:
+ """
+ Parse JSON strings in 'body' fields to objects
+
+ Args:
+ content: The content to process
+
+ Returns:
+ Content with parsed body fields
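+
+ Example (illustrative input and output):
+ {'body': '{"a": 1}'} -> {'body': {'a': 1}}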
+ """
+ if content is None:
+ return content
+
+ if isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict) and 'body' in item:
+ body = item['body']
+ if isinstance(body, str):
+ try:
+ item['body'] = json.loads(body)
+ except (json.JSONDecodeError, TypeError):
+ pass
+ elif isinstance(item, (dict, list)):
+ self._parse_body_json(item)
+
+ elif isinstance(content, dict):
+ if 'body' in content:
+ body = content['body']
+ if isinstance(body, str):
+ try:
+ content['body'] = json.loads(body)
+ except (json.JSONDecodeError, TypeError):
+ pass
+
+ for key, value in content.items():
+ if isinstance(value, (dict, list)):
+ content[key] = self._parse_body_json(value)
+
+ return content
\ No newline at end of file
diff --git a/src/api/extract.py b/src/api/extract.py
new file mode 100644
index 0000000..1b04b84
--- /dev/null
+++ b/src/api/extract.py
@@ -0,0 +1,419 @@
+import os
+import re
+import json
+import openai
+from typing import Dict, Any, Tuple, Union, List
+from urllib.parse import urlparse
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError
+
+logger = get_logger('api.extract')
+
+
+class ExtractResult(str):
+ """
+ Custom result class that behaves like a string (extracted content)
+ but also provides access to metadata attributes
+ """
+ def __new__(cls, extracted_content, metadata):
+ obj = str.__new__(cls, extracted_content)
+ obj._metadata = metadata
+ return obj
+
+ def __getattr__(self, name):
+ if name in self._metadata:
+ return self._metadata[name]
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+ def __getitem__(self, key):
+ return self._metadata[key]
+
+ def get(self, key, default=None):
+ return self._metadata.get(key, default)
+
+ def keys(self):
+ return self._metadata.keys()
+
+ def values(self):
+ return self._metadata.values()
+
+ def items(self):
+ return self._metadata.items()
+
+ @property
+ def metadata(self):
+ """Access full metadata dictionary"""
+ return self._metadata
+
+
+class ExtractAPI:
+ """Handles content extraction using web scraping + LLM processing"""
+
+ def __init__(self, client):
+ self.client = client
+
+    def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> 'ExtractResult':
+ """
+ ## Extract specific information from websites using AI
+
+ Combines web scraping with OpenAI's language models to extract targeted information
+ from web pages based on natural language queries.
+
+ ### Parameters:
+ - `query` (str): Natural language query describing what to extract. If `url` parameter is provided,
+ this becomes the pure extraction query. If `url` is not provided, this should include
+ the URL (e.g. "extract the most recent news from cnn.com")
+ - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction
+ from query and sends these URLs to the web unlocker API
+ - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response.
+ Uses OpenAI's Structured Outputs for reliable type-safe responses.
+            Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"], "additionalProperties": False}
+ - `llm_key` (str, optional): OpenAI API key. If not provided, uses OPENAI_API_KEY env variable
+
+ ### Returns:
+        - `ExtractResult`: A string containing the extracted content, with attribute access to the result metadata
+
+ ### Example Usage:
+ ```python
+ # Using URL parameter with structured output
+ result = client.extract(
+ query="extract the most recent news headlines",
+ url="https://cnn.com",
+            output_scheme={
+                "type": "object",
+                "properties": {
+                    "headlines": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "title": {"type": "string"},
+                                "date": {"type": "string"}
+                            },
+                            "required": ["title", "date"],
+                            "additionalProperties": False
+                        }
+                    }
+                },
+                "required": ["headlines"],
+                "additionalProperties": False
+            }
+ )
+
+ # Using URL in query (original behavior)
+ result = client.extract(
+ query="extract the most recent news from cnn.com",
+ llm_key="your-openai-api-key"
+ )
+
+ # Multiple URLs with structured schema
+ result = client.extract(
+ query="extract main headlines",
+ url=["https://cnn.com", "https://bbc.com"],
+            output_scheme={
+                "type": "object",
+                "properties": {
+                    "sources": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "source_name": {"type": "string"},
+                                "headlines": {"type": "array", "items": {"type": "string"}}
+                            },
+                            "required": ["source_name", "headlines"],
+                            "additionalProperties": False
+                        }
+                    }
+                },
+                "required": ["sources"],
+                "additionalProperties": False
+            }
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid query format or missing LLM key
+ - `APIError`: Scraping failed or LLM processing error
+ """
+ if not query or not isinstance(query, str):
+ raise ValidationError("Query must be a non-empty string")
+
+ query = query.strip()
+ if len(query) > 10000:
+ raise ValidationError("Query is too long (maximum 10,000 characters)")
+ if len(query) < 5:
+ raise ValidationError("Query is too short (minimum 5 characters)")
+
+ if not llm_key:
+ llm_key = os.getenv('OPENAI_API_KEY')
+
+ if not llm_key or not isinstance(llm_key, str):
+ raise ValidationError("OpenAI API key is required. Provide it as parameter or set OPENAI_API_KEY environment variable")
+
+ if output_scheme is not None:
+ if not isinstance(output_scheme, dict):
+ raise ValidationError("output_scheme must be a dict containing a valid JSON Schema")
+ if "type" not in output_scheme:
+ raise ValidationError("output_scheme must have a 'type' property")
+
+ self._validate_structured_outputs_schema(output_scheme)
+
+ logger.info(f"Processing extract query: {query[:50]}...")
+
+ try:
+ if url is not None:
+ parsed_query = query.strip()
+ target_urls = url if isinstance(url, list) else [url]
+ logger.info(f"Using provided URL(s): {target_urls}")
+ else:
+ parsed_query, extracted_url = self._parse_query_and_url(query)
+ target_urls = [extracted_url]
+ logger.info(f"Parsed - Query: '{parsed_query}', URL: '{extracted_url}'")
+
+ if len(target_urls) == 1:
+ scraped_content = self.client.scrape(target_urls[0], response_format="raw")
+ source_url = target_urls[0]
+ else:
+ scraped_content = self.client.scrape(target_urls, response_format="raw")
+ source_url = ', '.join(target_urls)
+
+ logger.info(f"Scraped content from {len(target_urls)} URL(s)")
+
+ if isinstance(scraped_content, list):
+ all_text = []
+ all_titles = []
+ for i, content in enumerate(scraped_content):
+ parsed = self.client.parse_content(
+ content,
+ extract_text=True,
+ extract_links=False,
+ extract_images=False
+ )
+ all_text.append(f"--- Content from {target_urls[i]} ---\n{parsed.get('text', '')}")
+ all_titles.append(parsed.get('title', 'Unknown'))
+
+ combined_text = "\n\n".join(all_text)
+ combined_title = " | ".join(all_titles)
+ parsed_content = {'text': combined_text, 'title': combined_title}
+ else:
+ parsed_content = self.client.parse_content(
+ scraped_content,
+ extract_text=True,
+ extract_links=False,
+ extract_images=False
+ )
+
+ logger.info(f"Parsed content - text length: {len(parsed_content.get('text', ''))}")
+
+ extracted_info, token_usage = self._process_with_llm(
+ parsed_query,
+ parsed_content.get('text', ''),
+ llm_key,
+ source_url,
+ output_scheme
+ )
+
+ metadata = {
+ 'query': parsed_query,
+ 'url': source_url,
+ 'extracted_content': extracted_info,
+ 'source_title': parsed_content.get('title', 'Unknown'),
+ 'content_length': len(parsed_content.get('text', '')),
+ 'token_usage': token_usage,
+ 'success': True
+ }
+
+ return ExtractResult(extracted_info, metadata)
+
+ except Exception as e:
+ if isinstance(e, (ValidationError, APIError)):
+ raise
+ logger.error(f"Unexpected error during extraction: {e}")
+ raise APIError(f"Extraction failed: {str(e)}")
+
+ def _parse_query_and_url(self, query: str) -> Tuple[str, str]:
+ """
+ Parse natural language query to extract the task and URL
+
+ Args:
+ query: Natural language query like "extract news from cnn.com"
+
+ Returns:
+ Tuple of (parsed_query, full_url)
+ """
+ query = query.strip()
+
+ url_patterns = [
+ r'from\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)',
+ r'on\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)',
+ r'at\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)',
+ r'((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)'
+ ]
+
+ url = None
+ for pattern in url_patterns:
+ match = re.search(pattern, query, re.IGNORECASE)
+ if match:
+ url = match.group(1)
+ break
+
+ if not url:
+ raise ValidationError("Could not extract URL from query. Please include a website URL.")
+
+ full_url = self._build_full_url(url)
+
+ extract_query = re.sub(r'\b(?:from|on|at)\s+(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', query, flags=re.IGNORECASE)
+ extract_query = re.sub(r'\b(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', extract_query, flags=re.IGNORECASE)
+ extract_query = re.sub(r'\s+', ' ', extract_query).strip()
+
+ if not extract_query:
+ extract_query = "extract the main content"
+
+ return extract_query, full_url
+
+ def _build_full_url(self, url: str) -> str:
+ """
+ Build a complete URL from potentially partial URL
+
+ Args:
+ url: Potentially partial URL like "cnn.com" or "https://example.com"
+
+ Returns:
+ Complete URL with https:// and www if needed
+ """
+ url = url.strip()
+
+ if not url.startswith(('http://', 'https://')):
+ if not url.startswith('www.'):
+ url = f'www.{url}'
+ url = f'https://{url}'
+
+ parsed = urlparse(url)
+ if not parsed.netloc:
+ raise ValidationError(f"Invalid URL format: {url}")
+
+ return url
+
+ def _validate_structured_outputs_schema(self, schema: Dict[str, Any], path: str = "") -> None:
+ """
+ Validate JSON Schema for OpenAI Structured Outputs compatibility
+
+ Args:
+ schema: JSON Schema to validate
+ path: Current path in schema (for error reporting)
+ """
+ if not isinstance(schema, dict):
+ return
+
+ schema_type = schema.get("type")
+
+ if schema_type == "object":
+ if "properties" not in schema:
+ raise ValidationError(f"Object schema at '{path}' must have 'properties' defined")
+ if "required" not in schema:
+ raise ValidationError(f"Object schema at '{path}' must have 'required' array (OpenAI Structured Outputs requirement)")
+ if "additionalProperties" not in schema or schema["additionalProperties"] is not False:
+ raise ValidationError(f"Object schema at '{path}' must have 'additionalProperties': false (OpenAI Structured Outputs requirement)")
+
+ properties = set(schema["properties"].keys())
+ required = set(schema["required"])
+ if properties != required:
+ missing = properties - required
+ extra = required - properties
+ error_msg = f"OpenAI Structured Outputs requires ALL properties to be in 'required' array at '{path}'."
+ if missing:
+ error_msg += f" Missing from required: {list(missing)}"
+ if extra:
+ error_msg += f" Extra in required: {list(extra)}"
+ raise ValidationError(error_msg)
+
+ for prop_name, prop_schema in schema["properties"].items():
+ self._validate_structured_outputs_schema(prop_schema, f"{path}.{prop_name}")
+
+ elif schema_type == "array":
+ if "items" in schema:
+ self._validate_structured_outputs_schema(schema["items"], f"{path}[]")
+
+ def _process_with_llm(self, query: str, content: str, llm_key: str, source_url: str, output_scheme: Dict[str, Any] = None) -> Tuple[str, Dict[str, int]]:
+ """
+ Process scraped content with OpenAI to extract requested information
+
+ Args:
+ query: What to extract from the content
+ content: Scraped and parsed text content
+ llm_key: OpenAI API key
+ source_url: Source URL for context
+ output_scheme: JSON Schema dict for structured outputs (optional)
+
+ Returns:
+ Tuple of (extracted information, token usage dict)
+ """
+ if len(content) > 15000:
+ beginning = content[:8000]
+ end = content[-4000:]
+ content = f"{beginning}\n\n... [middle content truncated for token efficiency] ...\n\n{end}"
+ elif len(content) > 12000:
+ content = content[:12000] + "\n\n... [content truncated to optimize tokens]"
+
+ client = openai.OpenAI(api_key=llm_key)
+
+ system_prompt = f"""You are a precise web content extraction specialist. Your task: {query}
+
+SOURCE: {source_url}
+
+INSTRUCTIONS:
+1. Extract ONLY the specific information requested
+2. Include relevant details (dates, numbers, names) when available
+3. If requested info isn't found, briefly state what content IS available
+4. Keep response concise but complete
+5. Be accurate and factual"""
+
+ user_prompt = f"CONTENT TO ANALYZE:\n\n{content}\n\nEXTRACT: {query}"
+
+ try:
+ call_params = {
+ "model": "gpt-4o-2024-08-06",
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt}
+ ],
+ "max_tokens": 1000,
+ "temperature": 0.1
+ }
+
+ if output_scheme:
+ call_params["response_format"] = {
+ "type": "json_schema",
+ "json_schema": {
+ "name": "extracted_content",
+ "strict": True,
+ "schema": output_scheme
+ }
+ }
+ logger.info("Using OpenAI Structured Outputs with provided schema")
+ else:
+ logger.info("Using regular OpenAI completion (no structured schema provided)")
+
+ response = client.chat.completions.create(**call_params)
+
+ if not response.choices or not response.choices[0].message.content:
+ raise APIError("OpenAI returned empty response")
+
+ extracted_content = response.choices[0].message.content.strip()
+
+ if output_scheme:
+ logger.info("Received structured JSON response from OpenAI")
+ else:
+ logger.info("Received text response from OpenAI")
+
+ token_usage = {
+ 'prompt_tokens': response.usage.prompt_tokens,
+ 'completion_tokens': response.usage.completion_tokens,
+ 'total_tokens': response.usage.total_tokens
+ }
+
+ logger.info(f"OpenAI token usage: {token_usage['total_tokens']} total ({token_usage['prompt_tokens']} prompt + {token_usage['completion_tokens']} completion)")
+
+ return extracted_content, token_usage
+
+ except Exception as e:
+ logger.error(f"OpenAI API error: {e}")
+ raise APIError(f"Failed to process content with LLM: {str(e)}")
\ No newline at end of file
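To make the ExtractResult behaviour concrete: it is a plain str subclass that carries the metadata dict assembled in extract(), so the same object can be printed as text or queried like a dict. A small illustration with made-up metadata values, assuming ExtractResult from this module is importable:

```python
meta = {
    "url": "https://example.com",
    "source_title": "Example Domain",
    "token_usage": {"total_tokens": 123},
    "success": True,
}
result = ExtractResult("Example headline", meta)

print(str(result))              # the extracted content itself
print(result.url)               # attribute access is routed to the metadata dict
print(result["token_usage"])    # dict-style access works too
print(result.get("missing", "n/a"))
```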
diff --git a/src/api/linkedin.py b/src/api/linkedin.py
new file mode 100644
index 0000000..19ede6b
--- /dev/null
+++ b/src/api/linkedin.py
@@ -0,0 +1,803 @@
+import json
+import re
+import requests
+from typing import Union, Dict, Any, List
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.linkedin')
+
+
+class LinkedInAPI:
+ """Handles LinkedIn data collection using Bright Data's collect API"""
+
+ DATASET_IDS = {
+ 'profile': 'gd_l1viktl72bvl7bjuj0',
+ 'company': 'gd_l1vikfnt1wgvvqz95w',
+ 'job': 'gd_lpfll7v5hcqtkxl6l',
+ 'post': 'gd_lyy3tktm25m4avu764'
+ }
+
+ URL_PATTERNS = {
+ 'profile': re.compile(r'linkedin\.com/in/[^/?]+/?(\?.*)?$'),
+ 'company': re.compile(r'linkedin\.com/(company|organization-guest/company)/[^/?]+/?(\?.*)?$'),
+ 'job': re.compile(r'linkedin\.com/jobs/view/[^/?]+/?(\?.*)?$'),
+ 'post': re.compile(r'linkedin\.com/(posts|pulse)/[^/?]+/?(\?.*)?$')
+ }
+
+ def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def _identify_dataset_type(self, url: str) -> str:
+ """
+ Identify LinkedIn dataset type based on URL pattern
+
+ Args:
+ url: LinkedIn URL to analyze
+
+ Returns:
+ Dataset type ('profile', 'company', 'job', 'post')
+
+ Raises:
+ ValidationError: If URL doesn't match any known LinkedIn pattern
+ """
+ if not url or not isinstance(url, str):
+ raise ValidationError("URL must be a non-empty string")
+
+ url = url.strip().lower()
+ for dataset_type, pattern in self.URL_PATTERNS.items():
+ if pattern.search(url):
+ logger.debug(f"URL '{url}' identified as LinkedIn {dataset_type}")
+ return dataset_type
+
+ raise ValidationError(f"URL '{url}' does not match any supported LinkedIn data type")
+
+ def _scrape_linkedin_dataset(
+ self,
+ urls: Union[str, List[str]],
+ dataset_id: str,
+ dataset_type: str,
+ sync: bool = True,
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ Internal method to scrape LinkedIn data using Bright Data's collect API
+
+ Args:
+ urls: Single LinkedIn URL or list of LinkedIn URLs
+ dataset_id: Bright Data dataset ID for the specific LinkedIn data type
+ dataset_type: Type of LinkedIn data (for logging purposes)
+ sync: If True (default), uses synchronous API for immediate results
+ timeout: Request timeout in seconds
+
+ Returns:
+ Dict containing response with snapshot_id or direct data (if sync=True)
+
+ Raises:
+ ValidationError: Invalid URL format
+ AuthenticationError: Invalid API token or insufficient permissions
+ APIError: Request failed or server error
+ """
+ if isinstance(urls, str):
+ url_list = [urls]
+ else:
+ url_list = urls
+
+        if not url_list:
+ raise ValidationError("At least one URL is required")
+ for url in url_list:
+ if not url or not isinstance(url, str):
+ raise ValidationError("All URLs must be non-empty strings")
+
+ logger.info(f"Processing {len(url_list)} LinkedIn {dataset_type} URL(s) {'synchronously' if sync else 'asynchronously'}")
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+
+ if sync:
+ api_url = "https://api.brightdata.com/datasets/v3/scrape"
+ data = {
+ "input": [{"url": url} for url in url_list]
+ }
+ params = {
+ "dataset_id": dataset_id,
+ "notify": "false",
+ "include_errors": "true"
+ }
+ else:
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+ data = [{"url": url} for url in url_list]
+ params = {
+ "dataset_id": dataset_id,
+ "include_errors": "true"
+ }
+
+ try:
+ if sync:
+ response = self.session.post(
+ api_url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or 65
+ )
+ else:
+ response = self.session.post(
+ api_url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or self.default_timeout
+ )
+
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code not in [200, 202]:
+ raise APIError(f"LinkedIn data collection request failed with status {response.status_code}: {response.text}")
+
+ if sync:
+ response_text = response.text
+ if '\n{' in response_text and response_text.strip().startswith('{'):
+ json_objects = []
+ for line in response_text.strip().split('\n'):
+ if line.strip():
+ try:
+ json_objects.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ result = json_objects
+ else:
+ try:
+ result = response.json()
+ except json.JSONDecodeError:
+ result = response_text
+
+ logger.info(f"LinkedIn {dataset_type} data retrieved synchronously for {len(url_list)} URL(s)")
+ print(f"Retrieved {len(result) if isinstance(result, list) else 1} LinkedIn {dataset_type} record(s)")
+ else:
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ if snapshot_id:
+ logger.info(f"LinkedIn {dataset_type} data collection job initiated successfully for {len(url_list)} URL(s)")
+ print("")
+ print("Snapshot ID:")
+ print(snapshot_id)
+ print("")
+
+ return result
+
+ except requests.exceptions.Timeout:
+ raise APIError("Timeout while initiating LinkedIn data collection")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during LinkedIn data collection: {str(e)}")
+ except json.JSONDecodeError as e:
+ raise APIError(f"Failed to parse LinkedIn data collection response: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during LinkedIn data collection: {str(e)}")
+
+
+class LinkedInScraper:
+ """LinkedIn data scraping interface with specialized methods for different data types"""
+
+ def __init__(self, linkedin_api):
+ self.linkedin_api = linkedin_api
+
+ def profiles(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Profile Data
+
+ Scrapes structured data from LinkedIn profiles using the profiles dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn profile URL or list of profile URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped profile data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/in/username/`
+ - `https://linkedin.com/in/first-last-123456/`
+
+ ### Example Usage:
+ ```python
+ # Single profile (synchronous - returns data immediately)
+ result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/elad-moshe-05a90413/")
+
+ # Multiple profiles (synchronous - returns data immediately)
+ profiles = [
+ "https://www.linkedin.com/in/user1/",
+ "https://www.linkedin.com/in/user2/"
+ ]
+ result = client.scrape_linkedin.profiles(profiles)
+
+ # Asynchronous processing (returns snapshot_id)
+ result = client.scrape_linkedin.profiles(profiles, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['profile'],
+ 'profile',
+ sync,
+ timeout
+ )
+
+ def companies(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Company Data
+
+ Scrapes structured data from LinkedIn company pages using the companies dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn company URL or list of company URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped company data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/company/company-name/`
+ - `https://linkedin.com/company/bright-data/`
+
+ ### Example Usage:
+ ```python
+ # Single company (synchronous)
+ result = client.scrape_linkedin.companies("https://www.linkedin.com/company/bright-data/")
+
+ # Multiple companies (synchronous)
+ companies = [
+ "https://www.linkedin.com/company/ibm/",
+ "https://www.linkedin.com/company/microsoft/"
+ ]
+ result = client.scrape_linkedin.companies(companies)
+
+ # Asynchronous processing
+ result = client.scrape_linkedin.companies(companies, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['company'],
+ 'company',
+ sync,
+ timeout
+ )
+
+ def jobs(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Job Data
+
+ Scrapes structured data from LinkedIn job listings using the jobs dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn job URL or list of job URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped job data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/jobs/view/1234567890/`
+ - `https://linkedin.com/jobs/view/job-id/`
+
+ ### Example Usage:
+ ```python
+ # Single job listing (synchronous)
+ result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/1234567890/")
+
+ # Multiple job listings (synchronous)
+ jobs = [
+ "https://www.linkedin.com/jobs/view/1111111/",
+ "https://www.linkedin.com/jobs/view/2222222/"
+ ]
+ result = client.scrape_linkedin.jobs(jobs)
+
+ # Asynchronous processing
+ result = client.scrape_linkedin.jobs(jobs, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['job'],
+ 'job',
+ sync,
+ timeout
+ )
+
+ def posts(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Post Data
+
+ Scrapes structured data from LinkedIn posts and articles using the posts dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn post URL or list of post URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped post data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/posts/username-activity-123456/`
+ - `https://www.linkedin.com/pulse/article-title-author/`
+
+ ### Example Usage:
+ ```python
+ # Single post (synchronous)
+ result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/")
+
+ # Multiple posts (synchronous)
+ posts = [
+ "https://www.linkedin.com/posts/user1-activity-111/",
+ "https://www.linkedin.com/pulse/article-author/"
+ ]
+ result = client.scrape_linkedin.posts(posts)
+
+ # Asynchronous processing
+ result = client.scrape_linkedin.posts(posts, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['post'],
+ 'post',
+ sync,
+ timeout
+ )
+
+
+class LinkedInSearcher:
+ """LinkedIn search interface for discovering new LinkedIn data by various criteria"""
+
+ def __init__(self, linkedin_api):
+ self.linkedin_api = linkedin_api
+
+ def profiles(
+ self,
+ first_name: Union[str, List[str]],
+ last_name: Union[str, List[str]],
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ ## Search LinkedIn Profiles by Name
+
+ Discovers LinkedIn profiles by searching for first and last names.
+
+ ### Parameters:
+ - `first_name` (str | List[str]): Single first name or list of first names to search for
+ - `last_name` (str | List[str]): Single last name or list of last names to search for
+ - `timeout` (int, optional): Request timeout in seconds (default: 30)
+
+ ### Returns:
+ - `Dict[str, Any]`: Response containing snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Single name search (returns snapshot_id)
+ result = client.search_linkedin.profiles("James", "Smith")
+
+ # Multiple names search (returns snapshot_id)
+ first_names = ["James", "Idan"]
+ last_names = ["Smith", "Vilenski"]
+ result = client.search_linkedin.profiles(first_names, last_names)
+ ```
+ """
+ if isinstance(first_name, str):
+ first_names = [first_name]
+ else:
+ first_names = first_name
+
+ if isinstance(last_name, str):
+ last_names = [last_name]
+ else:
+ last_names = last_name
+
+ if len(first_names) != len(last_names):
+ raise ValidationError("first_name and last_name must have the same length")
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['profile'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "name"
+ }
+
+ data = [
+ {
+ "first_name": first_names[i],
+ "last_name": last_names[i]
+ }
+ for i in range(len(first_names))
+ ]
+
+ return self._make_request(api_url, headers, params, data, 'profile search', len(data), timeout)
+
+ def jobs(
+ self,
+ url: Union[str, List[str]] = None,
+ location: Union[str, List[str]] = None,
+ keyword: Union[str, List[str]] = "",
+ country: Union[str, List[str]] = "",
+ time_range: Union[str, List[str]] = "",
+ job_type: Union[str, List[str]] = "",
+ experience_level: Union[str, List[str]] = "",
+ remote: Union[str, List[str]] = "",
+ company: Union[str, List[str]] = "",
+ location_radius: Union[str, List[str]] = "",
+ selective_search: Union[bool, List[bool]] = False,
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ ## Search LinkedIn Jobs by URL or Keywords
+
+ Discovers LinkedIn jobs either by searching specific job search URLs or by keyword criteria.
+
+ ### Parameters:
+ - `url` (str | List[str], optional): LinkedIn job search URLs to scrape
+ - `location` (str | List[str], optional): Job location(s) - required when searching by keyword
+ - `keyword` (str | List[str], optional): Job keyword(s) to search for (default: "")
+ - `country` (str | List[str], optional): Country code(s) (default: "")
+ - `time_range` (str | List[str], optional): Time range filter (default: "")
+ - `job_type` (str | List[str], optional): Job type filter (default: "")
+ - `experience_level` (str | List[str], optional): Experience level filter (default: "")
+ - `remote` (str | List[str], optional): Remote work filter (default: "")
+ - `company` (str | List[str], optional): Company name filter (default: "")
+ - `location_radius` (str | List[str], optional): Location radius filter (default: "")
+ - `selective_search` (bool | List[bool], optional): Enable selective search (default: False)
+ - `timeout` (int, optional): Request timeout in seconds (default: 30)
+
+ ### Returns:
+ - `Dict[str, Any]`: Response containing snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Search by job URLs (returns snapshot_id)
+ job_urls = [
+ "https://www.linkedin.com/jobs/search?keywords=Software&location=Tel%20Aviv-Yafo",
+ "https://www.linkedin.com/jobs/reddit-inc.-jobs-worldwide?f_C=150573"
+ ]
+ result = client.search_linkedin.jobs(url=job_urls)
+
+ # Search by keyword (returns snapshot_id)
+ result = client.search_linkedin.jobs(
+ location="Paris",
+ keyword="product manager",
+ country="FR",
+ time_range="Past month",
+ job_type="Full-time"
+ )
+ ```
+ """
+ if url is not None:
+ return self._search_jobs_by_url(url, timeout)
+ elif location is not None:
+ return self._search_jobs_by_keyword(
+ location, keyword, country, time_range, job_type,
+ experience_level, remote, company, location_radius,
+ selective_search, timeout
+ )
+ else:
+ raise ValidationError("Either 'url' or 'location' parameter must be provided")
+
+ def posts(
+ self,
+ profile_url: Union[str, List[str]] = None,
+ company_url: Union[str, List[str]] = None,
+ url: Union[str, List[str]] = None,
+ start_date: Union[str, List[str]] = "",
+ end_date: Union[str, List[str]] = "",
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ ## Search LinkedIn Posts by Profile, Company, or General URL
+
+ Discovers LinkedIn posts using various search methods.
+
+ ### Parameters:
+ - `profile_url` (str | List[str], optional): LinkedIn profile URL(s) to get posts from
+ - `company_url` (str | List[str], optional): LinkedIn company URL(s) to get posts from
+ - `url` (str | List[str], optional): General LinkedIn URL(s) for posts
+ - `start_date` (str | List[str], optional): Start date filter (ISO format, default: "")
+ - `end_date` (str | List[str], optional): End date filter (ISO format, default: "")
+ - `timeout` (int, optional): Request timeout in seconds (default: 30)
+
+ ### Returns:
+ - `Dict[str, Any]`: Response containing snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Search posts by profile URL with date range (returns snapshot_id)
+ result = client.search_linkedin.posts(
+ profile_url="https://www.linkedin.com/in/bettywliu",
+ start_date="2018-04-25T00:00:00.000Z",
+ end_date="2021-05-25T00:00:00.000Z"
+ )
+
+ # Search posts by company URL (returns snapshot_id)
+ result = client.search_linkedin.posts(
+ company_url="https://www.linkedin.com/company/bright-data"
+ )
+
+ # Search posts by general URL (returns snapshot_id)
+ result = client.search_linkedin.posts(
+ url="https://www.linkedin.com/posts/activity-123456"
+ )
+ ```
+ """
+ if profile_url is not None:
+ return self._search_posts_by_profile(profile_url, start_date, end_date, timeout)
+ elif company_url is not None:
+ return self._search_posts_by_company(company_url, timeout)
+ elif url is not None:
+ return self._search_posts_by_url(url, timeout)
+ else:
+ raise ValidationError("One of 'profile_url', 'company_url', or 'url' parameter must be provided")
+
+ def _search_jobs_by_url(self, urls, timeout):
+ """Search jobs by LinkedIn job search URLs"""
+ if isinstance(urls, str):
+ url_list = [urls]
+ else:
+ url_list = urls
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['job'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "url"
+ }
+
+ data = [{"url": url} for url in url_list]
+ return self._make_request(api_url, headers, params, data, 'job search by URL', len(data), timeout)
+
+ def _search_jobs_by_keyword(self, location, keyword, country, time_range, job_type, experience_level, remote, company, location_radius, selective_search, timeout):
+ """Search jobs by keyword criteria"""
+ params_dict = {
+ 'location': location, 'keyword': keyword, 'country': country,
+ 'time_range': time_range, 'job_type': job_type, 'experience_level': experience_level,
+ 'remote': remote, 'company': company, 'location_radius': location_radius,
+ 'selective_search': selective_search
+ }
+
+ max_length = 1
+ for key, value in params_dict.items():
+ if isinstance(value, list):
+ max_length = max(max_length, len(value))
+ normalized_params = {}
+ for key, value in params_dict.items():
+ if isinstance(value, list):
+ if len(value) != max_length and len(value) != 1:
+ raise ValidationError(f"Parameter '{key}' list length must be 1 or {max_length}")
+ normalized_params[key] = value * max_length if len(value) == 1 else value
+ else:
+ normalized_params[key] = [value] * max_length
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['job'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "keyword"
+ }
+
+ data = []
+ for i in range(max_length):
+ data.append({
+ "location": normalized_params['location'][i],
+ "keyword": normalized_params['keyword'][i],
+ "country": normalized_params['country'][i],
+ "time_range": normalized_params['time_range'][i],
+ "job_type": normalized_params['job_type'][i],
+ "experience_level": normalized_params['experience_level'][i],
+ "remote": normalized_params['remote'][i],
+ "company": normalized_params['company'][i],
+ "location_radius": normalized_params['location_radius'][i],
+ "selective_search": normalized_params['selective_search'][i]
+ })
+
+ return self._make_request(api_url, headers, params, data, 'job search by keyword', len(data), timeout)
+
+ def _search_posts_by_profile(self, profile_urls, start_dates, end_dates, timeout):
+ """Search posts by profile URL with optional date filtering"""
+ if isinstance(profile_urls, str):
+ url_list = [profile_urls]
+ else:
+ url_list = profile_urls
+
+        if isinstance(start_dates, str):
+            start_list = [start_dates] * len(url_list)
+        elif len(start_dates) == len(url_list):
+            start_list = start_dates
+        else:
+            # Fall back to the first value (or an empty string) when lengths differ
+            start_list = [start_dates[0] if start_dates else ""] * len(url_list)
+
+        if isinstance(end_dates, str):
+            end_list = [end_dates] * len(url_list)
+        elif len(end_dates) == len(url_list):
+            end_list = end_dates
+        else:
+            end_list = [end_dates[0] if end_dates else ""] * len(url_list)
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['post'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "profile_url"
+ }
+
+ data = []
+ for i in range(len(url_list)):
+ item = {"url": url_list[i]}
+ if start_list[i]:
+ item["start_date"] = start_list[i]
+ if end_list[i]:
+ item["end_date"] = end_list[i]
+ data.append(item)
+
+ return self._make_request(api_url, headers, params, data, 'post search by profile', len(data), timeout)
+
+ def _search_posts_by_company(self, company_urls, timeout):
+ """Search posts by company URL"""
+ if isinstance(company_urls, str):
+ url_list = [company_urls]
+ else:
+ url_list = company_urls
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['post'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "company_url"
+ }
+
+ data = [{"url": url} for url in url_list]
+ return self._make_request(api_url, headers, params, data, 'post search by company', len(data), timeout)
+
+ def _search_posts_by_url(self, urls, timeout):
+ """Search posts by general URL"""
+ if isinstance(urls, str):
+ url_list = [urls]
+ else:
+ url_list = urls
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['post'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "url"
+ }
+
+ data = [{"url": url} for url in url_list]
+ return self._make_request(api_url, headers, params, data, 'post search by URL', len(data), timeout)
+
+ def _make_request(self, api_url, headers, params, data, operation_type, count, timeout):
+ """Common method to make API requests (async only for search operations)"""
+ try:
+ response = self.linkedin_api.session.post(
+ api_url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or self.linkedin_api.default_timeout
+ )
+
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code != 200:
+ raise APIError(f"LinkedIn {operation_type} request failed with status {response.status_code}: {response.text}")
+
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ if snapshot_id:
+ logger.info(f"LinkedIn {operation_type} job initiated successfully for {count} item(s)")
+ print("")
+ print("Snapshot ID:")
+ print(snapshot_id)
+ print("")
+
+ return result
+
+ except requests.exceptions.Timeout:
+ raise APIError(f"Timeout while initiating LinkedIn {operation_type}")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during LinkedIn {operation_type}: {str(e)}")
+ except json.JSONDecodeError as e:
+ raise APIError(f"Failed to parse LinkedIn {operation_type} response: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during LinkedIn {operation_type}: {str(e)}")
\ No newline at end of file
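The search methods above are asynchronous: they return a snapshot_id, and the records are fetched later with download_snapshot(), which (per the snapshot download code earlier in this patch) returns {"status": "not_ready", ...} while the dataset job is still running. A minimal polling sketch, assuming a configured bdclient instance named client:

```python
import time

def wait_for_snapshot(client, snapshot_id, poll_every=15, max_wait=600):
    """Poll client.download_snapshot() until the dataset job is finished (sketch)."""
    waited = 0
    while waited < max_wait:
        result = client.download_snapshot(snapshot_id)
        ready = not (isinstance(result, dict) and result.get("status") == "not_ready")
        if ready:
            return result            # list/dict of records, or raw text
        time.sleep(poll_every)
        waited += poll_every
    raise TimeoutError(f"Snapshot {snapshot_id} not ready after {max_wait}s")

# Hypothetical usage:
# job = client.search_linkedin.profiles("James", "Smith")
# data = wait_for_snapshot(client, job["snapshot_id"])
```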
diff --git a/src/api/scraper.py b/src/api/scraper.py
new file mode 100644
index 0000000..0d4fc31
--- /dev/null
+++ b/src/api/scraper.py
@@ -0,0 +1,205 @@
+import time
+from typing import Union, Dict, Any, List
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from ..utils import (
+ validate_url, validate_zone_name, validate_country_code,
+ validate_timeout, validate_max_workers, validate_url_list,
+ validate_response_format, validate_http_method, retry_request,
+ get_logger, log_request, safe_json_parse, validate_response_size
+)
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.scraper')
+
+
+class WebScraper:
+ """Handles web scraping operations using Bright Data Web Unlocker API"""
+
+ def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def scrape(
+ self,
+ url: Union[str, List[str]],
+ zone: str,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "markdown",
+ async_request: bool = False,
+ max_workers: int = 10,
+ timeout: int = None
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ **Unlock and scrape websites using Bright Data Web Unlocker API**
+
+ Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass.
+
+ **Parameters:**
+ - `url` (str | List[str]): Single URL string or list of URLs to scrape
+ - `zone` (str): Your Bright Data zone identifier
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+        - `country` (str, optional): Two-letter ISO country code for proxy location (default: `""`, no specific country)
+        - `data_format` (str, optional): Additional format transformation (default: `"markdown"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `30`)
+
+ **Returns:**
+ - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL
+
+ **Example Usage:**
+ ```python
+ # Single URL scraping
+ result = client.scrape(
+ url="https://example.com",
+ zone="your_zone_name",
+ response_format="json"
+ )
+
+ # Multiple URLs scraping
+ urls = ["https://site1.com", "https://site2.com"]
+ results = client.scrape(
+ url=urls,
+ zone="your_zone_name",
+ response_format="raw",
+ max_workers=5
+ )
+ ```
+
+ **Raises:**
+ - `ValidationError`: Invalid URL format or empty URL list
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+
+ timeout = timeout or self.default_timeout
+ validate_zone_name(zone)
+ validate_response_format(response_format)
+ validate_http_method(method)
+ validate_country_code(country)
+ validate_timeout(timeout)
+ validate_max_workers(max_workers)
+
+ if isinstance(url, list):
+ validate_url_list(url)
+ effective_max_workers = min(len(url), max_workers or 10)
+
+ results = [None] * len(url)
+
+ with ThreadPoolExecutor(max_workers=effective_max_workers) as executor:
+ future_to_index = {
+ executor.submit(
+ self._perform_single_scrape,
+ single_url, zone, response_format, method, country,
+ data_format, async_request, timeout
+ ): i
+ for i, single_url in enumerate(url)
+ }
+ for future in as_completed(future_to_index):
+ index = future_to_index[future]
+ try:
+ result = future.result()
+ results[index] = result
+ except Exception as e:
+ raise APIError(f"Failed to scrape {url[index]}: {str(e)}")
+
+ return results
+ else:
+ validate_url(url)
+ return self._perform_single_scrape(
+ url, zone, response_format, method, country,
+ data_format, async_request, timeout
+ )
+
+ def _perform_single_scrape(
+ self,
+ url: str,
+ zone: str,
+ response_format: str,
+ method: str,
+ country: str,
+ data_format: str,
+ async_request: bool,
+ timeout: int
+ ) -> Union[Dict[str, Any], str]:
+ """
+ Perform a single scrape operation with comprehensive logging
+ """
+ endpoint = "https://api.brightdata.com/request"
+ start_time = time.time()
+
+ logger.info(f"Starting scrape request for URL: {url[:100]}{'...' if len(url) > 100 else ''}")
+
+ payload = {
+ "zone": zone,
+ "url": url,
+ "format": response_format,
+ "method": method,
+ "data_format": data_format
+ }
+
+ params = {}
+ if async_request:
+ params['async'] = 'true'
+
+ @retry_request(
+ max_retries=self.max_retries,
+ backoff_factor=self.retry_backoff,
+ retry_statuses={429, 500, 502, 503, 504}
+ )
+ def make_request():
+ return self.session.post(
+ endpoint,
+ json=payload,
+ params=params,
+ timeout=timeout
+ )
+
+ try:
+ response = make_request()
+ response_time = (time.time() - start_time) * 1000
+
+ # Log request details
+ log_request(logger, 'POST', endpoint, response.status_code, response_time)
+
+ if response.status_code == 200:
+ logger.info(f"Scrape completed successfully in {response_time:.2f}ms")
+
+ validate_response_size(response.text)
+
+ if response_format == "json":
+ result = safe_json_parse(response.text)
+ logger.debug(f"Processed response with {len(str(result))} characters")
+ return result
+ else:
+ logger.debug(f"Returning raw response with {len(response.text)} characters")
+ return response.text
+
+ elif response.status_code == 400:
+ logger.error(f"Bad Request (400) for URL {url}: {response.text}")
+ raise ValidationError(f"Bad Request (400): {response.text}")
+ elif response.status_code == 401:
+ logger.error(f"Unauthorized (401) for URL {url}: Check API token")
+ raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+ elif response.status_code == 403:
+ logger.error(f"Forbidden (403) for URL {url}: Insufficient permissions")
+ raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+ elif response.status_code == 404:
+ logger.error(f"Not Found (404) for URL {url}: {response.text}")
+ raise APIError(f"Not Found (404): {response.text}")
+ else:
+ logger.error(f"API Error ({response.status_code}) for URL {url}: {response.text}")
+ raise APIError(f"API Error ({response.status_code}): {response.text}",
+ status_code=response.status_code, response_text=response.text)
+
+ except Exception as e:
+ response_time = (time.time() - start_time) * 1000
+ logger.error(f"Request failed after {response_time:.2f}ms for URL {url}: {str(e)}", exc_info=True)
+ raise
\ No newline at end of file
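For readers skimming the threading code in scrape(): the future_to_index mapping is what keeps the returned list aligned with the input URL order even though futures complete out of order. The same pattern in isolation, as a runnable sketch:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def fan_out_in_order(items, worker, max_workers=10):
    """Run worker(item) concurrently while preserving input order (sketch)."""
    results = [None] * len(items)
    with ThreadPoolExecutor(max_workers=max(1, min(len(items), max_workers))) as executor:
        future_to_index = {executor.submit(worker, item): i for i, item in enumerate(items)}
        for future in as_completed(future_to_index):
            results[future_to_index[future]] = future.result()
    return results

# fan_out_in_order([1, 2, 3], lambda x: x * x)  ->  [1, 4, 9]
```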
diff --git a/src/api/search.py b/src/api/search.py
new file mode 100644
index 0000000..24e6365
--- /dev/null
+++ b/src/api/search.py
@@ -0,0 +1,212 @@
+import json
+import time
+from typing import Union, Dict, Any, List
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import quote_plus
+
+from ..utils import (
+ validate_zone_name, validate_country_code, validate_timeout,
+ validate_max_workers, validate_search_engine, validate_query,
+ validate_response_format, validate_http_method, retry_request,
+ get_logger, log_request, safe_json_parse, validate_response_size
+)
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.search')
+
+
+class SearchAPI:
+ """Handles search operations using Bright Data SERP API"""
+
+ def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def search(
+ self,
+ query: Union[str, List[str]],
+ search_engine: str = "google",
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "markdown",
+ async_request: bool = False,
+ max_workers: int = 10,
+ timeout: int = None,
+ parse: bool = False
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ ## Search the web using Bright Data SERP API
+
+ Performs web searches through major search engines using Bright Data's proxy network
+ for reliable, bot-detection-free results.
+
+ ### Parameters:
+ - `query` (str | List[str]): Search query string or list of search queries
+ - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`)
+ - `zone` (str, optional): Your Bright Data zone identifier (default: `None`)
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+        - `country` (str, optional): Two-letter ISO country code for proxy location (default: `""`, no specific country)
+ - `data_format` (str, optional): Additional format transformation (default: `"markdown"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `30`)
+ - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`)
+
+ ### Returns:
+ - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query
+
+ ### Example Usage:
+ ```python
+ # Single search query
+ result = client.search(
+ query="best laptops 2024",
+ search_engine="google",
+ response_format="json"
+ )
+
+ # Multiple search queries
+ queries = ["python tutorials", "machine learning courses", "web development"]
+ results = client.search(
+ query=queries,
+ search_engine="bing",
+ zone="your_zone_name",
+ max_workers=3
+ )
+ ```
+
+ ### Supported Search Engines:
+ - `"google"` - Google Search
+ - `"bing"` - Microsoft Bing
+ - `"yandex"` - Yandex Search
+
+ ### Raises:
+ - `ValidationError`: Invalid search engine, empty query, or validation errors
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+
+ timeout = timeout or self.default_timeout
+ validate_zone_name(zone)
+ validate_search_engine(search_engine)
+ validate_query(query)
+ validate_response_format(response_format)
+ validate_http_method(method)
+ validate_country_code(country)
+ validate_timeout(timeout)
+ validate_max_workers(max_workers)
+
+ base_url_map = {
+ "google": "https://www.google.com/search?q=",
+ "bing": "https://www.bing.com/search?q=",
+ "yandex": "https://yandex.com/search/?text="
+ }
+
+ base_url = base_url_map[search_engine.lower()]
+
+ if isinstance(query, list):
+ effective_max_workers = min(len(query), max_workers or 10)
+ results = [None] * len(query)
+
+ with ThreadPoolExecutor(max_workers=effective_max_workers) as executor:
+ future_to_index = {
+ executor.submit(
+ self._perform_single_search,
+ single_query, zone, response_format, method, country,
+ data_format, async_request, base_url, timeout, parse
+ ): i
+ for i, single_query in enumerate(query)
+ }
+
+ for future in as_completed(future_to_index):
+ index = future_to_index[future]
+ try:
+ result = future.result()
+ results[index] = result
+ except Exception as e:
+ raise APIError(f"Failed to search '{query[index]}': {str(e)}")
+
+ return results
+ else:
+ return self._perform_single_search(
+ query, zone, response_format, method, country,
+ data_format, async_request, base_url, timeout, parse
+ )
+
+ def _perform_single_search(
+ self,
+ query: str,
+ zone: str,
+ response_format: str,
+ method: str,
+ country: str,
+ data_format: str,
+ async_request: bool,
+ base_url: str,
+ timeout: int,
+ parse: bool
+ ) -> Union[Dict[str, Any], str]:
+ """
+ Perform a single search operation
+ """
+ encoded_query = quote_plus(query)
+ url = f"{base_url}{encoded_query}"
+
+ if parse:
+ url += "&brd_json=1"
+
+ endpoint = "https://api.brightdata.com/request"
+
+ payload = {
+ "zone": zone,
+ "url": url,
+ "format": response_format,
+ "method": method,
+ "data_format": data_format
+ }
+
+ params = {}
+ if async_request:
+ params['async'] = 'true'
+
+ @retry_request(
+ max_retries=self.max_retries,
+ backoff_factor=self.retry_backoff,
+ retry_statuses={429, 500, 502, 503, 504}
+ )
+ def make_request():
+ return self.session.post(
+ endpoint,
+ json=payload,
+ params=params,
+ timeout=timeout
+ )
+
+ response = make_request()
+
+ if response.status_code == 200:
+ if response_format == "json":
+ try:
+ return response.json()
+ except json.JSONDecodeError as e:
+ logger.warning(f"Failed to parse JSON response: {e}")
+ return response.text
+ else:
+ return response.text
+
+ elif response.status_code == 400:
+ raise ValidationError(f"Bad Request (400): {response.text}")
+ elif response.status_code == 401:
+ raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+ elif response.status_code == 403:
+ raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+ elif response.status_code == 404:
+ raise APIError(f"Not Found (404): {response.text}")
+ else:
+ raise APIError(f"API Error ({response.status_code}): {response.text}",
+ status_code=response.status_code, response_text=response.text)
\ No newline at end of file
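The SERP request is just a Web Unlocker call whose target URL is a search-engine query string: _perform_single_search builds that URL with quote_plus and, when parse=True, appends brd_json=1 so Bright Data returns parsed JSON. The URL construction on its own, as a sketch:

```python
from urllib.parse import quote_plus

BASE_URL_MAP = {
    "google": "https://www.google.com/search?q=",
    "bing": "https://www.bing.com/search?q=",
    "yandex": "https://yandex.com/search/?text=",
}

def build_serp_url(query: str, search_engine: str = "google", parse: bool = False) -> str:
    """Build the search URL sent to api.brightdata.com/request (sketch)."""
    url = f"{BASE_URL_MAP[search_engine.lower()]}{quote_plus(query)}"
    if parse:
        url += "&brd_json=1"
    return url

# build_serp_url("best laptops 2024", parse=True)
# -> "https://www.google.com/search?q=best+laptops+2024&brd_json=1"
```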
From b5a6625d449de9f874cdc3e50750a9131a71079b Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:16:03 +0200
Subject: [PATCH 06/70] Delete src/api/tst
---
src/api/tst | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 src/api/tst
diff --git a/src/api/tst b/src/api/tst
deleted file mode 100644
index 8b13789..0000000
--- a/src/api/tst
+++ /dev/null
@@ -1 +0,0 @@
-
From bf6605de307c39c64ca0d968d692399353c3cbda Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:16:28 +0200
Subject: [PATCH 07/70] Create tst
---
src/.github/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/.github/tst
diff --git a/src/.github/tst b/src/.github/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/.github/tst
@@ -0,0 +1 @@
+
From 9d17b0dfaae400ea175f80a696f84b6554746702 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:18:00 +0200
Subject: [PATCH 08/70] Create tst
---
src/.github/workflows/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/.github/workflows/tst
diff --git a/src/.github/workflows/tst b/src/.github/workflows/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/.github/workflows/tst
@@ -0,0 +1 @@
+
From cbb11e4067bbc735b33f9a0693d6c27fd90c91d9 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:18:21 +0200
Subject: [PATCH 09/70] Add files via upload
---
src/.github/workflows/publish.yml | 65 +++++++++++++++
src/.github/workflows/test.yml | 129 ++++++++++++++++++++++++++++++
2 files changed, 194 insertions(+)
create mode 100644 src/.github/workflows/publish.yml
create mode 100644 src/.github/workflows/test.yml
diff --git a/src/.github/workflows/publish.yml b/src/.github/workflows/publish.yml
new file mode 100644
index 0000000..7c2ec42
--- /dev/null
+++ b/src/.github/workflows/publish.yml
@@ -0,0 +1,65 @@
+name: Build and Publish
+
+on:
+ push:
+ tags:
+ - 'v*'
+ release:
+ types: [published]
+ workflow_dispatch:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.8'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build twine
+ pip install -r requirements.txt
+
+ - name: Build package
+ run: python -m build
+
+ - name: Upload build artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: dist-files
+ path: dist/
+
+ - name: Publish to PyPI
+ if: github.event_name == 'release'
+ env:
+ TWINE_USERNAME: __token__
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+ run: |
+ twine upload dist/*
+
+ test-install:
+ runs-on: ubuntu-latest
+ needs: build
+
+ steps:
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.8'
+
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: dist-files
+ path: dist/
+
+ - name: Test wheel installation
+ run: |
+ pip install dist/*.whl
+ python -c "import brightdata; print('✅ Package imported successfully')"
\ No newline at end of file
diff --git a/src/.github/workflows/test.yml b/src/.github/workflows/test.yml
new file mode 100644
index 0000000..69a0a2d
--- /dev/null
+++ b/src/.github/workflows/test.yml
@@ -0,0 +1,129 @@
+name: Tests
+
+on:
+ push:
+ branches: [ main, develop ]
+ pull_request:
+ branches: [ main ]
+ schedule:
+ - cron: '0 2 * * *'
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ pip install pytest pytest-cov
+
+ - name: Test package import
+ run: |
+ python -c "import brightdata; print('Import successful')"
+
+ - name: Run tests
+ run: |
+ python -m pytest tests/ -v --cov=brightdata --cov-report=xml
+
+ - name: Upload coverage to Codecov
+ if: matrix.python-version == '3.8'
+ uses: codecov/codecov-action@v3
+ with:
+ file: ./coverage.xml
+
+ test-pypi-package:
+ runs-on: ubuntu-latest
+ if: github.event_name == 'schedule'
+ strategy:
+ matrix:
+ python-version: ['3.8', '3.11']
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install PyPI package
+ run: |
+ python -m pip install --upgrade pip
+ pip install brightdata-sdk
+ pip install pytest
+
+ - name: Test PyPI package import
+ run: |
+ python -c "import brightdata; print('PyPI package import successful')"
+ python -c "from brightdata import bdclient; print('bdclient import successful')"
+
+ - name: Test PyPI package basic functionality
+ run: |
+ python -c "
+ import sys
+ from brightdata import bdclient, __version__
+ print(f'PyPI package version: {__version__}')
+
+ # Test that validation works (accept any validation error as success)
+ try:
+ client = bdclient(api_token='test_token_too_short')
+ print('WARNING: No validation error - this might indicate an issue')
+ except Exception as e:
+ print(f'Validation error caught: {str(e)[:100]}...')
+ print('PyPI package validation working correctly')
+
+ # Test basic client creation with disabled auto-zone creation
+ try:
+ client = bdclient(api_token='test_token_123456789', auto_create_zones=False)
+ print('Client creation successful')
+
+ # Test that basic methods exist
+ methods = ['scrape', 'search', 'download_content']
+ for method in methods:
+ if hasattr(client, method):
+ print(f'Method {method} exists')
+ else:
+ print(f'Method {method} missing (might be version difference)')
+
+ except Exception as e:
+ print(f'ERROR: Client creation failed: {e}')
+ sys.exit(1)
+
+ print('PyPI package basic functionality test completed')
+ "
+
+ - name: Test PyPI package compatibility
+ run: |
+ python -c "
+ print('Running PyPI package compatibility tests...')
+
+ # Test import compatibility
+ try:
+ from brightdata import bdclient, __version__
+ from brightdata.exceptions import ValidationError
+ print('Core imports working')
+ except ImportError as e:
+ print(f'ERROR: Import failed: {e}')
+ exit(1)
+
+ # Test that client requires token
+ try:
+ client = bdclient() # Should fail without token
+ print('WARNING: Client created without token - unexpected')
+ except Exception:
+ print('Token requirement validated')
+
+ print('PyPI package compatibility tests completed')
+ "
\ No newline at end of file
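For quick iteration outside CI, the "Run tests" step above can be reproduced locally. A minimal sketch, assuming pytest and pytest-cov are installed and the repository's tests/ directory is present:

```python
# Local equivalent of the CI test step; assumes pytest and pytest-cov are
# installed and the package is importable as `brightdata`.
import pytest

exit_code = pytest.main(["tests/", "-v", "--cov=brightdata", "--cov-report=xml"])
raise SystemExit(exit_code)
```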
From c590d4942cf1b3c71574e28742215c3ed5355144 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:21:30 +0200
Subject: [PATCH 10/70] Create tst
---
examples/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 examples/tst
diff --git a/examples/tst b/examples/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/examples/tst
@@ -0,0 +1 @@
+
From baea61662b70cec4e73ec89e33e5367c431a141e Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:21:53 +0200
Subject: [PATCH 11/70] Add files via upload
---
examples/browser_connection_example.py | 33 +++++++++++++++++++++
examples/crawl_example.py | 11 +++++++
examples/download_snapshot_example.py | 9 ++++++
examples/extract_example.py | 30 +++++++++++++++++++
examples/scrape_chatgpt_example.py | 15 ++++++++++
examples/scrape_example.py | 16 +++++++++++
examples/scrape_linkedin_example.py | 32 +++++++++++++++++++++
examples/search_example.py | 16 +++++++++++
examples/search_linkedin_example.py | 40 ++++++++++++++++++++++++++
9 files changed, 202 insertions(+)
create mode 100644 examples/browser_connection_example.py
create mode 100644 examples/crawl_example.py
create mode 100644 examples/download_snapshot_example.py
create mode 100644 examples/extract_example.py
create mode 100644 examples/scrape_chatgpt_example.py
create mode 100644 examples/scrape_example.py
create mode 100644 examples/scrape_linkedin_example.py
create mode 100644 examples/search_example.py
create mode 100644 examples/search_linkedin_example.py
diff --git a/examples/browser_connection_example.py b/examples/browser_connection_example.py
new file mode 100644
index 0000000..a6ebf98
--- /dev/null
+++ b/examples/browser_connection_example.py
@@ -0,0 +1,33 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+from playwright.sync_api import sync_playwright, Playwright
+
+client = bdclient(
+ api_token="your-api-key",
+ browser_username="copy-from-zone-configuration",
+ browser_password="copy-from-zone-configuration",
+ browser_zone="your-custom-browser-zone"
+) # Hover over bdclient in your IDE to see all browser parameters (they can also be loaded from a .env file)
+
+def scrape(playwright: Playwright, url="https://example.com"):
+ browser = playwright.chromium.connect_over_cdp(client.connect_browser()) # Connect to the browser using Bright Data's endpoint
+ try:
+ print(f'Connected! Navigating to {url}...')
+ page = browser.new_page()
+ page.goto(url, timeout=2*60_000)
+ print('Navigated! Scraping page content...')
+ data = page.content()
+ print(f'Scraped! Data: {data}')
+ finally:
+ browser.close()
+
+
+def main():
+ with sync_playwright() as playwright:
+ scrape(playwright)
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/examples/crawl_example.py b/examples/crawl_example.py
new file mode 100644
index 0000000..65b2695
--- /dev/null
+++ b/examples/crawl_example.py
@@ -0,0 +1,11 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+client = bdclient(api_token="your-api-key") # can also be taken from .env file
+
+result = client.crawl(
+ url="https://example.com/", depth=1, filter="/product/",
+ exclude_filter="/ads/", custom_output_fields=["markdown", "url", "page_title"]
+)
+print(f"Snapshot ID: {result['snapshot_id']}")
\ No newline at end of file
diff --git a/examples/download_snapshot_example.py b/examples/download_snapshot_example.py
new file mode 100644
index 0000000..ea7f8f0
--- /dev/null
+++ b/examples/download_snapshot_example.py
@@ -0,0 +1,9 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from brightdata import bdclient
+
+client = bdclient(api_token="your-api-key") # can also be taken from .env file
+
+snapshot_id = "" # replace with your snapshot ID
+
+client.download_snapshot(snapshot_id)
\ No newline at end of file
diff --git a/examples/extract_example.py b/examples/extract_example.py
new file mode 100644
index 0000000..0723350
--- /dev/null
+++ b/examples/extract_example.py
@@ -0,0 +1,30 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+
+client = bdclient()
+
+# Basic extraction
+result = client.extract("Extract news headlines from CNN.com")
+print(result)
+
+# Using URL parameter with structured output
+schema = {
+ "type": "object",
+ "properties": {
+ "headlines": {
+ "type": "array",
+ "items": {"type": "string"}
+ }
+ },
+ "required": ["headlines"],
+ "additionalProperties": False
+}
+
+result = client.extract(
+ query="Extract main headlines",
+ url="https://cnn.com",
+ output_scheme=schema
+)
+print(result)
\ No newline at end of file
diff --git a/examples/scrape_chatgpt_example.py b/examples/scrape_chatgpt_example.py
new file mode 100644
index 0000000..b695734
--- /dev/null
+++ b/examples/scrape_chatgpt_example.py
@@ -0,0 +1,15 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+
+client = bdclient("your-api-key") # can also be taken from .env file
+
+result = client.search_chatGPT(
+ prompt="what day is it today?"
+ # prompt=["What are the top 3 programming languages in 2024?", "Best hotels in New York", "Explain quantum computing"],
+ # additional_prompt=["Can you explain why?", "Are you sure?", ""]
+)
+
+client.download_content(result)
+# If a timeout occurs, the snapshot is still created and can be downloaded later by its snapshot ID (see the download_snapshot example)
diff --git a/examples/scrape_example.py b/examples/scrape_example.py
new file mode 100644
index 0000000..bf6b1a8
--- /dev/null
+++ b/examples/scrape_example.py
@@ -0,0 +1,16 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+
+client = bdclient(api_token="your-API-key") # Can also be taken from .env file
+
+URL = (["https://www.amazon.com/dp/B079QHML21",
+ "https://www.ebay.com/itm/365771796300",
+ "https://www.walmart.com/ip/Apple-MacBook-Air-13-3-inch-Laptop-Space-Gray-M1-Chip-8GB-RAM-256GB-storage/609040889"])
+
+results = client.scrape(url=URL, max_workers=5)
+
+result = client.parse_content(results, extract_text=True) # Choose what to extract
+
+print(result)
\ No newline at end of file
diff --git a/examples/scrape_linkedin_example.py b/examples/scrape_linkedin_example.py
new file mode 100644
index 0000000..8483f5a
--- /dev/null
+++ b/examples/scrape_linkedin_example.py
@@ -0,0 +1,32 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+
+client = bdclient() # can also be taken from .env file
+
+# LinkedIn Profile URLs
+profile_url = "https://www.linkedin.com/in/elad-moshe-05a90413/"
+
+# LinkedIn Company URLs
+company_urls = [
+ "https://il.linkedin.com/company/ibm",
+ "https://www.linkedin.com/company/bright-data",
+ "https://www.linkedin.com/company/stalkit"
+]
+
+# LinkedIn Job URLs
+job_urls = [
+ "https://www.linkedin.com/jobs/view/remote-typist-%E2%80%93-data-entry-specialist-work-from-home-at-cwa-group-4181034038?trk=public_jobs_topcard-title",
+ "https://www.linkedin.com/jobs/view/arrt-r-at-shared-imaging-llc-4180989163?trk=public_jobs_topcard-title"
+]
+
+# LinkedIn Post URLs
+post_urls = [
+ "https://www.linkedin.com/posts/orlenchner_scrapecon-activity-7180537307521769472-oSYN?trk=public_profile",
+ "https://www.linkedin.com/pulse/getting-value-out-sunburst-guillaume-de-b%C3%A9naz%C3%A9?trk=public_profile_article_view"
+]
+
+results = client.scrape_linkedin.posts(post_urls)  # can also be run asynchronously
+
+client.download_content(results)
\ No newline at end of file
diff --git a/examples/search_example.py b/examples/search_example.py
new file mode 100644
index 0000000..3b9e3eb
--- /dev/null
+++ b/examples/search_example.py
@@ -0,0 +1,16 @@
+import sys, os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+
+client = bdclient(api_token="your-api-token", auto_create_zones=False, serp_zone="your-custom-serp-zone") # zone and API token can also be defined in .env file
+
+query = ["iphone 16", "coffee maker", "portable projector", "sony headphones",
+ "laptop stand", "power bank", "running shoes", "android tablet",
+ "hiking backpack", "dash cam"]
+
+results = client.search(query, max_workers=10,
+                        response_format="json", parse=True)
+
+client.download_content(results, parse=True) # parse=True to save as JSON, otherwise saves as raw HTML
\ No newline at end of file
diff --git a/examples/search_linkedin_example.py b/examples/search_linkedin_example.py
new file mode 100644
index 0000000..be5f7df
--- /dev/null
+++ b/examples/search_linkedin_example.py
@@ -0,0 +1,40 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from brightdata import bdclient
+
+client = bdclient(api_token="your-api-key") # can also be taken from .env file
+
+# Search LinkedIn profiles by name
+first_names = ["James", "Idan"]
+last_names = ["Smith", "Vilenski"]
+result = client.search_linkedin.profiles(first_names, last_names)
+
+# Search jobs by URL
+job_urls = [
+ "https://www.linkedin.com/jobs/search?keywords=Software&location=Tel%20Aviv-Yafo",
+ "https://www.linkedin.com/jobs/reddit-inc.-jobs-worldwide?f_C=150573"
+]
+result = client.search_linkedin.jobs(url=job_urls)
+
+# Search jobs by keyword and location
+result = client.search_linkedin.jobs(
+ location="Paris",
+ keyword="product manager",
+ country="FR",
+ time_range="Past month",
+ job_type="Full-time"
+)
+
+# Search posts by profile URL with date range
+result = client.search_linkedin.posts(
+ profile_url="https://www.linkedin.com/in/bettywliu",
+ start_date="2018-04-25T00:00:00.000Z",
+ end_date="2021-05-25T00:00:00.000Z"
+)
+# Search posts by company URL
+result = client.search_linkedin.posts(
+ company_url="https://www.linkedin.com/company/bright-data"
+)
+
+# Each call returns a snapshot ID that can be used later to download the content with the download_snapshot() function
\ No newline at end of file
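The LinkedIn search calls above run asynchronously and return snapshot metadata rather than the records themselves. A minimal follow-up sketch, assuming each call returns a dict exposing the snapshot ID under a 'snapshot_id' key (as the crawl example does):

```python
# Hypothetical follow-up: fetch the results of an asynchronous LinkedIn search.
# Assumes `result` holds the dict returned by a search_linkedin call and that it
# contains a 'snapshot_id' key, as in the crawl example.
snapshot_id = result["snapshot_id"]
client.download_snapshot(snapshot_id)
```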
From 468a96f19d94be277cf800ce081a35fbfd7b047d Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:22:22 +0200
Subject: [PATCH 12/70] Delete src/.github/tst
---
src/.github/tst | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 src/.github/tst
diff --git a/src/.github/tst b/src/.github/tst
deleted file mode 100644
index 8b13789..0000000
--- a/src/.github/tst
+++ /dev/null
@@ -1 +0,0 @@
-
From f469f81e3f83fcfa72a7b42276ba64ebd7435440 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:24:44 +0200
Subject: [PATCH 13/70] Create tst
---
tests/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 tests/tst
diff --git a/tests/tst b/tests/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/tst
@@ -0,0 +1 @@
+
From a6a8e5ae925c0bd6c7e80566add2c768862ff069 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:25:03 +0200
Subject: [PATCH 14/70] Add files via upload
---
tests/__init__.py | 0
tests/test_client.py | 121 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 121 insertions(+)
create mode 100644 tests/__init__.py
create mode 100644 tests/test_client.py
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_client.py b/tests/test_client.py
new file mode 100644
index 0000000..51b1315
--- /dev/null
+++ b/tests/test_client.py
@@ -0,0 +1,121 @@
+"""
+Comprehensive tests for the Bright Data SDK client.
+
+This test suite covers:
+- Client initialization with API tokens (from parameter and environment)
+- API token validation and error handling for missing tokens
+- Zone configuration (default and custom zone names)
+- URL validation in scrape method (scheme requirement)
+- Search query validation (empty query handling)
+- Search engine validation (unsupported engine handling)
+
+All tests are designed to run without requiring real API tokens by:
+- Using sufficiently long test tokens to pass validation
+- Mocking zone management to avoid network calls
+- Testing validation logic and error messages
+"""
+
+import pytest
+import os
+from unittest.mock import patch
+
+from brightdata import bdclient
+from brightdata.exceptions import ValidationError
+
+
+class TestBdClient:
+ """Test cases for the main bdclient class"""
+
+ @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones')
+ def test_client_init_with_token(self, mock_zones):
+ """Test client initialization with API token"""
+ with patch.dict(os.environ, {}, clear=True):
+ client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False)
+ assert client.api_token == "valid_test_token_12345678"
+
+ @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones')
+ def test_client_init_from_env(self, mock_zones):
+ """Test client initialization from environment variable"""
+ with patch.dict(os.environ, {"BRIGHTDATA_API_TOKEN": "valid_env_token_12345678"}):
+ client = bdclient(auto_create_zones=False)
+ assert client.api_token == "valid_env_token_12345678"
+
+ def test_client_init_no_token_raises_error(self):
+ """Test that missing API token raises ValidationError"""
+ with patch.dict(os.environ, {}, clear=True):
+ with patch('dotenv.load_dotenv'):
+ with pytest.raises(ValidationError, match="API token is required"):
+ bdclient()
+
+ @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones')
+ def test_client_zone_defaults(self, mock_zones):
+ """Test default zone configurations"""
+ with patch.dict(os.environ, {}, clear=True):
+ client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False)
+ assert client.web_unlocker_zone == "sdk_unlocker"
+ assert client.serp_zone == "sdk_serp"
+
+ @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones')
+ def test_client_custom_zones(self, mock_zones):
+ """Test custom zone configuration"""
+ with patch.dict(os.environ, {}, clear=True):
+ client = bdclient(
+ api_token="valid_test_token_12345678",
+ web_unlocker_zone="custom_unlocker",
+ serp_zone="custom_serp",
+ auto_create_zones=False
+ )
+ assert client.web_unlocker_zone == "custom_unlocker"
+ assert client.serp_zone == "custom_serp"
+
+
+class TestClientMethods:
+ """Test cases for client methods with mocked responses"""
+
+ @pytest.fixture
+ @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones')
+ def client(self, mock_zones):
+ """Create a test client with mocked validation"""
+ with patch.dict(os.environ, {}, clear=True):
+ client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False)
+ return client
+
+ def test_scrape_single_url_validation(self, client):
+ """Test URL validation in scrape method"""
+ with pytest.raises(ValidationError, match="URL must include a scheme"):
+ client.scrape("not_a_url")
+
+ def test_search_empty_query_validation(self, client):
+ """Test query validation in search method"""
+ with pytest.raises(ValidationError, match="cannot be empty"):
+ client.search("")
+
+ def test_search_unsupported_engine(self, client):
+ """Test unsupported search engine validation"""
+ with pytest.raises(ValidationError, match="Invalid search engine"):
+ client.search("test query", search_engine="invalid_engine")
+
+ def test_search_with_parse_parameter(self, client, monkeypatch):
+ """Test search with parse parameter adds brd_json=1 to URL"""
+ # Mock the session.post method to capture the request
+ captured_request = {}
+
+ def mock_post(*args, **kwargs):
+ captured_request.update(kwargs)
+ from unittest.mock import Mock
+ response = Mock()
+ response.status_code = 200
+ response.text = "mocked html response"
+ return response
+
+ monkeypatch.setattr(client.search_api.session, 'post', mock_post)
+
+ result = client.search("test query", parse=True)
+
+ # Verify the request was made with correct URL containing &brd_json=1
+ request_data = captured_request.get('json', {})
+ assert "&brd_json=1" in request_data["url"]
+
+
+if __name__ == "__main__":
+ pytest.main([__file__])
\ No newline at end of file
From 35e632a0593432597280737cfb03584a7b71ff3a Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:25:22 +0200
Subject: [PATCH 15/70] Delete tests/tst
---
tests/tst | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 tests/tst
diff --git a/tests/tst b/tests/tst
deleted file mode 100644
index 8b13789..0000000
--- a/tests/tst
+++ /dev/null
@@ -1 +0,0 @@
-
From b54f9f067e3b7a04561e91ec61a72c0443ee9742 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:26:20 +0200
Subject: [PATCH 16/70] Create tst
---
.github/workflows/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 .github/workflows/tst
diff --git a/.github/workflows/tst b/.github/workflows/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/.github/workflows/tst
@@ -0,0 +1 @@
+
From 6d625f4b0017177206308683c0ac34106fe20a18 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:26:40 +0200
Subject: [PATCH 17/70] Add files via upload
---
.github/workflows/publish.yml | 65 +++++++++++++++++
.github/workflows/test.yml | 129 ++++++++++++++++++++++++++++++++++
2 files changed, 194 insertions(+)
create mode 100644 .github/workflows/publish.yml
create mode 100644 .github/workflows/test.yml
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..7c2ec42
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,65 @@
+name: Build and Publish
+
+on:
+ push:
+ tags:
+ - 'v*'
+ release:
+ types: [published]
+ workflow_dispatch:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.8'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build twine
+ pip install -r requirements.txt
+
+ - name: Build package
+ run: python -m build
+
+ - name: Upload build artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: dist-files
+ path: dist/
+
+ - name: Publish to PyPI
+ if: github.event_name == 'release'
+ env:
+ TWINE_USERNAME: __token__
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+ run: |
+ twine upload dist/*
+
+ test-install:
+ runs-on: ubuntu-latest
+ needs: build
+
+ steps:
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.8'
+
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: dist-files
+ path: dist/
+
+ - name: Test wheel installation
+ run: |
+ pip install dist/*.whl
+ python -c "import brightdata; print('✅ Package imported successfully')"
\ No newline at end of file
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..69a0a2d
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,129 @@
+name: Tests
+
+on:
+ push:
+ branches: [ main, develop ]
+ pull_request:
+ branches: [ main ]
+ schedule:
+ - cron: '0 2 * * *'
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements.txt
+ pip install pytest pytest-cov
+
+ - name: Test package import
+ run: |
+ python -c "import brightdata; print('Import successful')"
+
+ - name: Run tests
+ run: |
+ python -m pytest tests/ -v --cov=brightdata --cov-report=xml
+
+ - name: Upload coverage to Codecov
+ if: matrix.python-version == '3.8'
+ uses: codecov/codecov-action@v3
+ with:
+ file: ./coverage.xml
+
+ test-pypi-package:
+ runs-on: ubuntu-latest
+ if: github.event_name == 'schedule'
+ strategy:
+ matrix:
+ python-version: ['3.8', '3.11']
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install PyPI package
+ run: |
+ python -m pip install --upgrade pip
+ pip install brightdata-sdk
+ pip install pytest
+
+ - name: Test PyPI package import
+ run: |
+ python -c "import brightdata; print('PyPI package import successful')"
+ python -c "from brightdata import bdclient; print('bdclient import successful')"
+
+ - name: Test PyPI package basic functionality
+ run: |
+ python -c "
+ import sys
+ from brightdata import bdclient, __version__
+ print(f'PyPI package version: {__version__}')
+
+ # Test that validation works (accept any validation error as success)
+ try:
+ client = bdclient(api_token='test_token_too_short')
+ print('WARNING: No validation error - this might indicate an issue')
+ except Exception as e:
+ print(f'Validation error caught: {str(e)[:100]}...')
+ print('PyPI package validation working correctly')
+
+ # Test basic client creation with disabled auto-zone creation
+ try:
+ client = bdclient(api_token='test_token_123456789', auto_create_zones=False)
+ print('Client creation successful')
+
+ # Test that basic methods exist
+ methods = ['scrape', 'search', 'download_content']
+ for method in methods:
+ if hasattr(client, method):
+ print(f'Method {method} exists')
+ else:
+ print(f'Method {method} missing (might be version difference)')
+
+ except Exception as e:
+ print(f'ERROR: Client creation failed: {e}')
+ sys.exit(1)
+
+ print('PyPI package basic functionality test completed')
+ "
+
+ - name: Test PyPI package compatibility
+ run: |
+ python -c "
+ print('Running PyPI package compatibility tests...')
+
+ # Test import compatibility
+ try:
+ from brightdata import bdclient, __version__
+ from brightdata.exceptions import ValidationError
+ print('Core imports working')
+ except ImportError as e:
+ print(f'ERROR: Import failed: {e}')
+ exit(1)
+
+ # Test that client requires token
+ try:
+ client = bdclient() # Should fail without token
+ print('WARNING: Client created without token - unexpected')
+ except Exception:
+ print('Token requirement validated')
+
+ print('PyPI package compatibility tests completed')
+ "
\ No newline at end of file
From 3407ff9a10868472e342d88722e52730703d004b Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:26:58 +0200
Subject: [PATCH 18/70] Delete src/.github/workflows directory
---
src/.github/workflows/publish.yml | 65 ---------------
src/.github/workflows/test.yml | 129 ------------------------------
src/.github/workflows/tst | 1 -
3 files changed, 195 deletions(-)
delete mode 100644 src/.github/workflows/publish.yml
delete mode 100644 src/.github/workflows/test.yml
delete mode 100644 src/.github/workflows/tst
diff --git a/src/.github/workflows/publish.yml b/src/.github/workflows/publish.yml
deleted file mode 100644
index 7c2ec42..0000000
--- a/src/.github/workflows/publish.yml
+++ /dev/null
@@ -1,65 +0,0 @@
-name: Build and Publish
-
-on:
- push:
- tags:
- - 'v*'
- release:
- types: [published]
- workflow_dispatch:
-
-jobs:
- build:
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v4
-
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: '3.8'
-
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install build twine
- pip install -r requirements.txt
-
- - name: Build package
- run: python -m build
-
- - name: Upload build artifacts
- uses: actions/upload-artifact@v4
- with:
- name: dist-files
- path: dist/
-
- - name: Publish to PyPI
- if: github.event_name == 'release'
- env:
- TWINE_USERNAME: __token__
- TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
- run: |
- twine upload dist/*
-
- test-install:
- runs-on: ubuntu-latest
- needs: build
-
- steps:
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: '3.8'
-
- - name: Download build artifacts
- uses: actions/download-artifact@v4
- with:
- name: dist-files
- path: dist/
-
- - name: Test wheel installation
- run: |
- pip install dist/*.whl
- python -c "import brightdata; print('✅ Package imported successfully')"
\ No newline at end of file
diff --git a/src/.github/workflows/test.yml b/src/.github/workflows/test.yml
deleted file mode 100644
index 69a0a2d..0000000
--- a/src/.github/workflows/test.yml
+++ /dev/null
@@ -1,129 +0,0 @@
-name: Tests
-
-on:
- push:
- branches: [ main, develop ]
- pull_request:
- branches: [ main ]
- schedule:
- - cron: '0 2 * * *'
-
-jobs:
- test:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
-
- steps:
- - uses: actions/checkout@v4
-
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r requirements.txt
- pip install pytest pytest-cov
-
- - name: Test package import
- run: |
- python -c "import brightdata; print('Import successful')"
-
- - name: Run tests
- run: |
- python -m pytest tests/ -v --cov=brightdata --cov-report=xml
-
- - name: Upload coverage to Codecov
- if: matrix.python-version == '3.8'
- uses: codecov/codecov-action@v3
- with:
- file: ./coverage.xml
-
- test-pypi-package:
- runs-on: ubuntu-latest
- if: github.event_name == 'schedule'
- strategy:
- matrix:
- python-version: ['3.8', '3.11']
-
- steps:
- - uses: actions/checkout@v4
-
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install PyPI package
- run: |
- python -m pip install --upgrade pip
- pip install brightdata-sdk
- pip install pytest
-
- - name: Test PyPI package import
- run: |
- python -c "import brightdata; print('PyPI package import successful')"
- python -c "from brightdata import bdclient; print('bdclient import successful')"
-
- - name: Test PyPI package basic functionality
- run: |
- python -c "
- import sys
- from brightdata import bdclient, __version__
- print(f'PyPI package version: {__version__}')
-
- # Test that validation works (accept any validation error as success)
- try:
- client = bdclient(api_token='test_token_too_short')
- print('WARNING: No validation error - this might indicate an issue')
- except Exception as e:
- print(f'Validation error caught: {str(e)[:100]}...')
- print('PyPI package validation working correctly')
-
- # Test basic client creation with disabled auto-zone creation
- try:
- client = bdclient(api_token='test_token_123456789', auto_create_zones=False)
- print('Client creation successful')
-
- # Test that basic methods exist
- methods = ['scrape', 'search', 'download_content']
- for method in methods:
- if hasattr(client, method):
- print(f'Method {method} exists')
- else:
- print(f'Method {method} missing (might be version difference)')
-
- except Exception as e:
- print(f'ERROR: Client creation failed: {e}')
- sys.exit(1)
-
- print('PyPI package basic functionality test completed')
- "
-
- - name: Test PyPI package compatibility
- run: |
- python -c "
- print('Running PyPI package compatibility tests...')
-
- # Test import compatibility
- try:
- from brightdata import bdclient, __version__
- from brightdata.exceptions import ValidationError
- print('Core imports working')
- except ImportError as e:
- print(f'ERROR: Import failed: {e}')
- exit(1)
-
- # Test that client requires token
- try:
- client = bdclient() # Should fail without token
- print('WARNING: Client created without token - unexpected')
- except Exception:
- print('Token requirement validated')
-
- print('PyPI package compatibility tests completed')
- "
\ No newline at end of file
diff --git a/src/.github/workflows/tst b/src/.github/workflows/tst
deleted file mode 100644
index 8b13789..0000000
--- a/src/.github/workflows/tst
+++ /dev/null
@@ -1 +0,0 @@
-
From e7a5e316efa02f6b447b30009f070bc5d3c8aac7 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:27:17 +0200
Subject: [PATCH 19/70] Delete .github/workflows/tst
---
.github/workflows/tst | 1 -
1 file changed, 1 deletion(-)
delete mode 100644 .github/workflows/tst
diff --git a/.github/workflows/tst b/.github/workflows/tst
deleted file mode 100644
index 8b13789..0000000
--- a/.github/workflows/tst
+++ /dev/null
@@ -1 +0,0 @@
-
From 35f434b0a5ad6400ae37e34da642b78cf6b49a16 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:35:08 +0200
Subject: [PATCH 20/70] Create tst
---
src/utils/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/utils/tst
diff --git a/src/utils/tst b/src/utils/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/utils/tst
@@ -0,0 +1 @@
+
From 916467aa1209d846d639c19d88ca5803cb29e225 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:35:31 +0200
Subject: [PATCH 21/70] Add files via upload
---
src/utils/__init__.py | 35 +++++
src/utils/logging_config.py | 177 +++++++++++++++++++++
src/utils/parser.py | 264 ++++++++++++++++++++++++++++++++
src/utils/response_validator.py | 49 ++++++
src/utils/retry.py | 90 +++++++++++
src/utils/validation.py | 183 ++++++++++++++++++++++
src/utils/zone_manager.py | 174 +++++++++++++++++++++
7 files changed, 972 insertions(+)
create mode 100644 src/utils/__init__.py
create mode 100644 src/utils/logging_config.py
create mode 100644 src/utils/parser.py
create mode 100644 src/utils/response_validator.py
create mode 100644 src/utils/retry.py
create mode 100644 src/utils/validation.py
create mode 100644 src/utils/zone_manager.py
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..75a2f6c
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,35 @@
+from .validation import (
+ validate_url, validate_zone_name, validate_country_code,
+ validate_timeout, validate_max_workers, validate_url_list,
+ validate_search_engine, validate_query, validate_response_format,
+ validate_http_method
+)
+from .retry import retry_request
+from .zone_manager import ZoneManager
+from .logging_config import setup_logging, get_logger, log_request
+from .response_validator import safe_json_parse, validate_response_size, check_response_not_empty
+from .parser import parse_content, parse_multiple, extract_structured_data
+
+__all__ = [
+ 'validate_url',
+ 'validate_zone_name',
+ 'validate_country_code',
+ 'validate_timeout',
+ 'validate_max_workers',
+ 'validate_url_list',
+ 'validate_search_engine',
+ 'validate_query',
+ 'validate_response_format',
+ 'validate_http_method',
+ 'retry_request',
+ 'ZoneManager',
+ 'setup_logging',
+ 'get_logger',
+ 'log_request',
+ 'safe_json_parse',
+ 'validate_response_size',
+ 'check_response_not_empty',
+ 'parse_content',
+ 'parse_multiple',
+ 'extract_structured_data'
+]
\ No newline at end of file
diff --git a/src/utils/logging_config.py b/src/utils/logging_config.py
new file mode 100644
index 0000000..89289da
--- /dev/null
+++ b/src/utils/logging_config.py
@@ -0,0 +1,177 @@
+"""
+Structured logging configuration for Bright Data SDK
+"""
+import logging
+import json
+import time
+from typing import Dict, Any
+import uuid
+
+
+class StructuredFormatter(logging.Formatter):
+ """Custom formatter that outputs structured JSON logs"""
+
+ def __init__(self):
+ super().__init__()
+ self.start_time = time.time()
+
+ def format(self, record):
+ log_data = {
+ 'timestamp': self.formatTime(record),
+ 'level': record.levelname,
+ 'logger': record.name,
+ 'message': record.getMessage(),
+ 'module': record.module,
+ 'function': record.funcName,
+ 'line': record.lineno
+ }
+
+ correlation_id = getattr(record, 'correlation_id', None)
+ if correlation_id:
+ log_data['correlation_id'] = correlation_id
+
+ if hasattr(record, 'url'):
+ log_data['url'] = record.url
+ if hasattr(record, 'method'):
+ log_data['method'] = record.method
+ if hasattr(record, 'status_code'):
+ log_data['status_code'] = record.status_code
+ if hasattr(record, 'response_time'):
+ log_data['response_time_ms'] = record.response_time
+
+ if record.exc_info:
+ log_data['exception'] = {
+ 'type': record.exc_info[0].__name__ if record.exc_info[0] else None,
+ 'message': str(record.exc_info[1]) if record.exc_info[1] else None,
+ 'traceback': self.formatException(record.exc_info)
+ }
+
+ log_data = self._sanitize_log_data(log_data)
+
+ return json.dumps(log_data, default=str)
+
+ def _sanitize_log_data(self, log_data: Dict[str, Any]) -> Dict[str, Any]:
+ """Remove or mask sensitive information from log data"""
+ sensitive_keys = ['authorization', 'token', 'api_token', 'password', 'secret']
+
+ def sanitize_value(key: str, value: Any) -> Any:
+ if isinstance(key, str) and any(sensitive in key.lower() for sensitive in sensitive_keys):
+ return "***REDACTED***"
+ elif isinstance(value, str) and len(value) > 20:
+ if value.isalnum() and len(value) > 32:
+ return f"{value[:8]}***REDACTED***{value[-4:]}"
+ return value
+
+ def recursive_sanitize(obj):
+ if isinstance(obj, dict):
+ return {k: recursive_sanitize(sanitize_value(k, v)) for k, v in obj.items()}
+ elif isinstance(obj, list):
+ return [recursive_sanitize(item) for item in obj]
+ else:
+ return obj
+
+ return recursive_sanitize(log_data)
+
+
+def setup_logging(level: str = "INFO", structured: bool = True, verbose: bool = True) -> None:
+ """
+ Setup logging configuration for the SDK
+
+ Args:
+ level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+ structured: Whether to use structured JSON logging
+ verbose: Whether to show verbose logging (default: True)
+ When False, only WARNING and above are shown
+ When True, uses the specified level
+ """
+ if not verbose:
+ log_level = logging.WARNING
+ else:
+ log_level = getattr(logging, level.upper(), logging.INFO)
+
+ root_logger = logging.getLogger('brightdata')
+ root_logger.handlers.clear()
+
+ handler = logging.StreamHandler()
+ handler.setLevel(log_level)
+
+ if structured:
+ formatter = StructuredFormatter()
+ else:
+ formatter = logging.Formatter(
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+
+ handler.setFormatter(formatter)
+ root_logger.addHandler(handler)
+ root_logger.setLevel(log_level)
+
+ root_logger.propagate = False
+
+
+def get_logger(name: str) -> logging.Logger:
+ """
+ Get a logger instance with the specified name
+
+ Args:
+ name: Logger name
+
+ Returns:
+ Configured logger instance
+ """
+ return logging.getLogger(f'brightdata.{name}')
+
+
+def log_request(logger: logging.Logger, method: str, url: str,
+ status_code: int = None, response_time: float = None,
+ correlation_id: str = None) -> None:
+ """
+ Log HTTP request details
+
+ Args:
+ logger: Logger instance
+ method: HTTP method
+ url: Request URL (will be sanitized)
+ status_code: HTTP response status code
+ response_time: Response time in milliseconds
+ correlation_id: Request correlation ID
+ """
+ extra = {
+ 'method': method,
+ 'url': _sanitize_url(url),
+ 'correlation_id': correlation_id or str(uuid.uuid4())
+ }
+
+ if status_code is not None:
+ extra['status_code'] = status_code
+ if response_time is not None:
+ extra['response_time'] = response_time
+
+ if status_code and status_code >= 400:
+ logger.error(f"HTTP request failed: {method} {_sanitize_url(url)}", extra=extra)
+ else:
+ logger.info(f"HTTP request: {method} {_sanitize_url(url)}", extra=extra)
+
+
+def _sanitize_url(url: str) -> str:
+ """Sanitize URL to remove sensitive query parameters"""
+ try:
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
+
+ parsed = urlparse(url)
+ query_params = parse_qs(parsed.query)
+
+ sensitive_params = ['token', 'api_key', 'secret', 'password']
+ for param in sensitive_params:
+ if param in query_params:
+ query_params[param] = ['***REDACTED***']
+
+ sanitized_query = urlencode(query_params, doseq=True)
+ sanitized = urlunparse((
+ parsed.scheme, parsed.netloc, parsed.path,
+ parsed.params, sanitized_query, parsed.fragment
+ ))
+
+ return sanitized
+ except Exception:
+ return url.split('?')[0] + ('?***PARAMS_REDACTED***' if '?' in url else '')
\ No newline at end of file
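A short usage sketch for the logging helpers above, assuming the module is importable as brightdata.utils.logging_config once the package is installed:

```python
# Minimal sketch: structured JSON logging with sanitized request details.
# The import path is an assumption based on this repository layout (src/utils).
from brightdata.utils.logging_config import setup_logging, get_logger, log_request

setup_logging(level="DEBUG", structured=True)  # JSON-formatted log records
logger = get_logger("example")

# Emits a structured record; the token query parameter is redacted by _sanitize_url.
log_request(logger, "GET", "https://example.com/data?token=secret123",
            status_code=200, response_time=123.4)
```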
diff --git a/src/utils/parser.py b/src/utils/parser.py
new file mode 100644
index 0000000..686ad39
--- /dev/null
+++ b/src/utils/parser.py
@@ -0,0 +1,264 @@
+"""
+Content parsing utilities for Bright Data SDK responses
+
+Provides functions to extract and parse content from scraping and search results.
+"""
+import json
+import re
+from typing import Any, Dict, List, Union, Optional
+
+from bs4 import BeautifulSoup
+
+
+def parse_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+ """
+ Parse content from Bright Data API responses
+
+ Automatically detects and handles both single and multiple results from scrape/search operations.
+ Can be used as a standalone function or called from the client.
+
+ Args:
+ data: Response data from scrape() or search() - can be JSON dict/list or HTML string
+ extract_text: Extract clean text content (default: True)
+ extract_links: Extract all links from content (default: False)
+ extract_images: Extract image URLs from content (default: False)
+
+ Returns:
+ Dict containing parsed content for single results, or List[Dict] for multiple results with keys:
+ - 'type': 'json' or 'html'
+ - 'text': Cleaned text content (if extract_text=True)
+ - 'links': List of extracted links (if extract_links=True)
+ - 'images': List of image URLs (if extract_images=True)
+ - 'title': Page title (if available)
+ - 'raw_length': Length of original content
+ - 'structured_data': Original JSON data (if type='json')
+ """
+ if _is_multiple_results(data):
+ return parse_multiple(data, extract_text=extract_text, extract_links=extract_links, extract_images=extract_images)
+
+ return _parse_single_content(data, extract_text, extract_links, extract_images)
+
+
+def parse_multiple(data_list: List[Union[str, Dict]], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> List[Dict[str, Any]]:
+ """
+ Parse multiple content items (useful for batch scraping results)
+
+ Args:
+ data_list: List of response data items
+ extract_text: Extract clean text content (default: True)
+ extract_links: Extract all links from content (default: False)
+ extract_images: Extract image URLs from content (default: False)
+
+ Returns:
+ List of parsed content dictionaries
+ """
+ if not isinstance(data_list, list):
+ return []
+
+ return [_parse_single_content(item, extract_text, extract_links, extract_images) for item in data_list]
+
+
+def _is_multiple_results(data: Union[str, Dict, List]) -> bool:
+ """
+ Detect if data contains multiple scraping/search results
+
+ Args:
+ data: Response data to analyze
+
+ Returns:
+ True if data appears to be multiple results, False otherwise
+ """
+ if not isinstance(data, list):
+ return False
+
+ if len(data) <= 1:
+ return False
+
+ multiple_result_indicators = 0
+
+ for item in data[:3]:
+ if isinstance(item, dict):
+ common_keys = {'html', 'body', 'content', 'page_html', 'raw_html', 'url', 'status_code'}
+ if any(key in item for key in common_keys):
+ multiple_result_indicators += 1
+ elif isinstance(item, str) and len(item) > 100:
+            if '<html' in item.lower() or '<body' in item.lower():
+                multiple_result_indicators += 1
+
+    return multiple_result_indicators >= 2
+
+
+def _parse_single_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Dict[str, Any]:
+ """
+ Parse single content item from Bright Data API responses
+
+ Args:
+ data: Single response data item - can be JSON dict or HTML string
+ extract_text: Extract clean text content (default: True)
+ extract_links: Extract all links from content (default: False)
+ extract_images: Extract image URLs from content (default: False)
+
+ Returns:
+ Dict containing parsed content
+ """
+ result = {
+ 'type': None,
+ 'raw_length': 0,
+ 'title': None
+ }
+
+ if data is None:
+ return result
+
+ if isinstance(data, (dict, list)):
+ result['type'] = 'json'
+ result['structured_data'] = data
+ result['raw_length'] = len(str(data))
+
+ html_content = _extract_html_from_json(data)
+ if html_content and (extract_text or extract_links or extract_images):
+ _parse_html_content(html_content, result, extract_text, extract_links, extract_images)
+
+ result['title'] = _extract_title_from_json(data)
+
+ elif isinstance(data, str):
+ result['type'] = 'html'
+ result['raw_length'] = len(data)
+
+ if extract_text or extract_links or extract_images:
+ _parse_html_content(data, result, extract_text, extract_links, extract_images)
+
+ return result
+
+
+def extract_structured_data(data: Union[str, Dict, List]) -> Optional[Dict]:
+ """
+ Extract structured data (JSON-LD, microdata) from content
+
+ Args:
+ data: Response data
+
+ Returns:
+ Structured data if found, None otherwise
+ """
+ html_content = None
+
+ if isinstance(data, str):
+ html_content = data
+ elif isinstance(data, (dict, list)):
+ html_content = _extract_html_from_json(data)
+
+ if not html_content:
+ return None
+
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+
+ scripts = soup.find_all('script', type='application/ld+json')
+ if scripts:
+ structured_data = []
+ for script in scripts:
+ try:
+ data = json.loads(script.string)
+ structured_data.append(data)
+ except json.JSONDecodeError:
+ continue
+ if structured_data:
+ return {'json_ld': structured_data}
+
+ except Exception:
+ pass
+
+ return None
+
+
+def _extract_html_from_json(data: Union[Dict, List]) -> Optional[str]:
+ """Extract HTML content from JSON response structure"""
+ if isinstance(data, dict):
+ html_keys = ['html', 'body', 'content', 'page_html', 'raw_html']
+ for key in html_keys:
+ if key in data and isinstance(data[key], str):
+ return data[key]
+
+ for value in data.values():
+ if isinstance(value, (dict, list)):
+ html = _extract_html_from_json(value)
+ if html:
+ return html
+
+ elif isinstance(data, list):
+ for item in data:
+ if isinstance(item, (dict, list)):
+ html = _extract_html_from_json(item)
+ if html:
+ return html
+
+ return None
+
+
+def _extract_title_from_json(data: Union[Dict, List]) -> Optional[str]:
+ """Extract title from JSON response structure"""
+ if isinstance(data, dict):
+ title_keys = ['title', 'page_title', 'name']
+ for key in title_keys:
+ if key in data and isinstance(data[key], str):
+ return data[key].strip()
+
+ for value in data.values():
+ if isinstance(value, (dict, list)):
+ title = _extract_title_from_json(value)
+ if title:
+ return title
+
+ elif isinstance(data, list):
+ for item in data:
+ if isinstance(item, (dict, list)):
+ title = _extract_title_from_json(item)
+ if title:
+ return title
+
+ return None
+
+
+def _parse_html_content(html: str, result: Dict, extract_text: bool, extract_links: bool, extract_images: bool):
+ """Parse HTML content and update result dictionary"""
+ try:
+ soup = BeautifulSoup(html, 'html.parser')
+
+ if not result.get('title'):
+ title_tag = soup.find('title')
+ if title_tag:
+ result['title'] = title_tag.get_text().strip()
+
+ if extract_text:
+ for script in soup(["script", "style"]):
+ script.decompose()
+
+ text = soup.get_text()
+ lines = (line.strip() for line in text.splitlines())
+ chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+ result['text'] = '\n'.join(chunk for chunk in chunks if chunk)
+
+ if extract_links:
+ links = []
+ for a_tag in soup.find_all('a', href=True):
+ href = a_tag['href']
+ text = a_tag.get_text().strip()
+ links.append({'url': href, 'text': text})
+ result['links'] = links
+
+ if extract_images:
+ images = []
+ for img_tag in soup.find_all('img', src=True):
+ src = img_tag['src']
+ alt = img_tag.get('alt', '').strip()
+ images.append({'url': src, 'alt': alt})
+ result['images'] = images
+
+ except Exception as e:
+ if extract_text:
+ result['text'] = f"HTML parsing failed: {str(e)}"
+ if extract_links:
+ result['links'] = []
+ if extract_images:
+ result['images'] = []
\ No newline at end of file
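A short usage sketch for parse_content, with the output keys following the docstring above; the import path is an assumption based on this repository layout:

```python
# Minimal sketch: parse a raw HTML response and extract text plus links.
from brightdata.utils.parser import parse_content  # assumed install path

html = "<html><head><title>Demo</title></head><body><a href='/x'>link</a>Hello</body></html>"
parsed = parse_content(html, extract_text=True, extract_links=True)

print(parsed["type"])    # 'html'
print(parsed["title"])   # 'Demo'
print(parsed["links"])   # [{'url': '/x', 'text': 'link'}]
print(parsed["text"])    # cleaned text content
```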
diff --git a/src/utils/response_validator.py b/src/utils/response_validator.py
new file mode 100644
index 0000000..83a9aa7
--- /dev/null
+++ b/src/utils/response_validator.py
@@ -0,0 +1,49 @@
+"""
+Minimal response validation utilities for Bright Data SDK
+"""
+import json
+from typing import Any, Dict, Union
+from ..exceptions import ValidationError
+
+
+def safe_json_parse(response_text: str) -> Union[Dict[str, Any], str]:
+ """
+ Safely parse JSON response with minimal validation
+
+ Args:
+ response_text: Raw response text from API
+
+ Returns:
+ Parsed JSON data or original text if parsing fails
+ """
+ if not response_text:
+ return {}
+
+ try:
+ return json.loads(response_text)
+ except (json.JSONDecodeError, TypeError):
+ # Return original text if JSON parsing fails
+ return response_text
+
+
+def validate_response_size(response_text: str, max_size_mb: float = 100.0) -> None:
+ """
+ Quick size check to prevent memory issues
+
+ Args:
+ response_text: Response text to validate
+ max_size_mb: Maximum allowed size in megabytes
+ """
+ if response_text and len(response_text) > (max_size_mb * 1024 * 1024):
+ raise ValidationError(f"Response too large (>{max_size_mb}MB)")
+
+
+def check_response_not_empty(data: Any) -> None:
+ """
+ Minimal check that response contains data
+
+ Args:
+ data: Response data to check
+ """
+ if data is None or (isinstance(data, str) and len(data.strip()) == 0):
+ raise ValidationError("Empty response received")
\ No newline at end of file
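A short usage sketch for the response validators above (import path assumed from the repository layout):

```python
# Minimal sketch: size/emptiness checks followed by tolerant JSON parsing.
from brightdata.utils.response_validator import (
    safe_json_parse, validate_response_size, check_response_not_empty
)

raw = '{"status": "ok"}'
validate_response_size(raw, max_size_mb=1.0)  # raises ValidationError if oversized
check_response_not_empty(raw)                 # raises ValidationError if empty
data = safe_json_parse(raw)                   # dict on success, original text otherwise
print(data)
```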
diff --git a/src/utils/retry.py b/src/utils/retry.py
new file mode 100644
index 0000000..361645a
--- /dev/null
+++ b/src/utils/retry.py
@@ -0,0 +1,90 @@
+import time
+import random
+import requests
+from functools import wraps
+from ..exceptions import NetworkError, APIError
+
+
+def retry_request(max_retries=3, backoff_factor=1.5, retry_statuses=None, max_backoff=60):
+ """
+ Decorator for retrying requests with exponential backoff and jitter
+
+ Args:
+ max_retries: Maximum number of retry attempts
+ backoff_factor: Exponential backoff multiplier
+ retry_statuses: HTTP status codes that should trigger retries
+ max_backoff: Maximum backoff time in seconds
+ """
+ if retry_statuses is None:
+ retry_statuses = {429, 500, 502, 503, 504}
+
+ def decorator(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ last_exception = None
+
+ for attempt in range(max_retries + 1): # +1 to include initial attempt
+ try:
+ response = func(*args, **kwargs)
+
+ # Check if we should retry based on status code
+ if hasattr(response, 'status_code') and response.status_code in retry_statuses:
+ if attempt >= max_retries:
+ raise APIError(
+ f"Server error after {max_retries} retries: HTTP {response.status_code}",
+ status_code=response.status_code,
+ response_text=getattr(response, 'text', '')
+ )
+
+ # Calculate backoff with jitter
+ backoff_time = min(backoff_factor ** attempt, max_backoff)
+ jitter = backoff_time * 0.1 * random.random() # Add up to 10% jitter
+ total_delay = backoff_time + jitter
+
+ time.sleep(total_delay)
+ continue
+
+ return response
+
+ except requests.exceptions.ConnectTimeout as e:
+ last_exception = NetworkError(f"Connection timeout: {str(e)}")
+ except requests.exceptions.ReadTimeout as e:
+ last_exception = NetworkError(f"Read timeout: {str(e)}")
+ except requests.exceptions.Timeout as e:
+ last_exception = NetworkError(f"Request timeout: {str(e)}")
+ except requests.exceptions.ConnectionError as e:
+ # Handle DNS resolution, connection refused, etc.
+ if "Name or service not known" in str(e):
+ last_exception = NetworkError(f"DNS resolution failed: {str(e)}")
+ elif "Connection refused" in str(e):
+ last_exception = NetworkError(f"Connection refused: {str(e)}")
+ else:
+ last_exception = NetworkError(f"Connection error: {str(e)}")
+ except requests.exceptions.SSLError as e:
+ last_exception = NetworkError(f"SSL/TLS error: {str(e)}")
+ except requests.exceptions.ProxyError as e:
+ last_exception = NetworkError(f"Proxy error: {str(e)}")
+ except requests.exceptions.RequestException as e:
+ last_exception = NetworkError(f"Network error: {str(e)}")
+ except Exception as e:
+ # Catch any other unexpected exceptions
+ last_exception = NetworkError(f"Unexpected error: {str(e)}")
+
+ # If this was the last attempt, raise the exception
+ if attempt >= max_retries:
+ raise last_exception
+
+ # Calculate backoff with jitter for network errors
+ backoff_time = min(backoff_factor ** attempt, max_backoff)
+ jitter = backoff_time * 0.1 * random.random()
+ total_delay = backoff_time + jitter
+
+ time.sleep(total_delay)
+
+ # This should never be reached, but just in case
+ if last_exception:
+ raise last_exception
+ return None
+
+ return wrapper
+ return decorator
\ No newline at end of file
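A short usage sketch for the retry_request decorator, wrapping a plain requests call so that retryable status codes and transient network errors are retried with exponential backoff (import path assumed from the repository layout):

```python
# Minimal sketch: retry a GET request on 429/5xx responses and network errors.
import requests
from brightdata.utils.retry import retry_request  # assumed install path

@retry_request(max_retries=3, backoff_factor=1.5)
def fetch(url):
    return requests.get(url, timeout=10)

response = fetch("https://example.com")
print(response.status_code)
```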
diff --git a/src/utils/validation.py b/src/utils/validation.py
new file mode 100644
index 0000000..938cb43
--- /dev/null
+++ b/src/utils/validation.py
@@ -0,0 +1,183 @@
+from urllib.parse import urlparse
+from typing import Union, List
+from ..exceptions import ValidationError
+
+
+def validate_url(url: str) -> None:
+ """Validate URL format with comprehensive checks"""
+ if not isinstance(url, str):
+ raise ValidationError(f"URL must be a string, got {type(url).__name__}")
+
+ if not url.strip():
+ raise ValidationError("URL cannot be empty or whitespace")
+
+ # Check URL length
+ if len(url) > 8192: # Common URL length limit
+ raise ValidationError("URL exceeds maximum length of 8192 characters")
+
+ try:
+ parsed = urlparse(url.strip())
+ if not parsed.scheme:
+ raise ValidationError(f"URL must include a scheme (http/https): {url}")
+ if parsed.scheme.lower() not in ['http', 'https']:
+ raise ValidationError(f"URL scheme must be http or https, got: {parsed.scheme}")
+ if not parsed.netloc:
+ raise ValidationError(f"URL must include a valid domain: {url}")
+ # Check for suspicious characters
+ if any(char in url for char in ['<', '>', '"', "'"]):
+ raise ValidationError("URL contains invalid characters")
+ except Exception as e:
+ if isinstance(e, ValidationError):
+ raise
+ raise ValidationError(f"Invalid URL format '{url}': {str(e)}")
+
+
+def validate_zone_name(zone: str = None) -> None:
+ """Validate zone name format with enhanced checks"""
+ if zone is None:
+ return # Zone can be None (optional parameter)
+
+ if not isinstance(zone, str):
+ raise ValidationError(f"Zone name must be a string, got {type(zone).__name__}")
+
+ zone = zone.strip()
+ if not zone:
+ raise ValidationError("Zone name cannot be empty or whitespace")
+
+ if len(zone) < 3:
+ raise ValidationError("Zone name must be at least 3 characters long")
+
+ if len(zone) > 63:
+ raise ValidationError("Zone name must not exceed 63 characters")
+
+ if not zone.replace('_', '').replace('-', '').isalnum():
+ raise ValidationError("Zone name can only contain letters, numbers, hyphens, and underscores")
+
+ if zone.startswith('-') or zone.endswith('-'):
+ raise ValidationError("Zone name cannot start or end with a hyphen")
+
+ if zone.startswith('_') or zone.endswith('_'):
+ raise ValidationError("Zone name cannot start or end with an underscore")
+
+
+def validate_country_code(country: str) -> None:
+ """Validate ISO country code format"""
+ if not isinstance(country, str):
+ raise ValidationError(f"Country code must be a string, got {type(country).__name__}")
+
+ country = country.strip().lower()
+ if len(country) == 0:
+ return
+
+ if len(country) != 2:
+ raise ValidationError("Country code must be exactly 2 characters (ISO 3166-1 alpha-2) or empty")
+
+ if not country.isalpha():
+ raise ValidationError("Country code must contain only letters")
+
+
+def validate_timeout(timeout: int) -> None:
+ """Validate timeout value"""
+ if timeout is None:
+ return # Timeout can be None (use default)
+
+ if not isinstance(timeout, int):
+ raise ValidationError(f"Timeout must be an integer, got {type(timeout).__name__}")
+
+ if timeout <= 0:
+ raise ValidationError("Timeout must be greater than 0 seconds")
+
+ if timeout > 300: # 5 minutes max
+ raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)")
+
+
+def validate_max_workers(max_workers: int) -> None:
+ """Validate max_workers parameter"""
+ if max_workers is None:
+ return # Can be None (use default)
+
+ if not isinstance(max_workers, int):
+ raise ValidationError(f"max_workers must be an integer, got {type(max_workers).__name__}")
+
+ if max_workers <= 0:
+ raise ValidationError("max_workers must be greater than 0")
+
+ if max_workers > 50: # Reasonable upper limit
+ raise ValidationError("max_workers cannot exceed 50 (to prevent resource exhaustion)")
+
+
+def validate_url_list(urls: List[str], max_urls: int = 100) -> None:
+ """Validate list of URLs with size limits"""
+ if not isinstance(urls, list):
+ raise ValidationError(f"URL list must be a list, got {type(urls).__name__}")
+
+ if len(urls) == 0:
+ raise ValidationError("URL list cannot be empty")
+
+ if len(urls) > max_urls:
+ raise ValidationError(f"URL list cannot contain more than {max_urls} URLs")
+
+ for i, url in enumerate(urls):
+ try:
+ validate_url(url)
+ except ValidationError as e:
+ raise ValidationError(f"Invalid URL at index {i}: {str(e)}")
+
+
+def validate_search_engine(search_engine: str) -> None:
+ """Validate search engine parameter"""
+ if not isinstance(search_engine, str):
+ raise ValidationError(f"Search engine must be a string, got {type(search_engine).__name__}")
+
+ valid_engines = ['google', 'bing', 'yandex']
+ search_engine = search_engine.strip().lower()
+
+ if search_engine not in valid_engines:
+ raise ValidationError(f"Invalid search engine '{search_engine}'. Valid options: {', '.join(valid_engines)}")
+
+
+def validate_query(query: Union[str, List[str]]) -> None:
+ """Validate search query parameter"""
+ if isinstance(query, str):
+ if not query.strip():
+ raise ValidationError("Search query cannot be empty or whitespace")
+ if len(query) > 2048:
+ raise ValidationError("Search query cannot exceed 2048 characters")
+ elif isinstance(query, list):
+ if len(query) == 0:
+ raise ValidationError("Query list cannot be empty")
+ if len(query) > 50: # Reasonable limit
+ raise ValidationError("Query list cannot contain more than 50 queries")
+ for i, q in enumerate(query):
+ if not isinstance(q, str):
+ raise ValidationError(f"Query at index {i} must be a string, got {type(q).__name__}")
+ if not q.strip():
+ raise ValidationError(f"Query at index {i} cannot be empty or whitespace")
+ if len(q) > 2048:
+ raise ValidationError(f"Query at index {i} cannot exceed 2048 characters")
+ else:
+ raise ValidationError(f"Query must be a string or list of strings, got {type(query).__name__}")
+
+
+def validate_response_format(response_format: str) -> None:
+ """Validate response format parameter"""
+ if not isinstance(response_format, str):
+ raise ValidationError(f"Response format must be a string, got {type(response_format).__name__}")
+
+ valid_formats = ['json', 'raw']
+ response_format = response_format.strip().lower()
+
+ if response_format not in valid_formats:
+ raise ValidationError(f"Invalid response format '{response_format}'. Valid options: {', '.join(valid_formats)}")
+
+
+def validate_http_method(method: str) -> None:
+ """Validate HTTP method parameter"""
+ if not isinstance(method, str):
+ raise ValidationError(f"HTTP method must be a string, got {type(method).__name__}")
+
+ valid_methods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH']
+ method = method.strip().upper()
+
+ if method not in valid_methods:
+ raise ValidationError(f"Invalid HTTP method '{method}'. Valid options: {', '.join(valid_methods)}")
\ No newline at end of file
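
The validators above are meant to be composed before a request is issued. Below is a minimal usage sketch, assuming the helpers are importable as `brightdata.utils.validation` and that `ValidationError` is exposed by `brightdata.exceptions`; both import paths are inferred from the surrounding patches rather than confirmed by this diff.

```python
from brightdata.utils.validation import (  # assumed module path
    validate_url,
    validate_zone_name,
    validate_country_code,
    validate_timeout,
)
from brightdata.exceptions import ValidationError  # assumed module path


def check_request_params(url: str, zone: str, country: str = "us", timeout: int = 30) -> bool:
    """Run the individual validators and report the first failure, if any."""
    try:
        validate_url(url)               # scheme, domain, and character checks
        validate_zone_name(zone)        # 3-63 chars; letters, digits, hyphen, underscore
        validate_country_code(country)  # empty or 2-letter ISO 3166-1 alpha-2
        validate_timeout(timeout)       # positive integer, at most 300 seconds
        return True
    except ValidationError as exc:
        print(f"Rejected request parameters: {exc}")
        return False


check_request_params("https://example.com/page", "sdk_unlocker", "us", 30)
```
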
diff --git a/src/utils/zone_manager.py b/src/utils/zone_manager.py
new file mode 100644
index 0000000..82a1205
--- /dev/null
+++ b/src/utils/zone_manager.py
@@ -0,0 +1,174 @@
+import requests
+import json
+import logging
+import time
+from ..exceptions import ZoneError, NetworkError, APIError
+from .retry import retry_request
+
+logger = logging.getLogger(__name__)
+
+
+class ZoneManager:
+ """Manages Bright Data zones - creation and validation"""
+
+ def __init__(self, session: requests.Session):
+ self.session = session
+
+ def ensure_required_zones(self, web_unlocker_zone: str, serp_zone: str):
+ """
+ Check if required zones exist and create them if they don't.
+ Raises exceptions on failure instead of silently continuing.
+ """
+ try:
+ logger.info("Checking existing zones...")
+ zones = self._get_zones_with_retry()
+ zone_names = {zone.get('name') for zone in zones}
+ logger.info(f"Found {len(zones)} existing zones")
+
+ zones_to_create = []
+ if web_unlocker_zone not in zone_names:
+ zones_to_create.append((web_unlocker_zone, 'unblocker'))
+ logger.info(f"Need to create web unlocker zone: {web_unlocker_zone}")
+
+ if serp_zone not in zone_names:
+ zones_to_create.append((serp_zone, 'serp'))
+ logger.info(f"Need to create SERP zone: {serp_zone}")
+
+ if not zones_to_create:
+ logger.info("All required zones already exist")
+ return
+
+ for zone_name, zone_type in zones_to_create:
+ logger.info(f"Creating zone: {zone_name} (type: {zone_type})")
+ self._create_zone_with_retry(zone_name, zone_type)
+ logger.info(f"Successfully created zone: {zone_name}")
+
+ self._verify_zones_created([zone[0] for zone in zones_to_create])
+
+ except (ZoneError, NetworkError, APIError):
+ raise
+ except requests.exceptions.RequestException as e:
+ logger.error(f"Network error while ensuring zones exist: {e}")
+ raise NetworkError(f"Failed to ensure zones due to network error: {str(e)}")
+ except json.JSONDecodeError as e:
+ logger.error(f"Invalid JSON response while checking zones: {e}")
+ raise ZoneError(f"Invalid response format from zones API: {str(e)}")
+ except Exception as e:
+ logger.error(f"Unexpected error while ensuring zones exist: {e}")
+ raise ZoneError(f"Unexpected error during zone creation: {str(e)}")
+
+ @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504})
+ def _get_zones_with_retry(self):
+ """Get zones list with retry logic for network issues"""
+ response = self.session.get('https://api.brightdata.com/zone/get_active_zones')
+
+ if response.status_code == 200:
+ try:
+ return response.json() or []
+ except json.JSONDecodeError as e:
+ raise ZoneError(f"Invalid JSON response from zones API: {str(e)}")
+ elif response.status_code == 401:
+ raise ZoneError("Unauthorized (401): Check your API token and ensure it has proper permissions")
+ elif response.status_code == 403:
+ raise ZoneError("Forbidden (403): API token lacks sufficient permissions for zone operations")
+ else:
+ raise ZoneError(f"Failed to list zones ({response.status_code}): {response.text}")
+
+ @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504})
+ def _create_zone_with_retry(self, zone_name: str, zone_type: str):
+ """
+ Create a new zone in Bright Data with retry logic
+
+ Args:
+ zone_name: Name for the new zone
+ zone_type: Type of zone ('unblocker' or 'serp')
+ """
+ if zone_type == "serp":
+ plan_config = {
+ "type": "unblocker",
+ "serp": True
+ }
+ else:
+ plan_config = {
+ "type": zone_type
+ }
+
+ payload = {
+ "plan": plan_config,
+ "zone": {
+ "name": zone_name,
+ "type": zone_type
+ }
+ }
+
+ response = self.session.post(
+ 'https://api.brightdata.com/zone',
+ json=payload
+ )
+
+ if response.status_code in [200, 201]:
+ logger.info(f"Zone creation successful: {zone_name}")
+ return response
+ elif response.status_code == 409 or "Duplicate zone name" in response.text or "already exists" in response.text.lower():
+ logger.info(f"Zone {zone_name} already exists - this is expected")
+ return response
+ elif response.status_code == 401:
+ raise ZoneError(f"Unauthorized (401): API token invalid or lacks permissions to create zone '{zone_name}'")
+ elif response.status_code == 403:
+ raise ZoneError(f"Forbidden (403): API token lacks permissions to create zone '{zone_name}'. Note: sdk_unlocker and sdk_serp zones should be allowed for all permissions.")
+ elif response.status_code == 400:
+ raise ZoneError(f"Bad request (400) creating zone '{zone_name}': {response.text}")
+ else:
+ raise ZoneError(f"Failed to create zone '{zone_name}' ({response.status_code}): {response.text}")
+
+ def _verify_zones_created(self, zone_names: list):
+ """
+ Verify that zones were successfully created by checking the zones list
+ """
+ max_attempts = 3
+ for attempt in range(max_attempts):
+ try:
+ logger.info(f"Verifying zone creation (attempt {attempt + 1}/{max_attempts})")
+ time.sleep(1)
+
+ zones = self._get_zones_with_retry()
+ existing_zone_names = {zone.get('name') for zone in zones}
+
+ missing_zones = [name for name in zone_names if name not in existing_zone_names]
+
+ if not missing_zones:
+ logger.info("All zones verified successfully")
+ return
+
+ if attempt == max_attempts - 1:
+ raise ZoneError(f"Zone verification failed: zones {missing_zones} not found after creation")
+
+ logger.warning(f"Zones not yet visible: {missing_zones}. Retrying verification...")
+
+ except (ZoneError, NetworkError):
+ if attempt == max_attempts - 1:
+ raise
+ logger.warning(f"Zone verification attempt {attempt + 1} failed, retrying...")
+ time.sleep(2 ** attempt)
+
+ def _create_zone(self, zone_name: str, zone_type: str):
+ """
+ Legacy method - kept for backward compatibility
+ Use _create_zone_with_retry instead for new code
+ """
+ return self._create_zone_with_retry(zone_name, zone_type)
+
+ def list_zones(self):
+ """
+ List all active zones in your Bright Data account
+
+ Returns:
+ List of zone dictionaries with their configurations
+ """
+ try:
+ return self._get_zones_with_retry()
+ except (ZoneError, NetworkError):
+ raise
+ except Exception as e:
+ logger.error(f"Unexpected error listing zones: {e}")
+ raise ZoneError(f"Unexpected error while listing zones: {str(e)}")
\ No newline at end of file
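
As a usage sketch of the ZoneManager above: it only needs an authenticated `requests.Session`. The Bearer-token header, the import paths, and the `"type"` key read from each zone dictionary are assumptions; the default zone names `sdk_unlocker` and `sdk_serp` are taken from the error messages in this patch.

```python
import requests

from brightdata.utils.zone_manager import ZoneManager  # assumed module path
from brightdata.exceptions import ZoneError            # assumed module path

# Authenticated session; the Bearer header scheme is an assumption here.
session = requests.Session()
session.headers.update({
    "Authorization": "Bearer <your-api-token>",  # placeholder token
    "Content-Type": "application/json",
})

manager = ZoneManager(session)

try:
    # Create the SDK's default zones if they are missing, then list what exists.
    manager.ensure_required_zones(web_unlocker_zone="sdk_unlocker", serp_zone="sdk_serp")
    for zone in manager.list_zones():
        print(zone.get("name"), zone.get("type"))  # 'type' key assumed, not shown in this patch
except ZoneError as exc:
    print(f"Zone setup failed: {exc}")
```
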
From cf238011d4bc65db5f8aa7461d7b9ffa14b7f6a1 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:38:01 +0200
Subject: [PATCH 22/70] Create tst
---
src/schemas/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/schemas/tst
diff --git a/src/schemas/tst b/src/schemas/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/schemas/tst
@@ -0,0 +1 @@
+
From 50f548aafb7e4fd1e5618a906ab7930dfb8d9d7f Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:38:18 +0200
Subject: [PATCH 23/70] Create tst
---
src/types/tst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 src/types/tst
diff --git a/src/types/tst b/src/types/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/types/tst
@@ -0,0 +1 @@
+
From a714d3b55374080ab908a754c056009e9dc1fe91 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:40:44 +0200
Subject: [PATCH 24/70] Add files via upload
---
README.md | 409 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 409 insertions(+)
create mode 100644 README.md
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..04fa2cc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,409 @@
+
+
+
+