From feafc3f90c0f2ba3d03dc291e142f9a65d868199 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:12:22 +0200
Subject: [PATCH 01/70] Create client.ts

---
 src/client.ts | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 src/client.ts

diff --git a/src/client.ts b/src/client.ts
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/client.ts
@@ -0,0 +1 @@
+

From 292df2dfaa889bf73eec7857593638de5653e589 Mon Sep 17 00:00:00 2001
From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com>
Date: Tue, 28 Oct 2025 11:13:46 +0200
Subject: [PATCH 02/70] Add files via upload

---
 src/__init__.py |  82 +++++
 src/client.py   | 897 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 979 insertions(+)
 create mode 100644 src/__init__.py
 create mode 100644 src/client.py

diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..c815a6c
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,82 @@
+"""
+## Bright Data SDK for Python
+
+A comprehensive SDK for Bright Data's Web Scraping and SERP APIs, providing
+easy-to-use methods for web scraping, search engine result parsing, and data management.
+## Functions:
+First import the package and create a client:
+```python
+from brightdata import bdclient
+client = bdclient("your-api-key")
+```
+Then use the client to call the desired functions:
+#### scrape()
+- Scrapes a website using the Bright Data Web Unlocker API with proxy support (or multiple websites sequentially)
+- syntax: `results = client.scrape(url, country, max_workers, ...)`
+#### .scrape_linkedin. class
+- Scrapes LinkedIn data including posts, jobs, companies, and profiles, returning structured data as a result
+- syntax: `results = client.scrape_linkedin.posts()/jobs()/companies()/profiles() # insert parameters per function`
+#### search()
+- Performs web searches using the Bright Data SERP API with customizable search engines (or multiple search queries sequentially)
+- syntax: `results = client.search(query, search_engine, country, ...)`
+#### .search_linkedin. class
+- Searches LinkedIn for specific posts, jobs, and profiles, returning the relevant data as a result
+- syntax: `results = client.search_linkedin.posts()/jobs()/profiles() # insert parameters per function`
+#### search_chatGPT()
+- Interacts with ChatGPT using Bright Data's ChatGPT API, sending prompts and receiving responses
+- syntax: `results = client.search_chatGPT(prompt, additional_prompt, max_workers, ...)`
+#### download_content() / download_snapshot()
+- Saves the scraped content to local files in various formats (JSON, CSV, etc.)
+- syntax: `client.download_content(results)` +- syntax: `client.download_snapshot(results)` +#### connect_browser() +- Get WebSocket endpoint for connecting to Bright Data's scraping browser with Playwright/Selenium +- syntax: `endpoint_url = client.connect_browser()` then use with browser automation tools +#### crawl() +- Crawl websites to discover and scrape multiple pages using Bright Data's Web Crawl API +- syntax: `result = client.crawl(url, filter, exclude_filter, depth, ...)` +#### parse_content() +- Parse and extract useful information from API responses (JSON or HTML) +- syntax: `parsed = client.parse_content(data, extract_text=True, extract_links=True)` + +### Features: +- Web Scraping: Scrape websites using Bright Data Web Unlocker API with proxy support +- Search Engine Results: Perform web searches using Bright Data SERP API +- Web Crawling: Discover and scrape multiple pages from websites with advanced filtering +- Content Parsing: Extract text, links, images, and structured data from API responses +- Browser Automation: Simple authentication for Bright Data's scraping browser with Playwright/Selenium +- Multiple Search Engines: Support for Google, Bing, and Yandex +- Parallel Processing: Concurrent processing for multiple URLs or queries +- Robust Error Handling: Comprehensive error handling with retry logic +- Input Validation: Automatic validation of URLs, zone names, and parameters +- Zone Management: Automatic zone creation and management +- Multiple Output Formats: JSON, raw HTML, markdown, and more +""" + +from .client import bdclient +from .exceptions import ( + BrightDataError, + ValidationError, + AuthenticationError, + ZoneError, + NetworkError, + APIError +) +from .utils import parse_content, parse_multiple, extract_structured_data + +__version__ = "1.1.3" +__author__ = "Bright Data" +__email__ = "support@brightdata.com" + +__all__ = [ + 'bdclient', + 'BrightDataError', + 'ValidationError', + 'AuthenticationError', + 'ZoneError', + 'NetworkError', + 'APIError', + 'parse_content', + 'parse_multiple', + 'extract_structured_data' +] \ No newline at end of file diff --git a/src/client.py b/src/client.py new file mode 100644 index 0000000..b148792 --- /dev/null +++ b/src/client.py @@ -0,0 +1,897 @@ +import os +import json +import requests +from datetime import datetime +from typing import Union, Dict, Any, List + +from .api import WebScraper, SearchAPI +from .api.chatgpt import ChatGPTAPI +from .api.linkedin import LinkedInAPI, LinkedInScraper, LinkedInSearcher +from .api.download import DownloadAPI +from .api.crawl import CrawlAPI +from .api.extract import ExtractAPI +from .utils import ZoneManager, setup_logging, get_logger, parse_content +from .exceptions import ValidationError, AuthenticationError, APIError + +def _get_version(): + """Get version from __init__.py, cached at module import time.""" + try: + import os + init_file = os.path.join(os.path.dirname(__file__), '__init__.py') + with open(init_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('__version__'): + return line.split('"')[1] + except (OSError, IndexError): + pass + return "unknown" + +__version__ = _get_version() + +logger = get_logger('client') + + +class bdclient: + """Main client for the Bright Data SDK""" + + DEFAULT_MAX_WORKERS = 10 + DEFAULT_TIMEOUT = 65 + CONNECTION_POOL_SIZE = 20 + MAX_RETRIES = 3 + RETRY_BACKOFF_FACTOR = 1.5 + RETRY_STATUSES = {429, 500, 502, 503, 504} + + def __init__( + self, + api_token: str = None, + auto_create_zones: bool = True, + 
web_unlocker_zone: str = None, + serp_zone: str = None, + browser_zone: str = None, + browser_username: str = None, + browser_password: str = None, + browser_type: str = "playwright", + log_level: str = "INFO", + structured_logging: bool = True, + verbose: bool = None + ): + """ + Initialize the Bright Data client with your API token + + Create an account at https://brightdata.com/ to get your API token. + Go to settings > API keys , and verify that your API key have "Admin" permissions. + + Args: + api_token: Your Bright Data API token (can also be set via BRIGHTDATA_API_TOKEN env var) + auto_create_zones: Automatically create required zones if they don't exist (default: True) + web_unlocker_zone: Custom zone name for web unlocker (default: from env or 'sdk_unlocker') + serp_zone: Custom zone name for SERP API (default: from env or 'sdk_serp') + browser_zone: Custom zone name for Browser API (default: from env or 'sdk_browser') + browser_username: Username for Browser API in format "username-zone-{zone_name}" (can also be set via BRIGHTDATA_BROWSER_USERNAME env var) + browser_password: Password for Browser API authentication (can also be set via BRIGHTDATA_BROWSER_PASSWORD env var) + browser_type: Browser automation tool type - "playwright", "puppeteer", or "selenium" (default: "playwright") + log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + structured_logging: Whether to use structured JSON logging (default: True) + verbose: Enable verbose logging (default: False). Can also be set via BRIGHTDATA_VERBOSE env var. + When False, only shows WARNING and above. When True, shows all logs per log_level. + """ + try: + from dotenv import load_dotenv + load_dotenv() + except ImportError: + pass + + if verbose is None: + env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower() + verbose = env_verbose in ('true', '1', 'yes', 'on') + + setup_logging(log_level, structured_logging, verbose) + logger.info("Initializing Bright Data SDK client") + + self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN') + if not self.api_token: + logger.error("API token not provided") + raise ValidationError("API token is required. Provide it as parameter or set BRIGHTDATA_API_TOKEN environment variable") + + if not isinstance(self.api_token, str): + logger.error("API token must be a string") + raise ValidationError("API token must be a string") + + if len(self.api_token.strip()) < 10: + logger.error("API token appears to be invalid (too short)") + raise ValidationError("API token appears to be invalid") + + token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" if len(self.api_token) > 8 else "***" + logger.info(f"API token validated successfully: {token_preview}") + + self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker') + self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp') + self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser') + self.auto_create_zones = auto_create_zones + + self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') + self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') + + + + valid_browser_types = ["playwright", "puppeteer", "selenium"] + if browser_type not in valid_browser_types: + raise ValidationError(f"Invalid browser_type '{browser_type}'. 
Must be one of: {valid_browser_types}") + self.browser_type = browser_type + + if self.browser_username and self.browser_password: + browser_preview = f"{self.browser_username[:3]}***" + logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})") + elif self.browser_username or self.browser_password: + logger.warning("Incomplete browser credentials: both username and password are required for browser API") + else: + logger.debug("No browser credentials provided - browser API will not be available") + + self.session = requests.Session() + + auth_header = f'Bearer {self.api_token}' + self.session.headers.update({ + 'Authorization': auth_header, + 'Content-Type': 'application/json', + 'User-Agent': f'brightdata-sdk/{__version__}' + }) + + logger.info("HTTP session configured with secure headers") + + adapter = requests.adapters.HTTPAdapter( + pool_connections=self.CONNECTION_POOL_SIZE, + pool_maxsize=self.CONNECTION_POOL_SIZE, + max_retries=0 + ) + self.session.mount('https://', adapter) + self.session.mount('http://', adapter) + + self.zone_manager = ZoneManager(self.session) + self.web_scraper = WebScraper( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + self.search_api = SearchAPI( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + self.chatgpt_api = ChatGPTAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + self.linkedin_api = LinkedInAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + self.download_api = DownloadAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT + ) + self.crawl_api = CrawlAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + self.extract_api = ExtractAPI(self) + + if self.auto_create_zones: + self.zone_manager.ensure_required_zones( + self.web_unlocker_zone, + self.serp_zone + ) + + def scrape( + self, + url: Union[str, List[str]], + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + ## Unlock and scrape websites using Bright Data Web Unlocker API + + Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass. 
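+
+        For quick orientation before the full parameter list, a minimal sketch of a
+        single-URL call wrapped with the exceptions documented under "Raises" (the
+        import path assumes the package-level exports from `brightdata/__init__.py`):
+
+        ```python
+        from brightdata import bdclient, AuthenticationError, APIError
+
+        client = bdclient(api_token="your-api-token")
+        try:
+            html = client.scrape("https://example.com")  # raw HTML string by default
+        except (AuthenticationError, APIError) as err:
+            print(f"Scrape failed: {err}")
+        ```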
+ + ### Parameters: + - `url` (str | List[str]): Single URL string or list of URLs to scrape + - `zone` (str, optional): Zone identifier (default: auto-configured web_unlocker_zone) + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (defaults to fastest connection) + - `data_format` (str, optional): Additional format transformation (default: `"html"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + + ### Returns: + - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL + + ### Example Usage: + ```python + # Single URL scraping + result = client.scrape( + url="https://example.com", + response_format="json" + ) + + # Multiple URLs scraping + urls = ["https://site1.com", "https://site2.com"] + results = client.scrape( + url=urls, + response_format="raw", + max_workers=5 + ) + ``` + + ### Raises: + - `ValidationError`: Invalid URL format or empty URL list + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + zone = zone or self.web_unlocker_zone + max_workers = max_workers or self.DEFAULT_MAX_WORKERS + + return self.web_scraper.scrape( + url, zone, response_format, method, country, data_format, + async_request, max_workers, timeout + ) + + def search( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None, + parse: bool = False + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + ## Search the web using Bright Data SERP API + + Performs web searches through major search engines using Bright Data's proxy network + for reliable, bot-detection-free results. 
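+
+        Beyond the examples below, results can be persisted directly with
+        `download_content()`; a minimal sketch (the filename is illustrative):
+
+        ```python
+        results = client.search("best laptops 2024", response_format="json")
+        path = client.download_content(results, filename="laptops_serp", format="json")
+        print(f"SERP results saved to {path}")
+        ```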
+ + ### Parameters: + - `query` (str | List[str]): Search query string or list of search queries + - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`) + - `zone` (str, optional): Zone identifier (default: auto-configured serp_zone) + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`) + - `data_format` (str, optional): Additional format transformation (default: `"html"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`) + + ### Returns: + - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query + + ### Example Usage: + ```python + # Single search query + result = client.search( + query="best laptops 2024", + search_engine="google", + response_format="json" + ) + + # Multiple search queries + queries = ["python tutorials", "machine learning courses", "web development"] + results = client.search( + query=queries, + search_engine="bing", + max_workers=3 + ) + ``` + + ### Supported Search Engines: + - `"google"` - Google Search + - `"bing"` - Microsoft Bing + - `"yandex"` - Yandex Search + + ### Raises: + - `ValidationError`: Invalid search engine, empty query, or validation errors + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + zone = zone or self.serp_zone + max_workers = max_workers or self.DEFAULT_MAX_WORKERS + + return self.search_api.search( + query, search_engine, zone, response_format, method, country, + data_format, async_request, max_workers, timeout, parse + ) + + def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: + """ + ## Download content to a file based on its format + + ### Args: + content: The content to download (dict for JSON, string for other formats) + filename: Optional filename. If not provided, generates one with timestamp + format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt") + parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False) + + ### Returns: + Path to the downloaded file + """ + return self.download_api.download_content(content, filename, format, parse) + + + def search_chatGPT( + self, + prompt: Union[str, List[str]], + country: Union[str, List[str]] = "", + additional_prompt: Union[str, List[str]] = "", + web_search: Union[bool, List[bool]] = False, + sync: bool = True + ) -> Dict[str, Any]: + """ + ## Search ChatGPT responses using Bright Data's ChatGPT dataset API + + Sends one or multiple prompts to ChatGPT through Bright Data's proxy network + with support for both synchronous and asynchronous processing. 
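+
+        The asynchronous mode pairs with `download_snapshot()`; a simplified sketch
+        without a retry loop (see `download_snapshot()` for polling):
+
+        ```python
+        job = client.search_chatGPT(prompt="Latest AI developments", sync=False)
+        data = client.download_snapshot(job["snapshot_id"])
+        if isinstance(data, dict) and data.get("status") == "not_ready":
+            print("Snapshot still processing; try again shortly")
+        ```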
+ + ### Parameters: + - `prompt` (str | List[str]): Single prompt string or list of prompts to send to ChatGPT + - `country` (str | List[str], optional): Two-letter ISO country code(s) for proxy location (default: "") + - `additional_prompt` (str | List[str], optional): Follow-up prompt(s) after receiving the first answer (default: "") + - `web_search` (bool | List[bool], optional): Whether to click the web search button in ChatGPT (default: False) + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns ChatGPT response data directly. If sync=False, returns response with snapshot_id for async processing + + ### Example Usage: + ```python + # Single prompt (synchronous - returns data immediately) + result = client.search_chatGPT(prompt="Top hotels in New York") + + # Multiple prompts (synchronous - returns data immediately) + result = client.search_chatGPT( + prompt=["Top hotels in New York", "Best restaurants in Paris", "Tourist attractions in Tokyo"], + additional_prompt=["Are you sure?", "", "What about hidden gems?"] + ) + + # Asynchronous with web search enabled (returns snapshot_id) + result = client.search_chatGPT( + prompt="Latest AI developments", + web_search=True, + sync=False + ) + # Snapshot ID is automatically printed for async requests + ``` + + ### Raises: + - `ValidationError`: Invalid prompt or parameters + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + if isinstance(prompt, str): + prompts = [prompt] + else: + prompts = prompt + + if not prompts or len(prompts) == 0: + raise ValidationError("At least one prompt is required") + + for p in prompts: + if not p or not isinstance(p, str): + raise ValidationError("All prompts must be non-empty strings") + + def normalize_param(param, param_name): + if isinstance(param, list): + if len(param) != len(prompts): + raise ValidationError(f"{param_name} list must have same length as prompts list") + return param + else: + return [param] * len(prompts) + + countries = normalize_param(country, "country") + additional_prompts = normalize_param(additional_prompt, "additional_prompt") + web_searches = normalize_param(web_search, "web_search") + + for c in countries: + if not isinstance(c, str): + raise ValidationError("All countries must be strings") + + for ap in additional_prompts: + if not isinstance(ap, str): + raise ValidationError("All additional_prompts must be strings") + + for ws in web_searches: + if not isinstance(ws, bool): + raise ValidationError("All web_search values must be booleans") + + return self.chatgpt_api.scrape_chatgpt( + prompts, + countries, + additional_prompts, + web_searches, + sync, + self.DEFAULT_TIMEOUT + ) + + @property + def scrape_linkedin(self): + """ + ## LinkedIn Data Scraping Interface + + Provides specialized methods for scraping different types of LinkedIn data + using Bright Data's collect API with pre-configured dataset IDs. 
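+
+        Each call only triggers collection; a minimal sketch of triggering a batch of
+        company pages and noting the snapshot to fetch later (URLs are illustrative):
+
+        ```python
+        job = client.scrape_linkedin.companies([
+            "https://www.linkedin.com/company/ibm",
+            "https://www.linkedin.com/company/bright-data",
+        ])
+        print(f"Collection started, snapshot_id={job['snapshot_id']}")
+        # Retrieve later with client.download_snapshot(job['snapshot_id'])
+        ```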
+ + ### Available Methods: + - `profiles(url)` - Scrape LinkedIn profile data + - `companies(url)` - Scrape LinkedIn company data + - `jobs(url)` - Scrape LinkedIn job listing data + - `posts(url)` - Scrape LinkedIn post content + + ### Example Usage: + ```python + # Scrape LinkedIn profiles + result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/username/") + + # Scrape multiple companies + companies = [ + "https://www.linkedin.com/company/ibm", + "https://www.linkedin.com/company/bright-data" + ] + result = client.scrape_linkedin.companies(companies) + + # Scrape job listings + result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/123456/") + + # Scrape posts + result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/") + ``` + + ### Returns: + Each method returns a `Dict[str, Any]` containing snapshot_id and metadata for tracking the request. + Use the snapshot_id with `download_snapshot()` to retrieve the collected data. + """ + if not hasattr(self, '_linkedin_scraper'): + self._linkedin_scraper = LinkedInScraper(self.linkedin_api) + return self._linkedin_scraper + + @property + def search_linkedin(self): + """ + ## LinkedIn Data Search Interface + + Provides specialized methods for discovering new LinkedIn data by various search criteria + using Bright Data's collect API with pre-configured dataset IDs. + + ### Available Methods: + - `profiles(first_name, last_name)` - Search LinkedIn profiles by name + - `jobs(url=..., location=...)` - Search LinkedIn jobs by URL or keyword criteria + - `posts(profile_url=..., company_url=..., url=...)` - Search LinkedIn posts by various methods + + ### Example Usage: + ```python + # Search profiles by name + result = client.search_linkedin.profiles("James", "Smith") + + # Search jobs by location and keywords + result = client.search_linkedin.jobs( + location="Paris", + keyword="product manager", + country="FR" + ) + + # Search posts by profile URL with date range + result = client.search_linkedin.posts( + profile_url="https://www.linkedin.com/in/username", + start_date="2018-04-25T00:00:00.000Z", + end_date="2021-05-25T00:00:00.000Z" + ) + ``` + + ### Returns: + Each method returns a `Dict[str, Any]` containing snapshot_id (async) or direct data (sync) for tracking the request. + Use the snapshot_id with `download_snapshot()` to retrieve the collected data. + """ + if not hasattr(self, '_linkedin_searcher'): + self._linkedin_searcher = LinkedInSearcher(self.linkedin_api) + return self._linkedin_searcher + + def download_snapshot( + self, + snapshot_id: str, + format: str = "json", + compress: bool = False, + batch_size: int = None, + part: int = None + ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]: + """ + ## Download snapshot content from Bright Data dataset API + + Downloads the snapshot content using the snapshot ID returned from scrape_chatGPT() + or other dataset collection triggers. 
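+
+        Since a fresh snapshot may not be ready immediately, a simple polling sketch
+        (the 30-second interval and attempt cap are arbitrary choices):
+
+        ```python
+        import time
+
+        data = None
+        for _ in range(20):  # arbitrary attempt cap
+            result = client.download_snapshot("s_m4x7enmven8djfqak")
+            if not (isinstance(result, dict) and result.get("status") == "not_ready"):
+                data = result
+                break
+            time.sleep(30)  # arbitrary polling interval
+        ```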
+ + ### Parameters: + - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required) + - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json") + - `compress` (bool, optional): Whether the result should be compressed (default: False) + - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000) + - `part` (int, optional): If batch_size provided, specify which part to download + + ### Returns: + - `Union[Dict, List, str]`: Snapshot data in the requested format, OR + - `Dict`: Status response if snapshot is not ready yet (status="not_ready") + + ### Example Usage: + ```python + # Download complete snapshot + result = client.download_snapshot("s_m4x7enmven8djfqak") + + # Check if snapshot is ready + if isinstance(result, dict) and result.get('status') == 'not_ready': + print(f"Not ready: {result['message']}") + # Try again later + else: + # Snapshot data is ready + data = result + + # Download as CSV format + csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv") + ``` + + ### Raises: + - `ValidationError`: Invalid parameters or snapshot_id format + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed, snapshot not found, or server error + """ + return self.download_api.download_snapshot(snapshot_id, format, compress, batch_size, part) + + + def list_zones(self) -> List[Dict[str, Any]]: + """ + ## List all active zones in your Bright Data account + + ### Returns: + List of zone dictionaries with their configurations + """ + return self.zone_manager.list_zones() + + def connect_browser(self) -> str: + """ + ## Get WebSocket endpoint URL for connecting to Bright Data's scraping browser + + Returns the WebSocket endpoint URL that can be used with Playwright or Selenium + to connect to Bright Data's scraping browser service. + + ### Returns: + WebSocket endpoint URL string for browser connection + + ### Example Usage: + ```python + # For Playwright (default) + client = bdclient( + api_token="your_token", + browser_username="username-zone-browser_zone1", + browser_password="your_password", + browser_type="playwright" # Playwright/ Puppeteer (default) + ) + endpoint_url = client.connect_browser() # Returns: wss://...@brd.superproxy.io:9222 + + # For Selenium + client = bdclient( + api_token="your_token", + browser_username="username-zone-browser_zone1", + browser_password="your_password", + browser_type="selenium" + ) + endpoint_url = client.connect_browser() # Returns: https://...@brd.superproxy.io:9515 + ``` + + ### Raises: + - `ValidationError`: Browser credentials not provided or invalid + - `AuthenticationError`: Invalid browser credentials + """ + if not self.browser_username or not self.browser_password: + logger.error("Browser credentials not configured") + raise ValidationError( + "Browser credentials are required. Provide browser_username and browser_password " + "parameters or set BRIGHTDATA_BROWSER_USERNAME and BRIGHTDATA_BROWSER_PASSWORD " + "environment variables." 
+ ) + + if not isinstance(self.browser_username, str) or not isinstance(self.browser_password, str): + logger.error("Browser credentials must be strings") + raise ValidationError("Browser username and password must be strings") + + if len(self.browser_username.strip()) == 0 or len(self.browser_password.strip()) == 0: + logger.error("Browser credentials cannot be empty") + raise ValidationError("Browser username and password cannot be empty") + + auth_string = f"{self.browser_username}:{self.browser_password}" + + if self.browser_type == "selenium": + endpoint_url = f"https://{auth_string}@brd.superproxy.io:9515" + logger.debug(f"Browser endpoint URL: https://***:***@brd.superproxy.io:9515") + else: + endpoint_url = f"wss://{auth_string}@brd.superproxy.io:9222" + logger.debug(f"Browser endpoint URL: wss://***:***@brd.superproxy.io:9222") + + logger.info(f"Generated {self.browser_type} connection endpoint for user: {self.browser_username[:3]}***") + + return endpoint_url + + def crawl( + self, + url: Union[str, List[str]], + ignore_sitemap: bool = None, + depth: int = None, + filter: str = None, + exclude_filter: str = None, + custom_output_fields: List[str] = None, + include_errors: bool = True + ) -> Dict[str, Any]: + """ + ## Crawl websites using Bright Data's Web Crawl API + + Performs web crawling to discover and scrape multiple pages from a website + starting from the specified URL(s). Returns a snapshot_id for tracking the crawl progress. + + ### Parameters: + - `url` (str | List[str]): Domain URL(s) to crawl (required) + - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling + - `depth` (int, optional): Maximum depth to crawl relative to the entered URL + - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/") + - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. 
"/ads/") + - `custom_output_fields` (List[str], optional): Custom output schema fields to include + - `include_errors` (bool, optional): Include errors in response (default: True) + + ### Returns: + - `Dict[str, Any]`: Crawl response with snapshot_id for tracking + + ### Example Usage: + ```python + # Single URL crawl + result = client.crawl("https://example.com/") + snapshot_id = result['snapshot_id'] + + # Multiple URLs with filters + urls = ["https://example.com/", "https://example2.com/"] + result = client.crawl( + url=urls, + filter="/product/", + exclude_filter="/ads/", + depth=2, + ignore_sitemap=True + ) + + # Custom output schema + result = client.crawl( + url="https://example.com/", + custom_output_fields=["markdown", "url", "page_title"] + ) + + # Download results using snapshot_id + data = client.download_snapshot(result['snapshot_id']) + ``` + + ### Available Output Fields: + - `markdown` - Page content in markdown format + - `url` - Page URL + - `html2text` - Page content as plain text + - `page_html` - Raw HTML content + - `ld_json` - Structured data (JSON-LD) + - `page_title` - Page title + - `timestamp` - Crawl timestamp + - `input` - Input parameters used + - `discovery_input` - Discovery parameters + - `error` - Error information (if any) + - `error_code` - Error code (if any) + - `warning` - Warning information (if any) + - `warning_code` - Warning code (if any) + + ### Raises: + - `ValidationError`: Invalid URL or parameters + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + return self.crawl_api.crawl( + url=url, + ignore_sitemap=ignore_sitemap, + depth=depth, + filter=filter, + exclude_filter=exclude_filter, + custom_output_fields=custom_output_fields, + include_errors=include_errors + ) + + def parse_content( + self, + data: Union[str, Dict, List], + extract_text: bool = True, + extract_links: bool = False, + extract_images: bool = False + ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """ + ## Parse content from API responses + + Extract and parse useful information from scraping, search, or crawling results. + Automatically detects and handles both single and multiple results from batch operations. 
+ + ### Parameters: + - `data` (str | Dict | List): Response data from scrape(), search(), or crawl() methods + - `extract_text` (bool, optional): Extract clean text content (default: True) + - `extract_links` (bool, optional): Extract all links from content (default: False) + - `extract_images` (bool, optional): Extract image URLs from content (default: False) + + ### Returns: + - `Dict[str, Any]`: Parsed content for single results + - `List[Dict[str, Any]]`: List of parsed content for multiple results (auto-detected) + + ### Example Usage: + ```python + # Parse single URL results + scraped_data = client.scrape("https://example.com") + parsed = client.parse_content(scraped_data, extract_text=True, extract_links=True) + print(f"Title: {parsed['title']}") + + # Parse multiple URL results (auto-detected) + scraped_data = client.scrape(["https://example1.com", "https://example2.com"]) + parsed_list = client.parse_content(scraped_data, extract_text=True) + for result in parsed_list: + print(f"Title: {result['title']}") + ``` + + ### Available Fields in Each Result: + - `type`: 'json' or 'html' - indicates the source data type + - `text`: Cleaned text content (if extract_text=True) + - `links`: List of {'url': str, 'text': str} objects (if extract_links=True) + - `images`: List of {'url': str, 'alt': str} objects (if extract_images=True) + - `title`: Page title (if available) + - `raw_length`: Length of original content + - `structured_data`: Original JSON data (if type='json') + """ + return parse_content( + data=data, + extract_text=extract_text, + extract_links=extract_links, + extract_images=extract_images + ) + + def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> str: + """ + ## Extract specific information from websites using AI + + Combines web scraping with OpenAI's language models to extract targeted information + from web pages based on natural language queries. Automatically parses URLs and + optimizes content for efficient LLM processing. + + ### Parameters: + - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, + this becomes the pure extraction query. If `url` is not provided, this should include + the URL (e.g. "extract the most recent news from cnn.com") + - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction + from query and sends these URLs to the web unlocker API + - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response. + Uses OpenAI's Structured Outputs for reliable type-safe responses. + Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"]} + - `llm_key` (str, optional): OpenAI API key. 
If not provided, uses OPENAI_API_KEY env variable + + ### Returns: + - `str`: Extracted content (also provides access to metadata via attributes) + + ### Example Usage: + ```python + # Using URL parameter with structured output (new) + result = client.extract( + query="extract the most recent news headlines", + url="https://cnn.com", + output_scheme={ + "type": "object", + "properties": { + "headlines": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "date": {"type": "string"} + }, + "required": ["title", "date"] + } + } + }, + "required": ["headlines"] + } + ) + print(result) # Prints the extracted news content + + # Using URL in query (original behavior) + result = client.extract("extract the most recent news from cnn.com") + + # Multiple URLs with structured schema + result = client.extract( + query="extract main headlines", + url=["https://cnn.com", "https://bbc.com"], + output_scheme={ + "type": "object", + "properties": { + "sources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "source_name": {"type": "string"}, + "headlines": {"type": "array", "items": {"type": "string"}} + }, + "required": ["source_name", "headlines"] + } + } + }, + "required": ["sources"] + } + ) + + # Access metadata attributes + print(f"Source: {result.url}") + print(f"Title: {result.source_title}") + print(f"Tokens used: {result.token_usage['total_tokens']}") + + # Use with custom OpenAI key + result = client.extract( + query="get the price and description", + url="https://amazon.com/dp/B079QHML21", + llm_key="your-openai-api-key" + ) + ``` + + ### Environment Variable Setup: + ```bash + # Set in .env file + OPENAI_API_KEY=your-openai-api-key + ``` + + ### Available Attributes: + ```python + result = client.extract("extract news from cnn.com") + + # String value (default behavior) + str(result) # Extracted content + + # Metadata attributes + result.query # 'extract news' + result.url # 'https://www.cnn.com' + result.source_title # 'CNN - Breaking News...' 
+ result.content_length # 1234 + result.token_usage # {'total_tokens': 2998, ...} + result.success # True + result.metadata # Full metadata dictionary + ``` + + ### Raises: + - `ValidationError`: Invalid query format, missing URL, or invalid LLM key + - `APIError`: Web scraping failed or LLM processing error + """ + return self.extract_api.extract(query, url, output_scheme, llm_key) \ No newline at end of file From cc78cc0c18fde28e3c2222f7702ca791bb03913b Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:14:03 +0200 Subject: [PATCH 03/70] Delete src/client.ts --- src/client.ts | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/client.ts diff --git a/src/client.ts b/src/client.ts deleted file mode 100644 index 8b13789..0000000 --- a/src/client.ts +++ /dev/null @@ -1 +0,0 @@ - From 94cce094c221c5168796ca3890d1c52e95ee5bfc Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:14:58 +0200 Subject: [PATCH 04/70] Create tst --- src/api/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/api/tst diff --git a/src/api/tst b/src/api/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/api/tst @@ -0,0 +1 @@ + From db06632ab8569fb6860837d17d41265be3c3dce2 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:15:23 +0200 Subject: [PATCH 05/70] Add files via upload --- src/api/__init__.py | 13 + src/api/chatgpt.py | 126 +++++++ src/api/crawl.py | 175 ++++++++++ src/api/download.py | 265 +++++++++++++++ src/api/extract.py | 419 +++++++++++++++++++++++ src/api/linkedin.py | 803 ++++++++++++++++++++++++++++++++++++++++++++ src/api/scraper.py | 205 +++++++++++ src/api/search.py | 212 ++++++++++++ 8 files changed, 2218 insertions(+) create mode 100644 src/api/__init__.py create mode 100644 src/api/chatgpt.py create mode 100644 src/api/crawl.py create mode 100644 src/api/download.py create mode 100644 src/api/extract.py create mode 100644 src/api/linkedin.py create mode 100644 src/api/scraper.py create mode 100644 src/api/search.py diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..a79c0fd --- /dev/null +++ b/src/api/__init__.py @@ -0,0 +1,13 @@ +from .scraper import WebScraper +from .search import SearchAPI +from .chatgpt import ChatGPTAPI +from .linkedin import LinkedInAPI +from .crawl import CrawlAPI + +__all__ = [ + 'WebScraper', + 'SearchAPI', + 'ChatGPTAPI', + 'LinkedInAPI', + 'CrawlAPI' +] \ No newline at end of file diff --git a/src/api/chatgpt.py b/src/api/chatgpt.py new file mode 100644 index 0000000..e9edb90 --- /dev/null +++ b/src/api/chatgpt.py @@ -0,0 +1,126 @@ +import json +import requests +from typing import Union, Dict, Any, List + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.chatgpt') + + +class ChatGPTAPI: + """Handles ChatGPT scraping operations using Bright Data's ChatGPT dataset API""" + + def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def scrape_chatgpt( + self, + prompts: List[str], + countries: List[str], + additional_prompts: List[str], + web_searches: List[bool], + sync: bool = True, + timeout: int = None + ) 
-> Dict[str, Any]: + """ + Internal method to handle ChatGPT scraping API requests + + Parameters: + - prompts: List of prompts to send to ChatGPT + - countries: List of country codes matching prompts + - additional_prompts: List of follow-up prompts matching prompts + - web_searches: List of web_search flags matching prompts + - sync: If True, uses synchronous API for immediate results + - timeout: Request timeout in seconds + + Returns: + - Dict containing response with snapshot_id or direct data (if sync=True) + """ + url = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger" + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": "gd_m7aof0k82r803d5bjm", + "include_errors": "true" + } + + data = [ + { + "url": "https://chatgpt.com/", + "prompt": prompts[i], + "country": countries[i], + "additional_prompt": additional_prompts[i], + "web_search": web_searches[i] + } + for i in range(len(prompts)) + ] + + try: + response = self.session.post( + url, + headers=headers, + params=params, + json=data, + timeout=timeout or (65 if sync else self.default_timeout) + ) + + if response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code != 200: + raise APIError(f"ChatGPT scraping request failed with status {response.status_code}: {response.text}") + + if sync: + response_text = response.text + if '\n{' in response_text and response_text.strip().startswith('{'): + json_objects = [] + for line in response_text.strip().split('\n'): + if line.strip(): + try: + json_objects.append(json.loads(line)) + except json.JSONDecodeError: + continue + result = json_objects + else: + try: + result = response.json() + except json.JSONDecodeError: + result = response_text + + logger.info(f"ChatGPT data retrieved synchronously for {len(prompts)} prompt(s)") + print(f"Retrieved {len(result) if isinstance(result, list) else 1} ChatGPT response(s)") + else: + result = response.json() + snapshot_id = result.get('snapshot_id') + if snapshot_id: + logger.info(f"ChatGPT scraping job initiated successfully for {len(prompts)} prompt(s)") + print("") + print("Snapshot ID:") + print(snapshot_id) + print("") + + return result + + except requests.exceptions.Timeout: + raise APIError("Timeout while initiating ChatGPT scraping") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during ChatGPT scraping: {str(e)}") + except json.JSONDecodeError as e: + raise APIError(f"Failed to parse ChatGPT scraping response: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during ChatGPT scraping: {str(e)}") \ No newline at end of file diff --git a/src/api/crawl.py b/src/api/crawl.py new file mode 100644 index 0000000..4fe047a --- /dev/null +++ b/src/api/crawl.py @@ -0,0 +1,175 @@ +import json +from typing import Union, Dict, Any, List, Optional +from ..utils import get_logger, validate_url +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.crawl') + + +class CrawlAPI: + """Handles crawl operations using Bright Data's Web Crawl API""" + + CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc" + + 
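+    # Whitelist of fields accepted in `custom_output_fields`; crawl() raises a
+    # ValidationError for anything outside this list.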
AVAILABLE_OUTPUT_FIELDS = [ + "markdown", "url", "html2text", "page_html", "ld_json", + "page_title", "timestamp", "input", "discovery_input", + "error", "error_code", "warning", "warning_code" + ] + + def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def crawl( + self, + url: Union[str, List[str]], + ignore_sitemap: Optional[bool] = None, + depth: Optional[int] = None, + filter: Optional[str] = None, + exclude_filter: Optional[str] = None, + custom_output_fields: Optional[List[str]] = None, + include_errors: bool = True + ) -> Dict[str, Any]: + """ + ## Crawl websites using Bright Data's Web Crawl API + + Performs web crawling to discover and scrape multiple pages from a website + starting from the specified URL(s). + + ### Parameters: + - `url` (str | List[str]): Domain URL(s) to crawl (required) + - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling + - `depth` (int, optional): Maximum depth to crawl relative to the entered URL + - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/") + - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/") + - `custom_output_fields` (List[str], optional): Custom output schema fields to include + - `include_errors` (bool, optional): Include errors in response (default: True) + + ### Returns: + - `Dict[str, Any]`: Crawl response with snapshot_id for tracking + + ### Example Usage: + ```python + # Single URL crawl + result = client.crawl("https://example.com/") + + # Multiple URLs with filters + urls = ["https://example.com/", "https://example2.com/"] + result = client.crawl( + url=urls, + filter="/product/", + exclude_filter="/ads/", + depth=2, + ignore_sitemap=True + ) + + # Custom output schema + result = client.crawl( + url="https://example.com/", + custom_output_fields=["markdown", "url", "page_title"] + ) + ``` + + ### Raises: + - `ValidationError`: Invalid URL or parameters + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + if isinstance(url, str): + urls = [url] + elif isinstance(url, list): + urls = url + else: + raise ValidationError("URL must be a string or list of strings") + + if not urls: + raise ValidationError("At least one URL is required") + + for u in urls: + if not isinstance(u, str) or not u.strip(): + raise ValidationError("All URLs must be non-empty strings") + validate_url(u) + + if custom_output_fields is not None: + if not isinstance(custom_output_fields, list): + raise ValidationError("custom_output_fields must be a list") + + invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS] + if invalid_fields: + raise ValidationError(f"Invalid output fields: {invalid_fields}. 
Available fields: {self.AVAILABLE_OUTPUT_FIELDS}") + + crawl_inputs = [] + for u in urls: + crawl_input = {"url": u} + + if ignore_sitemap is not None: + crawl_input["ignore_sitemap"] = ignore_sitemap + if depth is not None: + crawl_input["depth"] = depth + if filter is not None: + crawl_input["filter"] = filter + if exclude_filter is not None: + crawl_input["exclude_filter"] = exclude_filter + + crawl_inputs.append(crawl_input) + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + params = { + "dataset_id": self.CRAWL_DATASET_ID, + "include_errors": str(include_errors).lower(), + "type": "discover_new", + "discover_by": "domain_url" + } + + if custom_output_fields: + payload = { + "input": crawl_inputs, + "custom_output_fields": custom_output_fields + } + else: + payload = crawl_inputs + + logger.info(f"Starting crawl for {len(urls)} URL(s)") + logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}") + + try: + response = self.session.post( + api_url, + params=params, + json=payload, + timeout=self.default_timeout + ) + + if response.status_code == 200: + result = response.json() + snapshot_id = result.get('snapshot_id') + logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}") + return result + + elif response.status_code == 401: + logger.error("Unauthorized (401): Check API token") + raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}") + elif response.status_code == 403: + logger.error("Forbidden (403): Insufficient permissions") + raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}") + elif response.status_code == 400: + logger.error(f"Bad request (400): {response.text}") + raise APIError(f"Bad request (400): {response.text}") + else: + logger.error(f"Crawl request failed ({response.status_code}): {response.text}") + raise APIError( + f"Crawl request failed ({response.status_code}): {response.text}", + status_code=response.status_code, + response_text=response.text + ) + + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + logger.error(f"Unexpected error during crawl: {e}") + raise APIError(f"Unexpected error during crawl: {str(e)}") \ No newline at end of file diff --git a/src/api/download.py b/src/api/download.py new file mode 100644 index 0000000..4bccdc0 --- /dev/null +++ b/src/api/download.py @@ -0,0 +1,265 @@ +import json +import requests +from datetime import datetime +from typing import Union, Dict, Any, List + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.download') + + +class DownloadAPI: + """Handles snapshot and content download operations using Bright Data's download API""" + + def __init__(self, session, api_token, default_timeout=30): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + + def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: + """ + ## Download content to a file based on its format + + ### Args: + content: The content to download (dict for JSON, string for other formats) + filename: Optional filename. 
If not provided, generates one with timestamp + format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt") + parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False) + + ### Returns: + Path to the downloaded file + """ + + if not filename: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"brightdata_results_{timestamp}.{format}" + + if not filename.endswith(f".{format}"): + filename = f"{filename}.{format}" + + if parse and isinstance(content, (list, dict)): + content = self._parse_body_json(content) + + try: + if format == "json": + with open(filename, 'w', encoding='utf-8') as f: + if isinstance(content, dict) or isinstance(content, list): + json.dump(content, f, indent=2, ensure_ascii=False) + else: + f.write(str(content)) + else: + with open(filename, 'w', encoding='utf-8') as f: + f.write(str(content)) + + logger.info(f"Content downloaded to: {filename}") + return filename + + except IOError as e: + raise APIError(f"Failed to write file {filename}: {str(e)}") + except Exception as e: + raise APIError(f"Failed to download content: {str(e)}") + + def download_snapshot( + self, + snapshot_id: str, + format: str = "json", + compress: bool = False, + batch_size: int = None, + part: int = None + ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]: + """ + ## Download snapshot content from Bright Data dataset API + + Downloads the snapshot content using the snapshot ID returned from scrape_chatGPT() + or other dataset collection triggers. + + ### Parameters: + - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required) + - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json") + - `compress` (bool, optional): Whether the result should be compressed (default: False) + - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000) + - `part` (int, optional): If batch_size provided, specify which part to download + + ### Returns: + - `Union[Dict, List, str]`: Snapshot data in the requested format + + ### Example Usage: + ```python + # Download complete snapshot + data = client.download_snapshot("s_m4x7enmven8djfqak") + + # Download as CSV format + csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv") + + # Download in batches + batch_data = client.download_snapshot( + "s_m4x7enmven8djfqak", + batch_size=1000, + part=1 + ) + ``` + + ### Raises: + - `ValidationError`: Invalid parameters or snapshot_id format + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed, snapshot not found, or server error + """ + if not snapshot_id or not isinstance(snapshot_id, str): + raise ValidationError("Snapshot ID is required and must be a non-empty string") + + if format not in ["json", "ndjson", "jsonl", "csv"]: + raise ValidationError("Format must be one of: json, ndjson, jsonl, csv") + + if not isinstance(compress, bool): + raise ValidationError("Compress must be a boolean") + + if batch_size is not None: + if not isinstance(batch_size, int) or batch_size < 1000: + raise ValidationError("Batch size must be an integer >= 1000") + + if part is not None: + if not isinstance(part, int) or part < 1: + raise ValidationError("Part must be a positive integer") + if batch_size is None: + raise ValidationError("Part parameter requires batch_size to be specified") + + url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}" + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.api_token}", + "Accept": "application/json", + "User-Agent": user_agent + } + params = { + "format": format + } + + if compress: + params["compress"] = "true" + + if batch_size is not None: + params["batch_size"] = batch_size + + if part is not None: + params["part"] = part + + try: + logger.info(f"Downloading snapshot {snapshot_id} in {format} format") + + response = self.session.get( + url, + headers=headers, + params=params, + timeout=self.default_timeout + ) + + if response.status_code == 200: + pass + elif response.status_code == 202: + try: + response_data = response.json() + message = response_data.get('message', 'Snapshot is not ready yet') + print("Snapshot is not ready yet, try again soon") + return {"status": "not_ready", "message": message, "snapshot_id": snapshot_id} + except json.JSONDecodeError: + print("Snapshot is not ready yet, try again soon") + return {"status": "not_ready", "message": "Snapshot is not ready yet, check again soon", "snapshot_id": snapshot_id} + elif response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code == 404: + raise APIError(f"Snapshot '{snapshot_id}' not found") + else: + raise APIError(f"Download request failed with status {response.status_code}: {response.text}") + + if format == "csv": + data = response.text + save_data = data + else: + response_text = response.text + if '\n{' in response_text and response_text.strip().startswith('{'): + json_objects = [] + for line in response_text.strip().split('\n'): + if line.strip(): + try: + json_objects.append(json.loads(line)) + except json.JSONDecodeError: + continue + data = json_objects + save_data = json_objects + else: + try: + data = response.json() + save_data = data + except json.JSONDecodeError: + data = response_text + save_data = response_text + + try: + output_file = f"snapshot_{snapshot_id}.{format}" + if format == "csv" or isinstance(save_data, str): + with open(output_file, 'w', encoding='utf-8') as f: + f.write(str(save_data)) + else: + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(save_data, f, indent=2, ensure_ascii=False) + logger.info(f"Data saved to: {output_file}") + except Exception: + pass + + logger.info(f"Successfully downloaded snapshot {snapshot_id}") + return data + + except requests.exceptions.Timeout: + raise APIError("Timeout while downloading snapshot") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during snapshot download: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during snapshot download: {str(e)}") + + def _parse_body_json(self, content: Union[Dict, List]) -> Union[Dict, List]: + """ + Parse JSON strings in 'body' fields to objects + + Args: + content: The content to process + + Returns: + Content with parsed body fields + """ + if content is None: + return content + + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and 'body' in item: + body = item['body'] + if isinstance(body, str): + try: + item['body'] = json.loads(body) + except (json.JSONDecodeError, TypeError): + pass + elif isinstance(item, (dict, list)): + self._parse_body_json(item) + + elif isinstance(content, dict): + if 'body' in content: + body = 
content['body'] + if isinstance(body, str): + try: + content['body'] = json.loads(body) + except (json.JSONDecodeError, TypeError): + pass + + for key, value in content.items(): + if isinstance(value, (dict, list)): + content[key] = self._parse_body_json(value) + + return content \ No newline at end of file diff --git a/src/api/extract.py b/src/api/extract.py new file mode 100644 index 0000000..1b04b84 --- /dev/null +++ b/src/api/extract.py @@ -0,0 +1,419 @@ +import os +import re +import json +import openai +from typing import Dict, Any, Tuple, Union, List +from urllib.parse import urlparse + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError + +logger = get_logger('api.extract') + + +class ExtractResult(str): + """ + Custom result class that behaves like a string (extracted content) + but also provides access to metadata attributes + """ + def __new__(cls, extracted_content, metadata): + obj = str.__new__(cls, extracted_content) + obj._metadata = metadata + return obj + + def __getattr__(self, name): + if name in self._metadata: + return self._metadata[name] + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + def __getitem__(self, key): + return self._metadata[key] + + def get(self, key, default=None): + return self._metadata.get(key, default) + + def keys(self): + return self._metadata.keys() + + def values(self): + return self._metadata.values() + + def items(self): + return self._metadata.items() + + @property + def metadata(self): + """Access full metadata dictionary""" + return self._metadata + + +class ExtractAPI: + """Handles content extraction using web scraping + LLM processing""" + + def __init__(self, client): + self.client = client + + def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> Dict[str, Any]: + """ + ## Extract specific information from websites using AI + + Combines web scraping with OpenAI's language models to extract targeted information + from web pages based on natural language queries. + + ### Parameters: + - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, + this becomes the pure extraction query. If `url` is not provided, this should include + the URL (e.g. "extract the most recent news from cnn.com") + - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction + from query and sends these URLs to the web unlocker API + - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response. + Uses OpenAI's Structured Outputs for reliable type-safe responses. + Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"]} + - `llm_key` (str, optional): OpenAI API key. 
If not provided, uses OPENAI_API_KEY env variable + + ### Returns: + - `ExtractResult`: String containing extracted content with metadata attributes access + + ### Example Usage: + ```python + # Using URL parameter with structured output + result = client.extract( + query="extract the most recent news headlines", + url="https://cnn.com", + output_scheme={ + "type": "object", + "properties": { + "headlines": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "date": {"type": "string"} + }, + "required": ["title", "date"] + } + } + }, + "required": ["headlines"] + } + ) + + # Using URL in query (original behavior) + result = client.extract( + query="extract the most recent news from cnn.com", + llm_key="your-openai-api-key" + ) + + # Multiple URLs with structured schema + result = client.extract( + query="extract main headlines", + url=["https://cnn.com", "https://bbc.com"], + output_scheme={ + "type": "object", + "properties": { + "sources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "source_name": {"type": "string"}, + "headlines": {"type": "array", "items": {"type": "string"}} + }, + "required": ["source_name", "headlines"] + } + } + }, + "required": ["sources"] + } + ) + ``` + + ### Raises: + - `ValidationError`: Invalid query format or missing LLM key + - `APIError`: Scraping failed or LLM processing error + """ + if not query or not isinstance(query, str): + raise ValidationError("Query must be a non-empty string") + + query = query.strip() + if len(query) > 10000: + raise ValidationError("Query is too long (maximum 10,000 characters)") + if len(query) < 5: + raise ValidationError("Query is too short (minimum 5 characters)") + + if not llm_key: + llm_key = os.getenv('OPENAI_API_KEY') + + if not llm_key or not isinstance(llm_key, str): + raise ValidationError("OpenAI API key is required. 
Provide it as parameter or set OPENAI_API_KEY environment variable") + + if output_scheme is not None: + if not isinstance(output_scheme, dict): + raise ValidationError("output_scheme must be a dict containing a valid JSON Schema") + if "type" not in output_scheme: + raise ValidationError("output_scheme must have a 'type' property") + + self._validate_structured_outputs_schema(output_scheme) + + logger.info(f"Processing extract query: {query[:50]}...") + + try: + if url is not None: + parsed_query = query.strip() + target_urls = url if isinstance(url, list) else [url] + logger.info(f"Using provided URL(s): {target_urls}") + else: + parsed_query, extracted_url = self._parse_query_and_url(query) + target_urls = [extracted_url] + logger.info(f"Parsed - Query: '{parsed_query}', URL: '{extracted_url}'") + + if len(target_urls) == 1: + scraped_content = self.client.scrape(target_urls[0], response_format="raw") + source_url = target_urls[0] + else: + scraped_content = self.client.scrape(target_urls, response_format="raw") + source_url = ', '.join(target_urls) + + logger.info(f"Scraped content from {len(target_urls)} URL(s)") + + if isinstance(scraped_content, list): + all_text = [] + all_titles = [] + for i, content in enumerate(scraped_content): + parsed = self.client.parse_content( + content, + extract_text=True, + extract_links=False, + extract_images=False + ) + all_text.append(f"--- Content from {target_urls[i]} ---\n{parsed.get('text', '')}") + all_titles.append(parsed.get('title', 'Unknown')) + + combined_text = "\n\n".join(all_text) + combined_title = " | ".join(all_titles) + parsed_content = {'text': combined_text, 'title': combined_title} + else: + parsed_content = self.client.parse_content( + scraped_content, + extract_text=True, + extract_links=False, + extract_images=False + ) + + logger.info(f"Parsed content - text length: {len(parsed_content.get('text', ''))}") + + extracted_info, token_usage = self._process_with_llm( + parsed_query, + parsed_content.get('text', ''), + llm_key, + source_url, + output_scheme + ) + + metadata = { + 'query': parsed_query, + 'url': source_url, + 'extracted_content': extracted_info, + 'source_title': parsed_content.get('title', 'Unknown'), + 'content_length': len(parsed_content.get('text', '')), + 'token_usage': token_usage, + 'success': True + } + + return ExtractResult(extracted_info, metadata) + + except Exception as e: + if isinstance(e, (ValidationError, APIError)): + raise + logger.error(f"Unexpected error during extraction: {e}") + raise APIError(f"Extraction failed: {str(e)}") + + def _parse_query_and_url(self, query: str) -> Tuple[str, str]: + """ + Parse natural language query to extract the task and URL + + Args: + query: Natural language query like "extract news from cnn.com" + + Returns: + Tuple of (parsed_query, full_url) + """ + query = query.strip() + + url_patterns = [ + r'from\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)', + r'on\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)', + r'at\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)', + r'((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)' + ] + + url = None + for pattern in url_patterns: + match = re.search(pattern, query, re.IGNORECASE) + if match: + url = match.group(1) + break + + if not url: + raise ValidationError("Could not extract URL from query. 
Please include a website URL.") + + full_url = self._build_full_url(url) + + extract_query = re.sub(r'\b(?:from|on|at)\s+(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', query, flags=re.IGNORECASE) + extract_query = re.sub(r'\b(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', extract_query, flags=re.IGNORECASE) + extract_query = re.sub(r'\s+', ' ', extract_query).strip() + + if not extract_query: + extract_query = "extract the main content" + + return extract_query, full_url + + def _build_full_url(self, url: str) -> str: + """ + Build a complete URL from potentially partial URL + + Args: + url: Potentially partial URL like "cnn.com" or "https://example.com" + + Returns: + Complete URL with https:// and www if needed + """ + url = url.strip() + + if not url.startswith(('http://', 'https://')): + if not url.startswith('www.'): + url = f'www.{url}' + url = f'https://{url}' + + parsed = urlparse(url) + if not parsed.netloc: + raise ValidationError(f"Invalid URL format: {url}") + + return url + + def _validate_structured_outputs_schema(self, schema: Dict[str, Any], path: str = "") -> None: + """ + Validate JSON Schema for OpenAI Structured Outputs compatibility + + Args: + schema: JSON Schema to validate + path: Current path in schema (for error reporting) + """ + if not isinstance(schema, dict): + return + + schema_type = schema.get("type") + + if schema_type == "object": + if "properties" not in schema: + raise ValidationError(f"Object schema at '{path}' must have 'properties' defined") + if "required" not in schema: + raise ValidationError(f"Object schema at '{path}' must have 'required' array (OpenAI Structured Outputs requirement)") + if "additionalProperties" not in schema or schema["additionalProperties"] is not False: + raise ValidationError(f"Object schema at '{path}' must have 'additionalProperties': false (OpenAI Structured Outputs requirement)") + + properties = set(schema["properties"].keys()) + required = set(schema["required"]) + if properties != required: + missing = properties - required + extra = required - properties + error_msg = f"OpenAI Structured Outputs requires ALL properties to be in 'required' array at '{path}'." + if missing: + error_msg += f" Missing from required: {list(missing)}" + if extra: + error_msg += f" Extra in required: {list(extra)}" + raise ValidationError(error_msg) + + for prop_name, prop_schema in schema["properties"].items(): + self._validate_structured_outputs_schema(prop_schema, f"{path}.{prop_name}") + + elif schema_type == "array": + if "items" in schema: + self._validate_structured_outputs_schema(schema["items"], f"{path}[]") + + def _process_with_llm(self, query: str, content: str, llm_key: str, source_url: str, output_scheme: Dict[str, Any] = None) -> Tuple[str, Dict[str, int]]: + """ + Process scraped content with OpenAI to extract requested information + + Args: + query: What to extract from the content + content: Scraped and parsed text content + llm_key: OpenAI API key + source_url: Source URL for context + output_scheme: JSON Schema dict for structured outputs (optional) + + Returns: + Tuple of (extracted information, token usage dict) + """ + if len(content) > 15000: + beginning = content[:8000] + end = content[-4000:] + content = f"{beginning}\n\n... [middle content truncated for token efficiency] ...\n\n{end}" + elif len(content) > 12000: + content = content[:12000] + "\n\n... 
[content truncated to optimize tokens]" + + client = openai.OpenAI(api_key=llm_key) + + system_prompt = f"""You are a precise web content extraction specialist. Your task: {query} + +SOURCE: {source_url} + +INSTRUCTIONS: +1. Extract ONLY the specific information requested +2. Include relevant details (dates, numbers, names) when available +3. If requested info isn't found, briefly state what content IS available +4. Keep response concise but complete +5. Be accurate and factual""" + + user_prompt = f"CONTENT TO ANALYZE:\n\n{content}\n\nEXTRACT: {query}" + + try: + call_params = { + "model": "gpt-4o-2024-08-06", + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + "max_tokens": 1000, + "temperature": 0.1 + } + + if output_scheme: + call_params["response_format"] = { + "type": "json_schema", + "json_schema": { + "name": "extracted_content", + "strict": True, + "schema": output_scheme + } + } + logger.info("Using OpenAI Structured Outputs with provided schema") + else: + logger.info("Using regular OpenAI completion (no structured schema provided)") + + response = client.chat.completions.create(**call_params) + + if not response.choices or not response.choices[0].message.content: + raise APIError("OpenAI returned empty response") + + extracted_content = response.choices[0].message.content.strip() + + if output_scheme: + logger.info("Received structured JSON response from OpenAI") + else: + logger.info("Received text response from OpenAI") + + token_usage = { + 'prompt_tokens': response.usage.prompt_tokens, + 'completion_tokens': response.usage.completion_tokens, + 'total_tokens': response.usage.total_tokens + } + + logger.info(f"OpenAI token usage: {token_usage['total_tokens']} total ({token_usage['prompt_tokens']} prompt + {token_usage['completion_tokens']} completion)") + + return extracted_content, token_usage + + except Exception as e: + logger.error(f"OpenAI API error: {e}") + raise APIError(f"Failed to process content with LLM: {str(e)}") \ No newline at end of file diff --git a/src/api/linkedin.py b/src/api/linkedin.py new file mode 100644 index 0000000..19ede6b --- /dev/null +++ b/src/api/linkedin.py @@ -0,0 +1,803 @@ +import json +import re +import requests +from typing import Union, Dict, Any, List + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.linkedin') + + +class LinkedInAPI: + """Handles LinkedIn data collection using Bright Data's collect API""" + + DATASET_IDS = { + 'profile': 'gd_l1viktl72bvl7bjuj0', + 'company': 'gd_l1vikfnt1wgvvqz95w', + 'job': 'gd_lpfll7v5hcqtkxl6l', + 'post': 'gd_lyy3tktm25m4avu764' + } + + URL_PATTERNS = { + 'profile': re.compile(r'linkedin\.com/in/[^/?]+/?(\?.*)?$'), + 'company': re.compile(r'linkedin\.com/(company|organization-guest/company)/[^/?]+/?(\?.*)?$'), + 'job': re.compile(r'linkedin\.com/jobs/view/[^/?]+/?(\?.*)?$'), + 'post': re.compile(r'linkedin\.com/(posts|pulse)/[^/?]+/?(\?.*)?$') + } + + def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def _identify_dataset_type(self, url: str) -> str: + """ + Identify LinkedIn dataset type based on URL pattern + + Args: + url: LinkedIn URL to analyze + + Returns: + Dataset type ('profile', 'company', 'job', 'post') + + Raises: + ValidationError: If 
URL doesn't match any known LinkedIn pattern + """ + if not url or not isinstance(url, str): + raise ValidationError("URL must be a non-empty string") + + url = url.strip().lower() + for dataset_type, pattern in self.URL_PATTERNS.items(): + if pattern.search(url): + logger.debug(f"URL '{url}' identified as LinkedIn {dataset_type}") + return dataset_type + + raise ValidationError(f"URL '{url}' does not match any supported LinkedIn data type") + + def _scrape_linkedin_dataset( + self, + urls: Union[str, List[str]], + dataset_id: str, + dataset_type: str, + sync: bool = True, + timeout: int = None + ) -> Dict[str, Any]: + """ + Internal method to scrape LinkedIn data using Bright Data's collect API + + Args: + urls: Single LinkedIn URL or list of LinkedIn URLs + dataset_id: Bright Data dataset ID for the specific LinkedIn data type + dataset_type: Type of LinkedIn data (for logging purposes) + sync: If True (default), uses synchronous API for immediate results + timeout: Request timeout in seconds + + Returns: + Dict containing response with snapshot_id or direct data (if sync=True) + + Raises: + ValidationError: Invalid URL format + AuthenticationError: Invalid API token or insufficient permissions + APIError: Request failed or server error + """ + if isinstance(urls, str): + url_list = [urls] + else: + url_list = urls + + if not url_list or len(url_list) == 0: + raise ValidationError("At least one URL is required") + for url in url_list: + if not url or not isinstance(url, str): + raise ValidationError("All URLs must be non-empty strings") + + logger.info(f"Processing {len(url_list)} LinkedIn {dataset_type} URL(s) {'synchronously' if sync else 'asynchronously'}") + + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + + if sync: + api_url = "https://api.brightdata.com/datasets/v3/scrape" + data = { + "input": [{"url": url} for url in url_list] + } + params = { + "dataset_id": dataset_id, + "notify": "false", + "include_errors": "true" + } + else: + api_url = "https://api.brightdata.com/datasets/v3/trigger" + data = [{"url": url} for url in url_list] + params = { + "dataset_id": dataset_id, + "include_errors": "true" + } + + try: + if sync: + response = self.session.post( + api_url, + headers=headers, + params=params, + json=data, + timeout=timeout or 65 + ) + else: + response = self.session.post( + api_url, + headers=headers, + params=params, + json=data, + timeout=timeout or self.default_timeout + ) + + if response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code not in [200, 202]: + raise APIError(f"LinkedIn data collection request failed with status {response.status_code}: {response.text}") + + if sync: + response_text = response.text + if '\n{' in response_text and response_text.strip().startswith('{'): + json_objects = [] + for line in response_text.strip().split('\n'): + if line.strip(): + try: + json_objects.append(json.loads(line)) + except json.JSONDecodeError: + continue + result = json_objects + else: + try: + result = response.json() + except json.JSONDecodeError: + result = response_text + + logger.info(f"LinkedIn {dataset_type} data retrieved synchronously for {len(url_list)} URL(s)") + print(f"Retrieved {len(result) if isinstance(result, list) else 1} LinkedIn {dataset_type} 
record(s)") + else: + result = response.json() + snapshot_id = result.get('snapshot_id') + if snapshot_id: + logger.info(f"LinkedIn {dataset_type} data collection job initiated successfully for {len(url_list)} URL(s)") + print("") + print("Snapshot ID:") + print(snapshot_id) + print("") + + return result + + except requests.exceptions.Timeout: + raise APIError("Timeout while initiating LinkedIn data collection") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during LinkedIn data collection: {str(e)}") + except json.JSONDecodeError as e: + raise APIError(f"Failed to parse LinkedIn data collection response: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during LinkedIn data collection: {str(e)}") + + +class LinkedInScraper: + """LinkedIn data scraping interface with specialized methods for different data types""" + + def __init__(self, linkedin_api): + self.linkedin_api = linkedin_api + + def profiles(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Profile Data + + Scrapes structured data from LinkedIn profiles using the profiles dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn profile URL or list of profile URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped profile data directly. If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/in/username/` + - `https://linkedin.com/in/first-last-123456/` + + ### Example Usage: + ```python + # Single profile (synchronous - returns data immediately) + result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/elad-moshe-05a90413/") + + # Multiple profiles (synchronous - returns data immediately) + profiles = [ + "https://www.linkedin.com/in/user1/", + "https://www.linkedin.com/in/user2/" + ] + result = client.scrape_linkedin.profiles(profiles) + + # Asynchronous processing (returns snapshot_id) + result = client.scrape_linkedin.profiles(profiles, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['profile'], + 'profile', + sync, + timeout + ) + + def companies(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Company Data + + Scrapes structured data from LinkedIn company pages using the companies dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn company URL or list of company URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped company data directly. 
If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/company/company-name/` + - `https://linkedin.com/company/bright-data/` + + ### Example Usage: + ```python + # Single company (synchronous) + result = client.scrape_linkedin.companies("https://www.linkedin.com/company/bright-data/") + + # Multiple companies (synchronous) + companies = [ + "https://www.linkedin.com/company/ibm/", + "https://www.linkedin.com/company/microsoft/" + ] + result = client.scrape_linkedin.companies(companies) + + # Asynchronous processing + result = client.scrape_linkedin.companies(companies, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['company'], + 'company', + sync, + timeout + ) + + def jobs(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Job Data + + Scrapes structured data from LinkedIn job listings using the jobs dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn job URL or list of job URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped job data directly. If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/jobs/view/1234567890/` + - `https://linkedin.com/jobs/view/job-id/` + + ### Example Usage: + ```python + # Single job listing (synchronous) + result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/1234567890/") + + # Multiple job listings (synchronous) + jobs = [ + "https://www.linkedin.com/jobs/view/1111111/", + "https://www.linkedin.com/jobs/view/2222222/" + ] + result = client.scrape_linkedin.jobs(jobs) + + # Asynchronous processing + result = client.scrape_linkedin.jobs(jobs, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['job'], + 'job', + sync, + timeout + ) + + def posts(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Post Data + + Scrapes structured data from LinkedIn posts and articles using the posts dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn post URL or list of post URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped post data directly. 
If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/posts/username-activity-123456/` + - `https://www.linkedin.com/pulse/article-title-author/` + + ### Example Usage: + ```python + # Single post (synchronous) + result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/") + + # Multiple posts (synchronous) + posts = [ + "https://www.linkedin.com/posts/user1-activity-111/", + "https://www.linkedin.com/pulse/article-author/" + ] + result = client.scrape_linkedin.posts(posts) + + # Asynchronous processing + result = client.scrape_linkedin.posts(posts, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['post'], + 'post', + sync, + timeout + ) + + +class LinkedInSearcher: + """LinkedIn search interface for discovering new LinkedIn data by various criteria""" + + def __init__(self, linkedin_api): + self.linkedin_api = linkedin_api + + def profiles( + self, + first_name: Union[str, List[str]], + last_name: Union[str, List[str]], + timeout: int = None + ) -> Dict[str, Any]: + """ + ## Search LinkedIn Profiles by Name + + Discovers LinkedIn profiles by searching for first and last names. + + ### Parameters: + - `first_name` (str | List[str]): Single first name or list of first names to search for + - `last_name` (str | List[str]): Single last name or list of last names to search for + - `timeout` (int, optional): Request timeout in seconds (default: 30) + + ### Returns: + - `Dict[str, Any]`: Response containing snapshot_id for async processing + + ### Example Usage: + ```python + # Single name search (returns snapshot_id) + result = client.search_linkedin.profiles("James", "Smith") + + # Multiple names search (returns snapshot_id) + first_names = ["James", "Idan"] + last_names = ["Smith", "Vilenski"] + result = client.search_linkedin.profiles(first_names, last_names) + ``` + """ + if isinstance(first_name, str): + first_names = [first_name] + else: + first_names = first_name + + if isinstance(last_name, str): + last_names = [last_name] + else: + last_names = last_name + + if len(first_names) != len(last_names): + raise ValidationError("first_name and last_name must have the same length") + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['profile'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "name" + } + + data = [ + { + "first_name": first_names[i], + "last_name": last_names[i] + } + for i in range(len(first_names)) + ] + + return self._make_request(api_url, headers, params, data, 'profile search', len(data), timeout) + + def jobs( + self, + url: Union[str, List[str]] = None, + location: Union[str, List[str]] = None, + keyword: Union[str, List[str]] = "", + country: Union[str, List[str]] = "", + time_range: Union[str, List[str]] = "", + job_type: Union[str, List[str]] = "", + experience_level: Union[str, List[str]] = "", + remote: Union[str, List[str]] = "", + company: Union[str, List[str]] = "", + location_radius: Union[str, List[str]] = "", + selective_search: Union[bool, List[bool]] = False, + timeout: int = None + ) -> Dict[str, Any]: + """ + ## Search LinkedIn Jobs by URL or Keywords + + Discovers LinkedIn jobs either by searching specific job search URLs or by keyword criteria. + + ### Parameters: + - `url` (str | List[str], optional): LinkedIn job search URLs to scrape + - `location` (str | List[str], optional): Job location(s) - required when searching by keyword + - `keyword` (str | List[str], optional): Job keyword(s) to search for (default: "") + - `country` (str | List[str], optional): Country code(s) (default: "") + - `time_range` (str | List[str], optional): Time range filter (default: "") + - `job_type` (str | List[str], optional): Job type filter (default: "") + - `experience_level` (str | List[str], optional): Experience level filter (default: "") + - `remote` (str | List[str], optional): Remote work filter (default: "") + - `company` (str | List[str], optional): Company name filter (default: "") + - `location_radius` (str | List[str], optional): Location radius filter (default: "") + - `selective_search` (bool | List[bool], optional): Enable selective search (default: False) + - `timeout` (int, optional): Request timeout in seconds (default: 30) + + ### Returns: + - `Dict[str, Any]`: Response containing snapshot_id for async processing + + ### Example Usage: + ```python + # Search by job URLs (returns snapshot_id) + job_urls = [ + "https://www.linkedin.com/jobs/search?keywords=Software&location=Tel%20Aviv-Yafo", + "https://www.linkedin.com/jobs/reddit-inc.-jobs-worldwide?f_C=150573" + ] + result = client.search_linkedin.jobs(url=job_urls) + + # Search by keyword (returns snapshot_id) + result = client.search_linkedin.jobs( + location="Paris", + keyword="product manager", + country="FR", + time_range="Past month", + job_type="Full-time" + ) + ``` + """ + if url is not None: + return self._search_jobs_by_url(url, timeout) + elif location is not None: + return self._search_jobs_by_keyword( + location, keyword, country, time_range, job_type, + experience_level, remote, company, location_radius, + selective_search, timeout + ) + else: + raise ValidationError("Either 'url' or 'location' parameter must be provided") + + def posts( + self, + profile_url: Union[str, List[str]] = None, + company_url: Union[str, List[str]] = None, + url: Union[str, List[str]] = None, + start_date: Union[str, List[str]] = "", + end_date: Union[str, List[str]] = "", + timeout: int 
= None + ) -> Dict[str, Any]: + """ + ## Search LinkedIn Posts by Profile, Company, or General URL + + Discovers LinkedIn posts using various search methods. + + ### Parameters: + - `profile_url` (str | List[str], optional): LinkedIn profile URL(s) to get posts from + - `company_url` (str | List[str], optional): LinkedIn company URL(s) to get posts from + - `url` (str | List[str], optional): General LinkedIn URL(s) for posts + - `start_date` (str | List[str], optional): Start date filter (ISO format, default: "") + - `end_date` (str | List[str], optional): End date filter (ISO format, default: "") + - `timeout` (int, optional): Request timeout in seconds (default: 30) + + ### Returns: + - `Dict[str, Any]`: Response containing snapshot_id for async processing + + ### Example Usage: + ```python + # Search posts by profile URL with date range (returns snapshot_id) + result = client.search_linkedin.posts( + profile_url="https://www.linkedin.com/in/bettywliu", + start_date="2018-04-25T00:00:00.000Z", + end_date="2021-05-25T00:00:00.000Z" + ) + + # Search posts by company URL (returns snapshot_id) + result = client.search_linkedin.posts( + company_url="https://www.linkedin.com/company/bright-data" + ) + + # Search posts by general URL (returns snapshot_id) + result = client.search_linkedin.posts( + url="https://www.linkedin.com/posts/activity-123456" + ) + ``` + """ + if profile_url is not None: + return self._search_posts_by_profile(profile_url, start_date, end_date, timeout) + elif company_url is not None: + return self._search_posts_by_company(company_url, timeout) + elif url is not None: + return self._search_posts_by_url(url, timeout) + else: + raise ValidationError("One of 'profile_url', 'company_url', or 'url' parameter must be provided") + + def _search_jobs_by_url(self, urls, timeout): + """Search jobs by LinkedIn job search URLs""" + if isinstance(urls, str): + url_list = [urls] + else: + url_list = urls + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['job'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "url" + } + + data = [{"url": url} for url in url_list] + return self._make_request(api_url, headers, params, data, 'job search by URL', len(data), timeout) + + def _search_jobs_by_keyword(self, location, keyword, country, time_range, job_type, experience_level, remote, company, location_radius, selective_search, timeout): + """Search jobs by keyword criteria""" + params_dict = { + 'location': location, 'keyword': keyword, 'country': country, + 'time_range': time_range, 'job_type': job_type, 'experience_level': experience_level, + 'remote': remote, 'company': company, 'location_radius': location_radius, + 'selective_search': selective_search + } + + max_length = 1 + for key, value in params_dict.items(): + if isinstance(value, list): + max_length = max(max_length, len(value)) + normalized_params = {} + for key, value in params_dict.items(): + if isinstance(value, list): + if len(value) != max_length and len(value) != 1: + raise ValidationError(f"Parameter '{key}' list length must be 1 or {max_length}") + normalized_params[key] = value * max_length if len(value) == 1 else value + else: + normalized_params[key] = [value] * max_length + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['job'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "keyword" + } + + data = [] + for i in range(max_length): + data.append({ + "location": normalized_params['location'][i], + "keyword": normalized_params['keyword'][i], + "country": normalized_params['country'][i], + "time_range": normalized_params['time_range'][i], + "job_type": normalized_params['job_type'][i], + "experience_level": normalized_params['experience_level'][i], + "remote": normalized_params['remote'][i], + "company": normalized_params['company'][i], + "location_radius": normalized_params['location_radius'][i], + "selective_search": normalized_params['selective_search'][i] + }) + + return self._make_request(api_url, headers, params, data, 'job search by keyword', len(data), timeout) + + def _search_posts_by_profile(self, profile_urls, start_dates, end_dates, timeout): + """Search posts by profile URL with optional date filtering""" + if isinstance(profile_urls, str): + url_list = [profile_urls] + else: + url_list = profile_urls + + if isinstance(start_dates, str): + start_list = [start_dates] * len(url_list) + else: + start_list = start_dates if len(start_dates) == len(url_list) else [start_dates[0]] * len(url_list) + + if isinstance(end_dates, str): + end_list = [end_dates] * len(url_list) + else: + end_list = end_dates if len(end_dates) == len(url_list) else [end_dates[0]] * len(url_list) + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['post'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "profile_url" + } + + data = [] + for i in range(len(url_list)): + item = {"url": url_list[i]} + if start_list[i]: + item["start_date"] = start_list[i] + if end_list[i]: + item["end_date"] = end_list[i] + data.append(item) + + return self._make_request(api_url, headers, params, data, 'post search by profile', len(data), timeout) + + def _search_posts_by_company(self, company_urls, timeout): + """Search posts by company URL""" + if isinstance(company_urls, str): + url_list = [company_urls] + else: + url_list = company_urls + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['post'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "company_url" + } + + data = [{"url": url} for url in url_list] + return self._make_request(api_url, headers, params, data, 'post search by company', len(data), timeout) + + def _search_posts_by_url(self, urls, timeout): + """Search posts by general URL""" + if isinstance(urls, str): + url_list = [urls] + else: + url_list = urls + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['post'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "url" + } + + data = [{"url": url} for url in url_list] + return self._make_request(api_url, headers, params, data, 'post search by URL', len(data), timeout) + + def _make_request(self, api_url, headers, params, data, operation_type, count, timeout): + """Common method to make API requests (async only for search operations)""" + try: + response = self.linkedin_api.session.post( + api_url, + headers=headers, + params=params, + json=data, + timeout=timeout or self.linkedin_api.default_timeout + ) + + if response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code != 200: + raise APIError(f"LinkedIn {operation_type} request failed with status {response.status_code}: {response.text}") + + result = response.json() + snapshot_id = result.get('snapshot_id') + if snapshot_id: + logger.info(f"LinkedIn {operation_type} job initiated successfully for {count} item(s)") + print("") + print("Snapshot ID:") + print(snapshot_id) + print("") + + return result + + except requests.exceptions.Timeout: + raise APIError(f"Timeout while initiating LinkedIn {operation_type}") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during LinkedIn {operation_type}: {str(e)}") + except json.JSONDecodeError as e: + raise APIError(f"Failed to parse LinkedIn {operation_type} response: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during LinkedIn {operation_type}: {str(e)}") \ No newline at end of file diff --git a/src/api/scraper.py b/src/api/scraper.py new file mode 100644 index 0000000..0d4fc31 --- /dev/null +++ b/src/api/scraper.py @@ -0,0 +1,205 @@ +import time +from typing import Union, Dict, Any, List +from concurrent.futures import ThreadPoolExecutor, as_completed + +from ..utils import ( + validate_url, validate_zone_name, validate_country_code, + validate_timeout, validate_max_workers, validate_url_list, + validate_response_format, validate_http_method, retry_request, + get_logger, log_request, safe_json_parse, validate_response_size +) +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.scraper') + + +class WebScraper: + """Handles web scraping operations using Bright Data Web Unlocker API""" + + def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def scrape( + self, + url: Union[str, List[str]], + zone: str, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "markdown", + async_request: bool = False, + max_workers: int = 10, + timeout: int = None + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + **Unlock and scrape websites using Bright Data Web Unlocker API** + + Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass. 
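+        When a list of URLs is passed, the requests are fanned out across a thread pool and the
+        results are returned in the same order as the input, so they can be paired back with
+        their source URLs afterwards. A minimal sketch, assuming the default raw format and a
+        placeholder zone name:
+
+        ```python
+        urls = ["https://example.com", "https://example.org"]
+        pages = client.scrape(url=urls, zone="your_zone_name", max_workers=2)
+        for source, html in zip(urls, pages):  # results preserve the input URL order
+            print(source, len(html))
+        ```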
+ + **Parameters:** + - `url` (str | List[str]): Single URL string or list of URLs to scrape + - `zone` (str): Your Bright Data zone identifier + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`) + - `data_format` (str, optional): Additional format transformation (default: `"html"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + + **Returns:** + - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL + + **Example Usage:** + ```python + # Single URL scraping + result = client.scrape( + url="https://example.com", + zone="your_zone_name", + response_format="json" + ) + + # Multiple URLs scraping + urls = ["https://site1.com", "https://site2.com"] + results = client.scrape( + url=urls, + zone="your_zone_name", + response_format="raw", + max_workers=5 + ) + ``` + + **Raises:** + - `ValidationError`: Invalid URL format or empty URL list + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + + timeout = timeout or self.default_timeout + validate_zone_name(zone) + validate_response_format(response_format) + validate_http_method(method) + validate_country_code(country) + validate_timeout(timeout) + validate_max_workers(max_workers) + + if isinstance(url, list): + validate_url_list(url) + effective_max_workers = min(len(url), max_workers or 10) + + results = [None] * len(url) + + with ThreadPoolExecutor(max_workers=effective_max_workers) as executor: + future_to_index = { + executor.submit( + self._perform_single_scrape, + single_url, zone, response_format, method, country, + data_format, async_request, timeout + ): i + for i, single_url in enumerate(url) + } + for future in as_completed(future_to_index): + index = future_to_index[future] + try: + result = future.result() + results[index] = result + except Exception as e: + raise APIError(f"Failed to scrape {url[index]}: {str(e)}") + + return results + else: + validate_url(url) + return self._perform_single_scrape( + url, zone, response_format, method, country, + data_format, async_request, timeout + ) + + def _perform_single_scrape( + self, + url: str, + zone: str, + response_format: str, + method: str, + country: str, + data_format: str, + async_request: bool, + timeout: int + ) -> Union[Dict[str, Any], str]: + """ + Perform a single scrape operation with comprehensive logging + """ + endpoint = "https://api.brightdata.com/request" + start_time = time.time() + + logger.info(f"Starting scrape request for URL: {url[:100]}{'...' 
if len(url) > 100 else ''}") + + payload = { + "zone": zone, + "url": url, + "format": response_format, + "method": method, + "data_format": data_format + } + + params = {} + if async_request: + params['async'] = 'true' + + @retry_request( + max_retries=self.max_retries, + backoff_factor=self.retry_backoff, + retry_statuses={429, 500, 502, 503, 504} + ) + def make_request(): + return self.session.post( + endpoint, + json=payload, + params=params, + timeout=timeout + ) + + try: + response = make_request() + response_time = (time.time() - start_time) * 1000 + + # Log request details + log_request(logger, 'POST', endpoint, response.status_code, response_time) + + if response.status_code == 200: + logger.info(f"Scrape completed successfully in {response_time:.2f}ms") + + validate_response_size(response.text) + + if response_format == "json": + result = safe_json_parse(response.text) + logger.debug(f"Processed response with {len(str(result))} characters") + return result + else: + logger.debug(f"Returning raw response with {len(response.text)} characters") + return response.text + + elif response.status_code == 400: + logger.error(f"Bad Request (400) for URL {url}: {response.text}") + raise ValidationError(f"Bad Request (400): {response.text}") + elif response.status_code == 401: + logger.error(f"Unauthorized (401) for URL {url}: Check API token") + raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}") + elif response.status_code == 403: + logger.error(f"Forbidden (403) for URL {url}: Insufficient permissions") + raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}") + elif response.status_code == 404: + logger.error(f"Not Found (404) for URL {url}: {response.text}") + raise APIError(f"Not Found (404): {response.text}") + else: + logger.error(f"API Error ({response.status_code}) for URL {url}: {response.text}") + raise APIError(f"API Error ({response.status_code}): {response.text}", + status_code=response.status_code, response_text=response.text) + + except Exception as e: + response_time = (time.time() - start_time) * 1000 + logger.error(f"Request failed after {response_time:.2f}ms for URL {url}: {str(e)}", exc_info=True) + raise \ No newline at end of file diff --git a/src/api/search.py b/src/api/search.py new file mode 100644 index 0000000..24e6365 --- /dev/null +++ b/src/api/search.py @@ -0,0 +1,212 @@ +import json +import time +from typing import Union, Dict, Any, List +from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import quote_plus + +from ..utils import ( + validate_zone_name, validate_country_code, validate_timeout, + validate_max_workers, validate_search_engine, validate_query, + validate_response_format, validate_http_method, retry_request, + get_logger, log_request, safe_json_parse, validate_response_size +) +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.search') + + +class SearchAPI: + """Handles search operations using Bright Data SERP API""" + + def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def search( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "markdown", + async_request: bool = False, + max_workers: int = 
10, + timeout: int = None, + parse: bool = False + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + ## Search the web using Bright Data SERP API + + Performs web searches through major search engines using Bright Data's proxy network + for reliable, bot-detection-free results. + + ### Parameters: + - `query` (str | List[str]): Search query string or list of search queries + - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`) + - `zone` (str, optional): Your Bright Data zone identifier (default: `None`) + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`) + - `data_format` (str, optional): Additional format transformation (default: `"markdown"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`) + + ### Returns: + - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query + + ### Example Usage: + ```python + # Single search query + result = client.search( + query="best laptops 2024", + search_engine="google", + response_format="json" + ) + + # Multiple search queries + queries = ["python tutorials", "machine learning courses", "web development"] + results = client.search( + query=queries, + search_engine="bing", + zone="your_zone_name", + max_workers=3 + ) + ``` + + ### Supported Search Engines: + - `"google"` - Google Search + - `"bing"` - Microsoft Bing + - `"yandex"` - Yandex Search + + ### Raises: + - `ValidationError`: Invalid search engine, empty query, or validation errors + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + + timeout = timeout or self.default_timeout + validate_zone_name(zone) + validate_search_engine(search_engine) + validate_query(query) + validate_response_format(response_format) + validate_http_method(method) + validate_country_code(country) + validate_timeout(timeout) + validate_max_workers(max_workers) + + base_url_map = { + "google": "https://www.google.com/search?q=", + "bing": "https://www.bing.com/search?q=", + "yandex": "https://yandex.com/search/?text=" + } + + base_url = base_url_map[search_engine.lower()] + + if isinstance(query, list): + effective_max_workers = min(len(query), max_workers or 10) + results = [None] * len(query) + + with ThreadPoolExecutor(max_workers=effective_max_workers) as executor: + future_to_index = { + executor.submit( + self._perform_single_search, + single_query, zone, response_format, method, country, + data_format, async_request, base_url, timeout, parse + ): i + for i, single_query in enumerate(query) + } + + for future in as_completed(future_to_index): + index = future_to_index[future] + try: + result = future.result() + results[index] = result + except Exception as e: + raise APIError(f"Failed to search '{query[index]}': {str(e)}") + + return results + else: + return 
self._perform_single_search( + query, zone, response_format, method, country, + data_format, async_request, base_url, timeout, parse + ) + + def _perform_single_search( + self, + query: str, + zone: str, + response_format: str, + method: str, + country: str, + data_format: str, + async_request: bool, + base_url: str, + timeout: int, + parse: bool + ) -> Union[Dict[str, Any], str]: + """ + Perform a single search operation + """ + encoded_query = quote_plus(query) + url = f"{base_url}{encoded_query}" + + if parse: + url += "&brd_json=1" + + endpoint = "https://api.brightdata.com/request" + + payload = { + "zone": zone, + "url": url, + "format": response_format, + "method": method, + "data_format": data_format + } + + params = {} + if async_request: + params['async'] = 'true' + + @retry_request( + max_retries=self.max_retries, + backoff_factor=self.retry_backoff, + retry_statuses={429, 500, 502, 503, 504} + ) + def make_request(): + return self.session.post( + endpoint, + json=payload, + params=params, + timeout=timeout + ) + + response = make_request() + + if response.status_code == 200: + if response_format == "json": + try: + return response.json() + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON response: {e}") + return response.text + else: + return response.text + + elif response.status_code == 400: + raise ValidationError(f"Bad Request (400): {response.text}") + elif response.status_code == 401: + raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}") + elif response.status_code == 403: + raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}") + elif response.status_code == 404: + raise APIError(f"Not Found (404): {response.text}") + else: + raise APIError(f"API Error ({response.status_code}): {response.text}", + status_code=response.status_code, response_text=response.text) \ No newline at end of file From b5a6625d449de9f874cdc3e50750a9131a71079b Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:16:03 +0200 Subject: [PATCH 06/70] Delete src/api/tst --- src/api/tst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/api/tst diff --git a/src/api/tst b/src/api/tst deleted file mode 100644 index 8b13789..0000000 --- a/src/api/tst +++ /dev/null @@ -1 +0,0 @@ - From bf6605de307c39c64ca0d968d692399353c3cbda Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:16:28 +0200 Subject: [PATCH 07/70] Create tst --- src/.github/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/.github/tst diff --git a/src/.github/tst b/src/.github/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/.github/tst @@ -0,0 +1 @@ + From 9d17b0dfaae400ea175f80a696f84b6554746702 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:18:00 +0200 Subject: [PATCH 08/70] Create tst --- src/.github/workflows/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/.github/workflows/tst diff --git a/src/.github/workflows/tst b/src/.github/workflows/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/.github/workflows/tst @@ -0,0 +1 @@ + From cbb11e4067bbc735b33f9a0693d6c27fd90c91d9 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:18:21 +0200 Subject: [PATCH 09/70] Add files via upload --- 
src/.github/workflows/publish.yml | 65 +++++++++++++++ src/.github/workflows/test.yml | 129 ++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 src/.github/workflows/publish.yml create mode 100644 src/.github/workflows/test.yml diff --git a/src/.github/workflows/publish.yml b/src/.github/workflows/publish.yml new file mode 100644 index 0000000..7c2ec42 --- /dev/null +++ b/src/.github/workflows/publish.yml @@ -0,0 +1,65 @@ +name: Build and Publish + +on: + push: + tags: + - 'v*' + release: + types: [published] + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + pip install -r requirements.txt + + - name: Build package + run: python -m build + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: dist-files + path: dist/ + + - name: Publish to PyPI + if: github.event_name == 'release' + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + twine upload dist/* + + test-install: + runs-on: ubuntu-latest + needs: build + + steps: + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: dist-files + path: dist/ + + - name: Test wheel installation + run: | + pip install dist/*.whl + python -c "import brightdata; print('✅ Package imported successfully')" \ No newline at end of file diff --git a/src/.github/workflows/test.yml b/src/.github/workflows/test.yml new file mode 100644 index 0000000..69a0a2d --- /dev/null +++ b/src/.github/workflows/test.yml @@ -0,0 +1,129 @@ +name: Tests + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 2 * * *' + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov + + - name: Test package import + run: | + python -c "import brightdata; print('Import successful')" + + - name: Run tests + run: | + python -m pytest tests/ -v --cov=brightdata --cov-report=xml + + - name: Upload coverage to Codecov + if: matrix.python-version == '3.8' + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + + test-pypi-package: + runs-on: ubuntu-latest + if: github.event_name == 'schedule' + strategy: + matrix: + python-version: ['3.8', '3.11'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install PyPI package + run: | + python -m pip install --upgrade pip + pip install brightdata-sdk + pip install pytest + + - name: Test PyPI package import + run: | + python -c "import brightdata; print('PyPI package import successful')" + python -c "from brightdata import bdclient; print('bdclient import successful')" + + - name: Test PyPI package basic functionality + run: | + python -c " + import sys + from brightdata import bdclient, 
__version__ + print(f'PyPI package version: {__version__}') + + # Test that validation works (accept any validation error as success) + try: + client = bdclient(api_token='test_token_too_short') + print('WARNING: No validation error - this might indicate an issue') + except Exception as e: + print(f'Validation error caught: {str(e)[:100]}...') + print('PyPI package validation working correctly') + + # Test basic client creation with disabled auto-zone creation + try: + client = bdclient(api_token='test_token_123456789', auto_create_zones=False) + print('Client creation successful') + + # Test that basic methods exist + methods = ['scrape', 'search', 'download_content'] + for method in methods: + if hasattr(client, method): + print(f'Method {method} exists') + else: + print(f'Method {method} missing (might be version difference)') + + except Exception as e: + print(f'ERROR: Client creation failed: {e}') + sys.exit(1) + + print('PyPI package basic functionality test completed') + " + + - name: Test PyPI package compatibility + run: | + python -c " + print('Running PyPI package compatibility tests...') + + # Test import compatibility + try: + from brightdata import bdclient, __version__ + from brightdata.exceptions import ValidationError + print('Core imports working') + except ImportError as e: + print(f'ERROR: Import failed: {e}') + exit(1) + + # Test that client requires token + try: + client = bdclient() # Should fail without token + print('WARNING: Client created without token - unexpected') + except Exception: + print('Token requirement validated') + + print('PyPI package compatibility tests completed') + " \ No newline at end of file From c590d4942cf1b3c71574e28742215c3ed5355144 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:21:30 +0200 Subject: [PATCH 10/70] Create tst --- examples/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 examples/tst diff --git a/examples/tst b/examples/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/examples/tst @@ -0,0 +1 @@ + From baea61662b70cec4e73ec89e33e5367c431a141e Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:21:53 +0200 Subject: [PATCH 11/70] Add files via upload --- examples/browser_connection_example.py | 33 +++++++++++++++++++++ examples/crawl_example.py | 11 +++++++ examples/download_snapshot_example.py | 9 ++++++ examples/extract_example.py | 30 +++++++++++++++++++ examples/scrape_chatgpt_example.py | 15 ++++++++++ examples/scrape_example.py | 16 +++++++++++ examples/scrape_linkedin_example.py | 32 +++++++++++++++++++++ examples/search_example.py | 16 +++++++++++ examples/search_linkedin_example.py | 40 ++++++++++++++++++++++++++ 9 files changed, 202 insertions(+) create mode 100644 examples/browser_connection_example.py create mode 100644 examples/crawl_example.py create mode 100644 examples/download_snapshot_example.py create mode 100644 examples/extract_example.py create mode 100644 examples/scrape_chatgpt_example.py create mode 100644 examples/scrape_example.py create mode 100644 examples/scrape_linkedin_example.py create mode 100644 examples/search_example.py create mode 100644 examples/search_linkedin_example.py diff --git a/examples/browser_connection_example.py b/examples/browser_connection_example.py new file mode 100644 index 0000000..a6ebf98 --- /dev/null +++ b/examples/browser_connection_example.py @@ -0,0 +1,33 @@ +import sys, os +sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient +from playwright.sync_api import sync_playwright, Playwright + +client = bdclient( + api_token="your-api-key", + browser_username="copy-from-zone-configuration", + browser_password="copy-from-zone-configuration", + browser_zone="your-custom-browser-zone" +) # Hover over the function to see browser parameters (can also be taken from .env file) + +def scrape(playwright: Playwright, url="https://example.com"): + browser = playwright.chromium.connect_over_cdp(client.connect_browser()) # Connect to the browser using Bright Data's endpoint + try: + print(f'Connected! Navigating to {url}...') + page = browser.new_page() + page.goto(url, timeout=2*60_000) + print('Navigated! Scraping page content...') + data = page.content() + print(f'Scraped! Data: {data}') + finally: + browser.close() + + +def main(): + with sync_playwright() as playwright: + scrape(playwright) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/crawl_example.py b/examples/crawl_example.py new file mode 100644 index 0000000..65b2695 --- /dev/null +++ b/examples/crawl_example.py @@ -0,0 +1,11 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient +client = bdclient(api_token="your-api-key") # can also be taken from .env file + +result = client.crawl( + url="https://example.com/", depth=1, filter="/product/", + exclude_filter="/ads/", custom_output_fields=["markdown", "url", "page_title"] +) +print(f"Snapshot ID: {result['snapshot_id']}") \ No newline at end of file diff --git a/examples/download_snapshot_example.py b/examples/download_snapshot_example.py new file mode 100644 index 0000000..ea7f8f0 --- /dev/null +++ b/examples/download_snapshot_example.py @@ -0,0 +1,9 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from brightdata import bdclient + +client = bdclient(api_token="your-api-key") # can also be taken from .env file + +snapshot_id = "" # replace with your snapshot ID + +client.download_snapshot(snapshot_id) \ No newline at end of file diff --git a/examples/extract_example.py b/examples/extract_example.py new file mode 100644 index 0000000..0723350 --- /dev/null +++ b/examples/extract_example.py @@ -0,0 +1,30 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient + +client = bdclient() + +# Basic extraction +result = client.extract("Extract news headlines from CNN.com") +print(result) + +# Using URL parameter with structured output +schema = { + "type": "object", + "properties": { + "headlines": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["headlines"], + "additionalProperties": False +} + +result = client.extract( + query="Extract main headlines", + url="https://cnn.com", + output_scheme=schema +) +print(result) \ No newline at end of file diff --git a/examples/scrape_chatgpt_example.py b/examples/scrape_chatgpt_example.py new file mode 100644 index 0000000..b695734 --- /dev/null +++ b/examples/scrape_chatgpt_example.py @@ -0,0 +1,15 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient + +client = bdclient("your-api-key") # can also be taken from .env file + +result = client.search_chatGPT( + prompt="what day is it today?" 
+ # prompt=["What are the top 3 programming languages in 2024?", "Best hotels in New York", "Explain quantum computing"], + # additional_prompt=["Can you explain why?", "Are you sure?", ""] +) + +client.download_content(result) +# In case of timeout error, your snapshot is still created and can be downloaded using the snapshot ID example file diff --git a/examples/scrape_example.py b/examples/scrape_example.py new file mode 100644 index 0000000..bf6b1a8 --- /dev/null +++ b/examples/scrape_example.py @@ -0,0 +1,16 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient + +client = bdclient(api_token="your-API-key") # Can also be taken from .env file + +URL = (["https://www.amazon.com/dp/B079QHML21", + "https://www.ebay.com/itm/365771796300", + "https://www.walmart.com/ip/Apple-MacBook-Air-13-3-inch-Laptop-Space-Gray-M1-Chip-8GB-RAM-256GB-storage/609040889"]) + +results = client.scrape(url=URL, max_workers=5) + +result = client.parse_content(results, extract_text=True) # Choose what to extract + +print(result) \ No newline at end of file diff --git a/examples/scrape_linkedin_example.py b/examples/scrape_linkedin_example.py new file mode 100644 index 0000000..8483f5a --- /dev/null +++ b/examples/scrape_linkedin_example.py @@ -0,0 +1,32 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient + +client = bdclient() # can also be taken from .env file + +# LinkedIn Profile URLs +profile_url = "https://www.linkedin.com/in/elad-moshe-05a90413/" + +# LinkedIn Company URLs +company_urls = [ + "https://il.linkedin.com/company/ibm", + "https://www.linkedin.com/company/bright-data", + "https://www.linkedin.com/company/stalkit" +] + +# LinkedIn Job URLs +job_urls = [ + "https://www.linkedin.com/jobs/view/remote-typist-%E2%80%93-data-entry-specialist-work-from-home-at-cwa-group-4181034038?trk=public_jobs_topcard-title", + "https://www.linkedin.com/jobs/view/arrt-r-at-shared-imaging-llc-4180989163?trk=public_jobs_topcard-title" +] + +# LinkedIn Post URLs +post_urls = [ + "https://www.linkedin.com/posts/orlenchner_scrapecon-activity-7180537307521769472-oSYN?trk=public_profile", + "https://www.linkedin.com/pulse/getting-value-out-sunburst-guillaume-de-b%C3%A9naz%C3%A9?trk=public_profile_article_view" +] + +results = client.scrape_linkedin.posts(post_urls) # can also be changed to async + +client.download_content(results) \ No newline at end of file diff --git a/examples/search_example.py b/examples/search_example.py new file mode 100644 index 0000000..3b9e3eb --- /dev/null +++ b/examples/search_example.py @@ -0,0 +1,16 @@ +import sys, os + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient + +client = bdclient(api_token="your-api-token", auto_create_zones=False, serp_zone="your-custom-serp-zone") # zone and API token can also be defined in .env file + +query = ["iphone 16", "coffee maker", "portable projector", "sony headphones", + "laptop stand", "power bank", "running shoes", "android tablet", + "hiking backpack", "dash cam"] + +results = client.search(query, max_workers=10, +response_format="json", parse=True) + +client.download_content(results, parse=True) # parse=True to save as JSON, otherwise saves as raw HTML \ No newline at end of file diff --git a/examples/search_linkedin_example.py b/examples/search_linkedin_example.py new file mode 100644 index 0000000..be5f7df --- /dev/null 
+++ b/examples/search_linkedin_example.py @@ -0,0 +1,40 @@ +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from brightdata import bdclient + +client = bdclient(api_token="your-api-key") # can also be taken from .env file + +# Search LinkedIn profiles by name +first_names = ["James", "Idan"] +last_names = ["Smith", "Vilenski"] +result = client.search_linkedin.profiles(first_names, last_names) + +# Search jobs by URL +job_urls = [ + "https://www.linkedin.com/jobs/search?keywords=Software&location=Tel%20Aviv-Yafo", + "https://www.linkedin.com/jobs/reddit-inc.-jobs-worldwide?f_C=150573" +] +result = client.search_linkedin.jobs(url=job_urls) + +# Search jobs by keyword and location +result = client.search_linkedin.jobs( + location="Paris", + keyword="product manager", + country="FR", + time_range="Past month", + job_type="Full-time" +) + +# Search posts by profile URL with date range +result = client.search_linkedin.posts( + profile_url="https://www.linkedin.com/in/bettywliu", + start_date="2018-04-25T00:00:00.000Z", + end_date="2021-05-25T00:00:00.000Z" +) +# Search posts by company URL +result = client.search_linkedin.posts( + company_url="https://www.linkedin.com/company/bright-data" +) + +# Returns snapshot ID that can be used to download the content later using download_snapshot function \ No newline at end of file From 468a96f19d94be277cf800ce081a35fbfd7b047d Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:22:22 +0200 Subject: [PATCH 12/70] Delete src/.github/tst --- src/.github/tst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/.github/tst diff --git a/src/.github/tst b/src/.github/tst deleted file mode 100644 index 8b13789..0000000 --- a/src/.github/tst +++ /dev/null @@ -1 +0,0 @@ - From f469f81e3f83fcfa72a7b42276ba64ebd7435440 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:24:44 +0200 Subject: [PATCH 13/70] Create tst --- tests/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/tst diff --git a/tests/tst b/tests/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/tst @@ -0,0 +1 @@ + From a6a8e5ae925c0bd6c7e80566add2c768862ff069 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:25:03 +0200 Subject: [PATCH 14/70] Add files via upload --- tests/__init__.py | 0 tests/test_client.py | 121 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_client.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_client.py b/tests/test_client.py new file mode 100644 index 0000000..51b1315 --- /dev/null +++ b/tests/test_client.py @@ -0,0 +1,121 @@ +""" +Comprehensive tests for the Bright Data SDK client. 
+ +This test suite covers: +- Client initialization with API tokens (from parameter and environment) +- API token validation and error handling for missing tokens +- Zone configuration (default and custom zone names) +- URL validation in scrape method (scheme requirement) +- Search query validation (empty query handling) +- Search engine validation (unsupported engine handling) + +All tests are designed to run without requiring real API tokens by: +- Using sufficiently long test tokens to pass validation +- Mocking zone management to avoid network calls +- Testing validation logic and error messages +""" + +import pytest +import os +from unittest.mock import patch + +from brightdata import bdclient +from brightdata.exceptions import ValidationError + + +class TestBdClient: + """Test cases for the main bdclient class""" + + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def test_client_init_with_token(self, mock_zones): + """Test client initialization with API token""" + with patch.dict(os.environ, {}, clear=True): + client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False) + assert client.api_token == "valid_test_token_12345678" + + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def test_client_init_from_env(self, mock_zones): + """Test client initialization from environment variable""" + with patch.dict(os.environ, {"BRIGHTDATA_API_TOKEN": "valid_env_token_12345678"}): + client = bdclient(auto_create_zones=False) + assert client.api_token == "valid_env_token_12345678" + + def test_client_init_no_token_raises_error(self): + """Test that missing API token raises ValidationError""" + with patch.dict(os.environ, {}, clear=True): + with patch('dotenv.load_dotenv'): + with pytest.raises(ValidationError, match="API token is required"): + bdclient() + + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def test_client_zone_defaults(self, mock_zones): + """Test default zone configurations""" + with patch.dict(os.environ, {}, clear=True): + client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False) + assert client.web_unlocker_zone == "sdk_unlocker" + assert client.serp_zone == "sdk_serp" + + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def test_client_custom_zones(self, mock_zones): + """Test custom zone configuration""" + with patch.dict(os.environ, {}, clear=True): + client = bdclient( + api_token="valid_test_token_12345678", + web_unlocker_zone="custom_unlocker", + serp_zone="custom_serp", + auto_create_zones=False + ) + assert client.web_unlocker_zone == "custom_unlocker" + assert client.serp_zone == "custom_serp" + + +class TestClientMethods: + """Test cases for client methods with mocked responses""" + + @pytest.fixture + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def client(self, mock_zones): + """Create a test client with mocked validation""" + with patch.dict(os.environ, {}, clear=True): + client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False) + return client + + def test_scrape_single_url_validation(self, client): + """Test URL validation in scrape method""" + with pytest.raises(ValidationError, match="URL must include a scheme"): + client.scrape("not_a_url") + + def test_search_empty_query_validation(self, client): + """Test query validation in search method""" + with pytest.raises(ValidationError, match="cannot be empty"): + client.search("") + + def 
test_search_unsupported_engine(self, client): + """Test unsupported search engine validation""" + with pytest.raises(ValidationError, match="Invalid search engine"): + client.search("test query", search_engine="invalid_engine") + + def test_search_with_parse_parameter(self, client, monkeypatch): + """Test search with parse parameter adds brd_json=1 to URL""" + # Mock the session.post method to capture the request + captured_request = {} + + def mock_post(*args, **kwargs): + captured_request.update(kwargs) + from unittest.mock import Mock + response = Mock() + response.status_code = 200 + response.text = "mocked html response" + return response + + monkeypatch.setattr(client.search_api.session, 'post', mock_post) + + result = client.search("test query", parse=True) + + # Verify the request was made with correct URL containing &brd_json=1 + request_data = captured_request.get('json', {}) + assert "&brd_json=1" in request_data["url"] + + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file From 35e632a0593432597280737cfb03584a7b71ff3a Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:25:22 +0200 Subject: [PATCH 15/70] Delete tests/tst --- tests/tst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tests/tst diff --git a/tests/tst b/tests/tst deleted file mode 100644 index 8b13789..0000000 --- a/tests/tst +++ /dev/null @@ -1 +0,0 @@ - From b54f9f067e3b7a04561e91ec61a72c0443ee9742 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:26:20 +0200 Subject: [PATCH 16/70] Create tst --- .github/workflows/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/workflows/tst diff --git a/.github/workflows/tst b/.github/workflows/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.github/workflows/tst @@ -0,0 +1 @@ + From 6d625f4b0017177206308683c0ac34106fe20a18 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:26:40 +0200 Subject: [PATCH 17/70] Add files via upload --- .github/workflows/publish.yml | 65 +++++++++++++++++ .github/workflows/test.yml | 129 ++++++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..7c2ec42 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,65 @@ +name: Build and Publish + +on: + push: + tags: + - 'v*' + release: + types: [published] + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + pip install -r requirements.txt + + - name: Build package + run: python -m build + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: dist-files + path: dist/ + + - name: Publish to PyPI + if: github.event_name == 'release' + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + twine upload dist/* + + test-install: + runs-on: ubuntu-latest + needs: build + + steps: + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Download build artifacts 
+ uses: actions/download-artifact@v4 + with: + name: dist-files + path: dist/ + + - name: Test wheel installation + run: | + pip install dist/*.whl + python -c "import brightdata; print('✅ Package imported successfully')" \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..69a0a2d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,129 @@ +name: Tests + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + schedule: + - cron: '0 2 * * *' + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-cov + + - name: Test package import + run: | + python -c "import brightdata; print('Import successful')" + + - name: Run tests + run: | + python -m pytest tests/ -v --cov=brightdata --cov-report=xml + + - name: Upload coverage to Codecov + if: matrix.python-version == '3.8' + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + + test-pypi-package: + runs-on: ubuntu-latest + if: github.event_name == 'schedule' + strategy: + matrix: + python-version: ['3.8', '3.11'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install PyPI package + run: | + python -m pip install --upgrade pip + pip install brightdata-sdk + pip install pytest + + - name: Test PyPI package import + run: | + python -c "import brightdata; print('PyPI package import successful')" + python -c "from brightdata import bdclient; print('bdclient import successful')" + + - name: Test PyPI package basic functionality + run: | + python -c " + import sys + from brightdata import bdclient, __version__ + print(f'PyPI package version: {__version__}') + + # Test that validation works (accept any validation error as success) + try: + client = bdclient(api_token='test_token_too_short') + print('WARNING: No validation error - this might indicate an issue') + except Exception as e: + print(f'Validation error caught: {str(e)[:100]}...') + print('PyPI package validation working correctly') + + # Test basic client creation with disabled auto-zone creation + try: + client = bdclient(api_token='test_token_123456789', auto_create_zones=False) + print('Client creation successful') + + # Test that basic methods exist + methods = ['scrape', 'search', 'download_content'] + for method in methods: + if hasattr(client, method): + print(f'Method {method} exists') + else: + print(f'Method {method} missing (might be version difference)') + + except Exception as e: + print(f'ERROR: Client creation failed: {e}') + sys.exit(1) + + print('PyPI package basic functionality test completed') + " + + - name: Test PyPI package compatibility + run: | + python -c " + print('Running PyPI package compatibility tests...') + + # Test import compatibility + try: + from brightdata import bdclient, __version__ + from brightdata.exceptions import ValidationError + print('Core imports working') + except ImportError as e: + print(f'ERROR: Import failed: {e}') + exit(1) + + # Test that client requires token + try: + client = bdclient() # 
Should fail without token + print('WARNING: Client created without token - unexpected') + except Exception: + print('Token requirement validated') + + print('PyPI package compatibility tests completed') + " \ No newline at end of file From 3407ff9a10868472e342d88722e52730703d004b Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:26:58 +0200 Subject: [PATCH 18/70] Delete src/.github/workflows directory --- src/.github/workflows/publish.yml | 65 --------------- src/.github/workflows/test.yml | 129 ------------------------------ src/.github/workflows/tst | 1 - 3 files changed, 195 deletions(-) delete mode 100644 src/.github/workflows/publish.yml delete mode 100644 src/.github/workflows/test.yml delete mode 100644 src/.github/workflows/tst diff --git a/src/.github/workflows/publish.yml b/src/.github/workflows/publish.yml deleted file mode 100644 index 7c2ec42..0000000 --- a/src/.github/workflows/publish.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: Build and Publish - -on: - push: - tags: - - 'v*' - release: - types: [published] - workflow_dispatch: - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build twine - pip install -r requirements.txt - - - name: Build package - run: python -m build - - - name: Upload build artifacts - uses: actions/upload-artifact@v4 - with: - name: dist-files - path: dist/ - - - name: Publish to PyPI - if: github.event_name == 'release' - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - twine upload dist/* - - test-install: - runs-on: ubuntu-latest - needs: build - - steps: - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - - - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - name: dist-files - path: dist/ - - - name: Test wheel installation - run: | - pip install dist/*.whl - python -c "import brightdata; print('✅ Package imported successfully')" \ No newline at end of file diff --git a/src/.github/workflows/test.yml b/src/.github/workflows/test.yml deleted file mode 100644 index 69a0a2d..0000000 --- a/src/.github/workflows/test.yml +++ /dev/null @@ -1,129 +0,0 @@ -name: Tests - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main ] - schedule: - - cron: '0 2 * * *' - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install pytest pytest-cov - - - name: Test package import - run: | - python -c "import brightdata; print('Import successful')" - - - name: Run tests - run: | - python -m pytest tests/ -v --cov=brightdata --cov-report=xml - - - name: Upload coverage to Codecov - if: matrix.python-version == '3.8' - uses: codecov/codecov-action@v3 - with: - file: ./coverage.xml - - test-pypi-package: - runs-on: ubuntu-latest - if: github.event_name == 'schedule' - strategy: - matrix: - python-version: ['3.8', '3.11'] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ 
matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install PyPI package - run: | - python -m pip install --upgrade pip - pip install brightdata-sdk - pip install pytest - - - name: Test PyPI package import - run: | - python -c "import brightdata; print('PyPI package import successful')" - python -c "from brightdata import bdclient; print('bdclient import successful')" - - - name: Test PyPI package basic functionality - run: | - python -c " - import sys - from brightdata import bdclient, __version__ - print(f'PyPI package version: {__version__}') - - # Test that validation works (accept any validation error as success) - try: - client = bdclient(api_token='test_token_too_short') - print('WARNING: No validation error - this might indicate an issue') - except Exception as e: - print(f'Validation error caught: {str(e)[:100]}...') - print('PyPI package validation working correctly') - - # Test basic client creation with disabled auto-zone creation - try: - client = bdclient(api_token='test_token_123456789', auto_create_zones=False) - print('Client creation successful') - - # Test that basic methods exist - methods = ['scrape', 'search', 'download_content'] - for method in methods: - if hasattr(client, method): - print(f'Method {method} exists') - else: - print(f'Method {method} missing (might be version difference)') - - except Exception as e: - print(f'ERROR: Client creation failed: {e}') - sys.exit(1) - - print('PyPI package basic functionality test completed') - " - - - name: Test PyPI package compatibility - run: | - python -c " - print('Running PyPI package compatibility tests...') - - # Test import compatibility - try: - from brightdata import bdclient, __version__ - from brightdata.exceptions import ValidationError - print('Core imports working') - except ImportError as e: - print(f'ERROR: Import failed: {e}') - exit(1) - - # Test that client requires token - try: - client = bdclient() # Should fail without token - print('WARNING: Client created without token - unexpected') - except Exception: - print('Token requirement validated') - - print('PyPI package compatibility tests completed') - " \ No newline at end of file diff --git a/src/.github/workflows/tst b/src/.github/workflows/tst deleted file mode 100644 index 8b13789..0000000 --- a/src/.github/workflows/tst +++ /dev/null @@ -1 +0,0 @@ - From e7a5e316efa02f6b447b30009f070bc5d3c8aac7 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:27:17 +0200 Subject: [PATCH 19/70] Delete .github/workflows/tst --- .github/workflows/tst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .github/workflows/tst diff --git a/.github/workflows/tst b/.github/workflows/tst deleted file mode 100644 index 8b13789..0000000 --- a/.github/workflows/tst +++ /dev/null @@ -1 +0,0 @@ - From 35f434b0a5ad6400ae37e34da642b78cf6b49a16 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:35:08 +0200 Subject: [PATCH 20/70] Create tst --- src/utils/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/utils/tst diff --git a/src/utils/tst b/src/utils/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/utils/tst @@ -0,0 +1 @@ + From 916467aa1209d846d639c19d88ca5803cb29e225 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:35:31 +0200 Subject: [PATCH 21/70] Add files 
via upload --- src/utils/__init__.py | 35 +++++ src/utils/logging_config.py | 177 +++++++++++++++++++++ src/utils/parser.py | 264 ++++++++++++++++++++++++++++++++ src/utils/response_validator.py | 49 ++++++ src/utils/retry.py | 90 +++++++++++ src/utils/validation.py | 183 ++++++++++++++++++++++ src/utils/zone_manager.py | 174 +++++++++++++++++++++ 7 files changed, 972 insertions(+) create mode 100644 src/utils/__init__.py create mode 100644 src/utils/logging_config.py create mode 100644 src/utils/parser.py create mode 100644 src/utils/response_validator.py create mode 100644 src/utils/retry.py create mode 100644 src/utils/validation.py create mode 100644 src/utils/zone_manager.py diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..75a2f6c --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,35 @@ +from .validation import ( + validate_url, validate_zone_name, validate_country_code, + validate_timeout, validate_max_workers, validate_url_list, + validate_search_engine, validate_query, validate_response_format, + validate_http_method +) +from .retry import retry_request +from .zone_manager import ZoneManager +from .logging_config import setup_logging, get_logger, log_request +from .response_validator import safe_json_parse, validate_response_size, check_response_not_empty +from .parser import parse_content, parse_multiple, extract_structured_data + +__all__ = [ + 'validate_url', + 'validate_zone_name', + 'validate_country_code', + 'validate_timeout', + 'validate_max_workers', + 'validate_url_list', + 'validate_search_engine', + 'validate_query', + 'validate_response_format', + 'validate_http_method', + 'retry_request', + 'ZoneManager', + 'setup_logging', + 'get_logger', + 'log_request', + 'safe_json_parse', + 'validate_response_size', + 'check_response_not_empty', + 'parse_content', + 'parse_multiple', + 'extract_structured_data' +] \ No newline at end of file diff --git a/src/utils/logging_config.py b/src/utils/logging_config.py new file mode 100644 index 0000000..89289da --- /dev/null +++ b/src/utils/logging_config.py @@ -0,0 +1,177 @@ +""" +Structured logging configuration for Bright Data SDK +""" +import logging +import json +import time +from typing import Dict, Any +import uuid + + +class StructuredFormatter(logging.Formatter): + """Custom formatter that outputs structured JSON logs""" + + def __init__(self): + super().__init__() + self.start_time = time.time() + + def format(self, record): + log_data = { + 'timestamp': self.formatTime(record), + 'level': record.levelname, + 'logger': record.name, + 'message': record.getMessage(), + 'module': record.module, + 'function': record.funcName, + 'line': record.lineno + } + + correlation_id = getattr(record, 'correlation_id', None) + if correlation_id: + log_data['correlation_id'] = correlation_id + + if hasattr(record, 'url'): + log_data['url'] = record.url + if hasattr(record, 'method'): + log_data['method'] = record.method + if hasattr(record, 'status_code'): + log_data['status_code'] = record.status_code + if hasattr(record, 'response_time'): + log_data['response_time_ms'] = record.response_time + + if record.exc_info: + log_data['exception'] = { + 'type': record.exc_info[0].__name__ if record.exc_info[0] else None, + 'message': str(record.exc_info[1]) if record.exc_info[1] else None, + 'traceback': self.formatException(record.exc_info) + } + + log_data = self._sanitize_log_data(log_data) + + return json.dumps(log_data, default=str) + + def _sanitize_log_data(self, log_data: Dict[str, Any]) -> 
Dict[str, Any]: + """Remove or mask sensitive information from log data""" + sensitive_keys = ['authorization', 'token', 'api_token', 'password', 'secret'] + + def sanitize_value(key: str, value: Any) -> Any: + if isinstance(key, str) and any(sensitive in key.lower() for sensitive in sensitive_keys): + return "***REDACTED***" + elif isinstance(value, str) and len(value) > 20: + if value.isalnum() and len(value) > 32: + return f"{value[:8]}***REDACTED***{value[-4:]}" + return value + + def recursive_sanitize(obj): + if isinstance(obj, dict): + return {k: recursive_sanitize(sanitize_value(k, v)) for k, v in obj.items()} + elif isinstance(obj, list): + return [recursive_sanitize(item) for item in obj] + else: + return obj + + return recursive_sanitize(log_data) + + +def setup_logging(level: str = "INFO", structured: bool = True, verbose: bool = True) -> None: + """ + Setup logging configuration for the SDK + + Args: + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + structured: Whether to use structured JSON logging + verbose: Whether to show verbose logging (default: True) + When False, only WARNING and above are shown + When True, uses the specified level + """ + if not verbose: + log_level = logging.WARNING + else: + log_level = getattr(logging, level.upper(), logging.INFO) + + root_logger = logging.getLogger('brightdata') + root_logger.handlers.clear() + + handler = logging.StreamHandler() + handler.setLevel(log_level) + + if structured: + formatter = StructuredFormatter() + else: + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + handler.setFormatter(formatter) + root_logger.addHandler(handler) + root_logger.setLevel(log_level) + + root_logger.propagate = False + + +def get_logger(name: str) -> logging.Logger: + """ + Get a logger instance with the specified name + + Args: + name: Logger name + + Returns: + Configured logger instance + """ + return logging.getLogger(f'brightdata.{name}') + + +def log_request(logger: logging.Logger, method: str, url: str, + status_code: int = None, response_time: float = None, + correlation_id: str = None) -> None: + """ + Log HTTP request details + + Args: + logger: Logger instance + method: HTTP method + url: Request URL (will be sanitized) + status_code: HTTP response status code + response_time: Response time in milliseconds + correlation_id: Request correlation ID + """ + extra = { + 'method': method, + 'url': _sanitize_url(url), + 'correlation_id': correlation_id or str(uuid.uuid4()) + } + + if status_code is not None: + extra['status_code'] = status_code + if response_time is not None: + extra['response_time'] = response_time + + if status_code and status_code >= 400: + logger.error(f"HTTP request failed: {method} {_sanitize_url(url)}", extra=extra) + else: + logger.info(f"HTTP request: {method} {_sanitize_url(url)}", extra=extra) + + +def _sanitize_url(url: str) -> str: + """Sanitize URL to remove sensitive query parameters""" + try: + from urllib.parse import urlparse, parse_qs, urlencode, urlunparse + + parsed = urlparse(url) + query_params = parse_qs(parsed.query) + + sensitive_params = ['token', 'api_key', 'secret', 'password'] + for param in sensitive_params: + if param in query_params: + query_params[param] = ['***REDACTED***'] + + sanitized_query = urlencode(query_params, doseq=True) + sanitized = urlunparse(( + parsed.scheme, parsed.netloc, parsed.path, + parsed.params, sanitized_query, parsed.fragment + )) + + return sanitized + except Exception: + return url.split('?')[0] + 
('?***PARAMS_REDACTED***' if '?' in url else '') \ No newline at end of file diff --git a/src/utils/parser.py b/src/utils/parser.py new file mode 100644 index 0000000..686ad39 --- /dev/null +++ b/src/utils/parser.py @@ -0,0 +1,264 @@ +""" +Content parsing utilities for Bright Data SDK responses + +Provides functions to extract and parse content from scraping and search results. +""" +import json +import re +from typing import Any, Dict, List, Union, Optional + +from bs4 import BeautifulSoup + + +def parse_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """ + Parse content from Bright Data API responses + + Automatically detects and handles both single and multiple results from scrape/search operations. + Can be used as a standalone function or called from the client. + + Args: + data: Response data from scrape() or search() - can be JSON dict/list or HTML string + extract_text: Extract clean text content (default: True) + extract_links: Extract all links from content (default: False) + extract_images: Extract image URLs from content (default: False) + + Returns: + Dict containing parsed content for single results, or List[Dict] for multiple results with keys: + - 'type': 'json' or 'html' + - 'text': Cleaned text content (if extract_text=True) + - 'links': List of extracted links (if extract_links=True) + - 'images': List of image URLs (if extract_images=True) + - 'title': Page title (if available) + - 'raw_length': Length of original content + - 'structured_data': Original JSON data (if type='json') + """ + if _is_multiple_results(data): + return parse_multiple(data, extract_text=extract_text, extract_links=extract_links, extract_images=extract_images) + + return _parse_single_content(data, extract_text, extract_links, extract_images) + + +def parse_multiple(data_list: List[Union[str, Dict]], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> List[Dict[str, Any]]: + """ + Parse multiple content items (useful for batch scraping results) + + Args: + data_list: List of response data items + extract_text: Extract clean text content (default: True) + extract_links: Extract all links from content (default: False) + extract_images: Extract image URLs from content (default: False) + + Returns: + List of parsed content dictionaries + """ + if not isinstance(data_list, list): + return [] + + return [_parse_single_content(item, extract_text, extract_links, extract_images) for item in data_list] + + +def _is_multiple_results(data: Union[str, Dict, List]) -> bool: + """ + Detect if data contains multiple scraping/search results + + Args: + data: Response data to analyze + + Returns: + True if data appears to be multiple results, False otherwise + """ + if not isinstance(data, list): + return False + + if len(data) <= 1: + return False + + multiple_result_indicators = 0 + + for item in data[:3]: + if isinstance(item, dict): + common_keys = {'html', 'body', 'content', 'page_html', 'raw_html', 'url', 'status_code'} + if any(key in item for key in common_keys): + multiple_result_indicators += 1 + elif isinstance(item, str) and len(item) > 100: + if '= 2 + + +def _parse_single_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Dict[str, Any]: + """ + Parse single content item from Bright Data API responses + + Args: + data: Single response data item - can be JSON dict or 
HTML string + extract_text: Extract clean text content (default: True) + extract_links: Extract all links from content (default: False) + extract_images: Extract image URLs from content (default: False) + + Returns: + Dict containing parsed content + """ + result = { + 'type': None, + 'raw_length': 0, + 'title': None + } + + if data is None: + return result + + if isinstance(data, (dict, list)): + result['type'] = 'json' + result['structured_data'] = data + result['raw_length'] = len(str(data)) + + html_content = _extract_html_from_json(data) + if html_content and (extract_text or extract_links or extract_images): + _parse_html_content(html_content, result, extract_text, extract_links, extract_images) + + result['title'] = _extract_title_from_json(data) + + elif isinstance(data, str): + result['type'] = 'html' + result['raw_length'] = len(data) + + if extract_text or extract_links or extract_images: + _parse_html_content(data, result, extract_text, extract_links, extract_images) + + return result + + +def extract_structured_data(data: Union[str, Dict, List]) -> Optional[Dict]: + """ + Extract structured data (JSON-LD, microdata) from content + + Args: + data: Response data + + Returns: + Structured data if found, None otherwise + """ + html_content = None + + if isinstance(data, str): + html_content = data + elif isinstance(data, (dict, list)): + html_content = _extract_html_from_json(data) + + if not html_content: + return None + + try: + soup = BeautifulSoup(html_content, 'html.parser') + + scripts = soup.find_all('script', type='application/ld+json') + if scripts: + structured_data = [] + for script in scripts: + try: + data = json.loads(script.string) + structured_data.append(data) + except json.JSONDecodeError: + continue + if structured_data: + return {'json_ld': structured_data} + + except Exception: + pass + + return None + + +def _extract_html_from_json(data: Union[Dict, List]) -> Optional[str]: + """Extract HTML content from JSON response structure""" + if isinstance(data, dict): + html_keys = ['html', 'body', 'content', 'page_html', 'raw_html'] + for key in html_keys: + if key in data and isinstance(data[key], str): + return data[key] + + for value in data.values(): + if isinstance(value, (dict, list)): + html = _extract_html_from_json(value) + if html: + return html + + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + html = _extract_html_from_json(item) + if html: + return html + + return None + + +def _extract_title_from_json(data: Union[Dict, List]) -> Optional[str]: + """Extract title from JSON response structure""" + if isinstance(data, dict): + title_keys = ['title', 'page_title', 'name'] + for key in title_keys: + if key in data and isinstance(data[key], str): + return data[key].strip() + + for value in data.values(): + if isinstance(value, (dict, list)): + title = _extract_title_from_json(value) + if title: + return title + + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + title = _extract_title_from_json(item) + if title: + return title + + return None + + +def _parse_html_content(html: str, result: Dict, extract_text: bool, extract_links: bool, extract_images: bool): + """Parse HTML content and update result dictionary""" + try: + soup = BeautifulSoup(html, 'html.parser') + + if not result.get('title'): + title_tag = soup.find('title') + if title_tag: + result['title'] = title_tag.get_text().strip() + + if extract_text: + for script in soup(["script", "style"]): + script.decompose() + + 
text = soup.get_text() + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + result['text'] = '\n'.join(chunk for chunk in chunks if chunk) + + if extract_links: + links = [] + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'] + text = a_tag.get_text().strip() + links.append({'url': href, 'text': text}) + result['links'] = links + + if extract_images: + images = [] + for img_tag in soup.find_all('img', src=True): + src = img_tag['src'] + alt = img_tag.get('alt', '').strip() + images.append({'url': src, 'alt': alt}) + result['images'] = images + + except Exception as e: + if extract_text: + result['text'] = f"HTML parsing failed: {str(e)}" + if extract_links: + result['links'] = [] + if extract_images: + result['images'] = [] \ No newline at end of file diff --git a/src/utils/response_validator.py b/src/utils/response_validator.py new file mode 100644 index 0000000..83a9aa7 --- /dev/null +++ b/src/utils/response_validator.py @@ -0,0 +1,49 @@ +""" +Minimal response validation utilities for Bright Data SDK +""" +import json +from typing import Any, Dict, Union +from ..exceptions import ValidationError + + +def safe_json_parse(response_text: str) -> Dict[str, Any]: + """ + Safely parse JSON response with minimal validation + + Args: + response_text: Raw response text from API + + Returns: + Parsed JSON data or original text if parsing fails + """ + if not response_text: + return {} + + try: + return json.loads(response_text) + except (json.JSONDecodeError, TypeError): + # Return original text if JSON parsing fails + return response_text + + +def validate_response_size(response_text: str, max_size_mb: float = 100.0) -> None: + """ + Quick size check to prevent memory issues + + Args: + response_text: Response text to validate + max_size_mb: Maximum allowed size in megabytes + """ + if response_text and len(response_text) > (max_size_mb * 1024 * 1024): + raise ValidationError(f"Response too large (>{max_size_mb}MB)") + + +def check_response_not_empty(data: Any) -> None: + """ + Minimal check that response contains data + + Args: + data: Response data to check + """ + if data is None or (isinstance(data, str) and len(data.strip()) == 0): + raise ValidationError("Empty response received") \ No newline at end of file diff --git a/src/utils/retry.py b/src/utils/retry.py new file mode 100644 index 0000000..361645a --- /dev/null +++ b/src/utils/retry.py @@ -0,0 +1,90 @@ +import time +import random +import requests +from functools import wraps +from ..exceptions import NetworkError, APIError + + +def retry_request(max_retries=3, backoff_factor=1.5, retry_statuses=None, max_backoff=60): + """ + Decorator for retrying requests with exponential backoff and jitter + + Args: + max_retries: Maximum number of retry attempts + backoff_factor: Exponential backoff multiplier + retry_statuses: HTTP status codes that should trigger retries + max_backoff: Maximum backoff time in seconds + """ + if retry_statuses is None: + retry_statuses = {429, 500, 502, 503, 504} + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + last_exception = None + + for attempt in range(max_retries + 1): # +1 to include initial attempt + try: + response = func(*args, **kwargs) + + # Check if we should retry based on status code + if hasattr(response, 'status_code') and response.status_code in retry_statuses: + if attempt >= max_retries: + raise APIError( + f"Server error after {max_retries} retries: HTTP 
{response.status_code}", + status_code=response.status_code, + response_text=getattr(response, 'text', '') + ) + + # Calculate backoff with jitter + backoff_time = min(backoff_factor ** attempt, max_backoff) + jitter = backoff_time * 0.1 * random.random() # Add up to 10% jitter + total_delay = backoff_time + jitter + + time.sleep(total_delay) + continue + + return response + + except requests.exceptions.ConnectTimeout as e: + last_exception = NetworkError(f"Connection timeout: {str(e)}") + except requests.exceptions.ReadTimeout as e: + last_exception = NetworkError(f"Read timeout: {str(e)}") + except requests.exceptions.Timeout as e: + last_exception = NetworkError(f"Request timeout: {str(e)}") + except requests.exceptions.ConnectionError as e: + # Handle DNS resolution, connection refused, etc. + if "Name or service not known" in str(e): + last_exception = NetworkError(f"DNS resolution failed: {str(e)}") + elif "Connection refused" in str(e): + last_exception = NetworkError(f"Connection refused: {str(e)}") + else: + last_exception = NetworkError(f"Connection error: {str(e)}") + except requests.exceptions.SSLError as e: + last_exception = NetworkError(f"SSL/TLS error: {str(e)}") + except requests.exceptions.ProxyError as e: + last_exception = NetworkError(f"Proxy error: {str(e)}") + except requests.exceptions.RequestException as e: + last_exception = NetworkError(f"Network error: {str(e)}") + except Exception as e: + # Catch any other unexpected exceptions + last_exception = NetworkError(f"Unexpected error: {str(e)}") + + # If this was the last attempt, raise the exception + if attempt >= max_retries: + raise last_exception + + # Calculate backoff with jitter for network errors + backoff_time = min(backoff_factor ** attempt, max_backoff) + jitter = backoff_time * 0.1 * random.random() + total_delay = backoff_time + jitter + + time.sleep(total_delay) + + # This should never be reached, but just in case + if last_exception: + raise last_exception + return None + + return wrapper + return decorator \ No newline at end of file diff --git a/src/utils/validation.py b/src/utils/validation.py new file mode 100644 index 0000000..938cb43 --- /dev/null +++ b/src/utils/validation.py @@ -0,0 +1,183 @@ +from urllib.parse import urlparse +from typing import Union, List +from ..exceptions import ValidationError + + +def validate_url(url: str) -> None: + """Validate URL format with comprehensive checks""" + if not isinstance(url, str): + raise ValidationError(f"URL must be a string, got {type(url).__name__}") + + if not url.strip(): + raise ValidationError("URL cannot be empty or whitespace") + + # Check URL length + if len(url) > 8192: # Common URL length limit + raise ValidationError("URL exceeds maximum length of 8192 characters") + + try: + parsed = urlparse(url.strip()) + if not parsed.scheme: + raise ValidationError(f"URL must include a scheme (http/https): {url}") + if parsed.scheme.lower() not in ['http', 'https']: + raise ValidationError(f"URL scheme must be http or https, got: {parsed.scheme}") + if not parsed.netloc: + raise ValidationError(f"URL must include a valid domain: {url}") + # Check for suspicious characters + if any(char in url for char in ['<', '>', '"', "'"]): + raise ValidationError("URL contains invalid characters") + except Exception as e: + if isinstance(e, ValidationError): + raise + raise ValidationError(f"Invalid URL format '{url}': {str(e)}") + + +def validate_zone_name(zone: str = None) -> None: + """Validate zone name format with enhanced checks""" + if zone is None: + 
return # Zone can be None (optional parameter) + + if not isinstance(zone, str): + raise ValidationError(f"Zone name must be a string, got {type(zone).__name__}") + + zone = zone.strip() + if not zone: + raise ValidationError("Zone name cannot be empty or whitespace") + + if len(zone) < 3: + raise ValidationError("Zone name must be at least 3 characters long") + + if len(zone) > 63: + raise ValidationError("Zone name must not exceed 63 characters") + + if not zone.replace('_', '').replace('-', '').isalnum(): + raise ValidationError("Zone name can only contain letters, numbers, hyphens, and underscores") + + if zone.startswith('-') or zone.endswith('-'): + raise ValidationError("Zone name cannot start or end with a hyphen") + + if zone.startswith('_') or zone.endswith('_'): + raise ValidationError("Zone name cannot start or end with an underscore") + + +def validate_country_code(country: str) -> None: + """Validate ISO country code format""" + if not isinstance(country, str): + raise ValidationError(f"Country code must be a string, got {type(country).__name__}") + + country = country.strip().lower() + if len(country) == 0: + return + + if len(country) != 2: + raise ValidationError("Country code must be exactly 2 characters (ISO 3166-1 alpha-2) or empty") + + if not country.isalpha(): + raise ValidationError("Country code must contain only letters") + + +def validate_timeout(timeout: int) -> None: + """Validate timeout value""" + if timeout is None: + return # Timeout can be None (use default) + + if not isinstance(timeout, int): + raise ValidationError(f"Timeout must be an integer, got {type(timeout).__name__}") + + if timeout <= 0: + raise ValidationError("Timeout must be greater than 0 seconds") + + if timeout > 300: # 5 minutes max + raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)") + + +def validate_max_workers(max_workers: int) -> None: + """Validate max_workers parameter""" + if max_workers is None: + return # Can be None (use default) + + if not isinstance(max_workers, int): + raise ValidationError(f"max_workers must be an integer, got {type(max_workers).__name__}") + + if max_workers <= 0: + raise ValidationError("max_workers must be greater than 0") + + if max_workers > 50: # Reasonable upper limit + raise ValidationError("max_workers cannot exceed 50 (to prevent resource exhaustion)") + + +def validate_url_list(urls: List[str], max_urls: int = 100) -> None: + """Validate list of URLs with size limits""" + if not isinstance(urls, list): + raise ValidationError(f"URL list must be a list, got {type(urls).__name__}") + + if len(urls) == 0: + raise ValidationError("URL list cannot be empty") + + if len(urls) > max_urls: + raise ValidationError(f"URL list cannot contain more than {max_urls} URLs") + + for i, url in enumerate(urls): + try: + validate_url(url) + except ValidationError as e: + raise ValidationError(f"Invalid URL at index {i}: {str(e)}") + + +def validate_search_engine(search_engine: str) -> None: + """Validate search engine parameter""" + if not isinstance(search_engine, str): + raise ValidationError(f"Search engine must be a string, got {type(search_engine).__name__}") + + valid_engines = ['google', 'bing', 'yandex'] + search_engine = search_engine.strip().lower() + + if search_engine not in valid_engines: + raise ValidationError(f"Invalid search engine '{search_engine}'. 
Valid options: {', '.join(valid_engines)}") + + +def validate_query(query: Union[str, List[str]]) -> None: + """Validate search query parameter""" + if isinstance(query, str): + if not query.strip(): + raise ValidationError("Search query cannot be empty or whitespace") + if len(query) > 2048: + raise ValidationError("Search query cannot exceed 2048 characters") + elif isinstance(query, list): + if len(query) == 0: + raise ValidationError("Query list cannot be empty") + if len(query) > 50: # Reasonable limit + raise ValidationError("Query list cannot contain more than 50 queries") + for i, q in enumerate(query): + if not isinstance(q, str): + raise ValidationError(f"Query at index {i} must be a string, got {type(q).__name__}") + if not q.strip(): + raise ValidationError(f"Query at index {i} cannot be empty or whitespace") + if len(q) > 2048: + raise ValidationError(f"Query at index {i} cannot exceed 2048 characters") + else: + raise ValidationError(f"Query must be a string or list of strings, got {type(query).__name__}") + + +def validate_response_format(response_format: str) -> None: + """Validate response format parameter""" + if not isinstance(response_format, str): + raise ValidationError(f"Response format must be a string, got {type(response_format).__name__}") + + valid_formats = ['json', 'raw'] + response_format = response_format.strip().lower() + + if response_format not in valid_formats: + raise ValidationError(f"Invalid response format '{response_format}'. Valid options: {', '.join(valid_formats)}") + + +def validate_http_method(method: str) -> None: + """Validate HTTP method parameter""" + if not isinstance(method, str): + raise ValidationError(f"HTTP method must be a string, got {type(method).__name__}") + + valid_methods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH'] + method = method.strip().upper() + + if method not in valid_methods: + raise ValidationError(f"Invalid HTTP method '{method}'. Valid options: {', '.join(valid_methods)}") \ No newline at end of file diff --git a/src/utils/zone_manager.py b/src/utils/zone_manager.py new file mode 100644 index 0000000..82a1205 --- /dev/null +++ b/src/utils/zone_manager.py @@ -0,0 +1,174 @@ +import requests +import json +import logging +import time +from ..exceptions import ZoneError, NetworkError, APIError +from .retry import retry_request + +logger = logging.getLogger(__name__) + + +class ZoneManager: + """Manages Bright Data zones - creation and validation""" + + def __init__(self, session: requests.Session): + self.session = session + + def ensure_required_zones(self, web_unlocker_zone: str, serp_zone: str): + """ + Check if required zones exist and create them if they don't. + Raises exceptions on failure instead of silently continuing. 
+ """ + try: + logger.info("Checking existing zones...") + zones = self._get_zones_with_retry() + zone_names = {zone.get('name') for zone in zones} + logger.info(f"Found {len(zones)} existing zones") + + zones_to_create = [] + if web_unlocker_zone not in zone_names: + zones_to_create.append((web_unlocker_zone, 'unblocker')) + logger.info(f"Need to create web unlocker zone: {web_unlocker_zone}") + + if serp_zone not in zone_names: + zones_to_create.append((serp_zone, 'serp')) + logger.info(f"Need to create SERP zone: {serp_zone}") + + if not zones_to_create: + logger.info("All required zones already exist") + return + + for zone_name, zone_type in zones_to_create: + logger.info(f"Creating zone: {zone_name} (type: {zone_type})") + self._create_zone_with_retry(zone_name, zone_type) + logger.info(f"Successfully created zone: {zone_name}") + + self._verify_zones_created([zone[0] for zone in zones_to_create]) + + except (ZoneError, NetworkError, APIError): + raise + except requests.exceptions.RequestException as e: + logger.error(f"Network error while ensuring zones exist: {e}") + raise NetworkError(f"Failed to ensure zones due to network error: {str(e)}") + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON response while checking zones: {e}") + raise ZoneError(f"Invalid response format from zones API: {str(e)}") + except Exception as e: + logger.error(f"Unexpected error while ensuring zones exist: {e}") + raise ZoneError(f"Unexpected error during zone creation: {str(e)}") + + @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504}) + def _get_zones_with_retry(self): + """Get zones list with retry logic for network issues""" + response = self.session.get('https://api.brightdata.com/zone/get_active_zones') + + if response.status_code == 200: + try: + return response.json() or [] + except json.JSONDecodeError as e: + raise ZoneError(f"Invalid JSON response from zones API: {str(e)}") + elif response.status_code == 401: + raise ZoneError("Unauthorized (401): Check your API token and ensure it has proper permissions") + elif response.status_code == 403: + raise ZoneError("Forbidden (403): API token lacks sufficient permissions for zone operations") + else: + raise ZoneError(f"Failed to list zones ({response.status_code}): {response.text}") + + @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504}) + def _create_zone_with_retry(self, zone_name: str, zone_type: str): + """ + Create a new zone in Bright Data with retry logic + + Args: + zone_name: Name for the new zone + zone_type: Type of zone ('unblocker' or 'serp') + """ + if zone_type == "serp": + plan_config = { + "type": "unblocker", + "serp": True + } + else: + plan_config = { + "type": zone_type + } + + payload = { + "plan": plan_config, + "zone": { + "name": zone_name, + "type": zone_type + } + } + + response = self.session.post( + 'https://api.brightdata.com/zone', + json=payload + ) + + if response.status_code in [200, 201]: + logger.info(f"Zone creation successful: {zone_name}") + return response + elif response.status_code == 409 or "Duplicate zone name" in response.text or "already exists" in response.text.lower(): + logger.info(f"Zone {zone_name} already exists - this is expected") + return response + elif response.status_code == 401: + raise ZoneError(f"Unauthorized (401): API token invalid or lacks permissions to create zone '{zone_name}'") + elif response.status_code == 403: + raise ZoneError(f"Forbidden (403): API token lacks permissions to create 
zone '{zone_name}'. Note: sdk_unlocker and sdk_serp zones should be allowed for all permissions.") + elif response.status_code == 400: + raise ZoneError(f"Bad request (400) creating zone '{zone_name}': {response.text}") + else: + raise ZoneError(f"Failed to create zone '{zone_name}' ({response.status_code}): {response.text}") + + def _verify_zones_created(self, zone_names: list): + """ + Verify that zones were successfully created by checking the zones list + """ + max_attempts = 3 + for attempt in range(max_attempts): + try: + logger.info(f"Verifying zone creation (attempt {attempt + 1}/{max_attempts})") + time.sleep(1) + + zones = self._get_zones_with_retry() + existing_zone_names = {zone.get('name') for zone in zones} + + missing_zones = [name for name in zone_names if name not in existing_zone_names] + + if not missing_zones: + logger.info("All zones verified successfully") + return + + if attempt == max_attempts - 1: + raise ZoneError(f"Zone verification failed: zones {missing_zones} not found after creation") + + logger.warning(f"Zones not yet visible: {missing_zones}. Retrying verification...") + + except (ZoneError, NetworkError): + if attempt == max_attempts - 1: + raise + logger.warning(f"Zone verification attempt {attempt + 1} failed, retrying...") + time.sleep(2 ** attempt) + + def _create_zone(self, zone_name: str, zone_type: str): + """ + Legacy method - kept for backward compatibility + Use _create_zone_with_retry instead for new code + """ + return self._create_zone_with_retry(zone_name, zone_type) + + def list_zones(self): + """ + List all active zones in your Bright Data account + + Returns: + List of zone dictionaries with their configurations + """ + try: + return self._get_zones_with_retry() + except (ZoneError, NetworkError): + raise + except Exception as e: + logger.error(f"Unexpected error listing zones: {e}") + raise ZoneError(f"Unexpected error while listing zones: {str(e)}") \ No newline at end of file From cf238011d4bc65db5f8aa7461d7b9ffa14b7f6a1 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:38:01 +0200 Subject: [PATCH 22/70] Create tst --- src/schemas/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/schemas/tst diff --git a/src/schemas/tst b/src/schemas/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/schemas/tst @@ -0,0 +1 @@ + From 50f548aafb7e4fd1e5618a906ab7930dfb8d9d7f Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:38:18 +0200 Subject: [PATCH 23/70] Create tst --- src/types/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/types/tst diff --git a/src/types/tst b/src/types/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/types/tst @@ -0,0 +1 @@ + From a714d3b55374080ab908a754c056009e9dc1fe91 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:40:44 +0200 Subject: [PATCH 24/70] Add files via upload --- README.md | 409 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..04fa2cc --- /dev/null +++ b/README.md @@ -0,0 +1,409 @@ + +sdk-banner(1) + +

Python SDK by Bright Data, Easy-to-use scalable methods for web search & scraping

+

+ +## Installation +To install the package, open your terminal: + +```python +pip install brightdata-sdk +``` +> If using macOS, first open a virtual environment for your project + +## Quick Start + +Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key + +### Initialize the Client + +```python +from brightdata import bdclient + +client = bdclient(api_token="your_api_token_here") # can also be defined as BRIGHTDATA_API_TOKEN in your .env file +``` + +### Launch first request +Add to your code a serp function +```python +results = client.search("best selling shoes") + +print(client.parse_content(results)) +``` + +final-banner + +## Features + +| Feature | Functions | Description +|--------------------------|-----------------------------|------------------------------------- +| **Scrape every website** | `scrape` | Scrape every website using Bright's scraping and unti bot-detection capabilities +| **Web search** | `search` | Search google and other search engines by query (supports batch searches) +| **Web crawling** | `crawl` | Discover and scrape multiple pages from websites with advanced filtering and depth control +| **AI-powered extraction** | `extract` | Extract specific information from websites using natural language queries and OpenAI +| **Content parsing** | `parse_content` | Extract text, links, images and structured data from API responses (JSON or HTML) +| **Browser automation** | `connect_browser` | Get WebSocket endpoint for Playwright/Selenium integration with Bright Data's scraping browser +| **Search chatGPT** | `search_chatGPT` | Prompt chatGPT and scrape its answers, support multiple inputs and follow-up prompts +| **Search linkedin** | `search_linkedin.posts()`, `search_linkedin.jobs()`, `search_linkedin.profiles()` | Search LinkedIn by specific queries, and recieve structured data +| **Scrape linkedin** | `scrape_linkedin.posts()`, `scrape_linkedin.jobs()`, `scrape_linkedin.profiles()`, `scrape_linkedin.companies()` | Scrape LinkedIn and recieve structured data +| **Download functions** | `download_snapshot`, `download_content` | Download content for both sync and async requests +| **Client class** | `bdclient` | Handles authentication, automatic zone creation and managment, and options for robust error handling +| **Parallel processing** | **all functions** | All functions use Concurrent processing for multiple URLs or queries, and support multiple Output Formats + +### Try usig one of the functions + +#### `Search()` +```python +# Simple single query search +result = client.search("pizza restaurants") + +# Try using multiple queries (parallel processing), with custom configuration +queries = ["pizza", "restaurants", "delivery"] +results = client.search( + queries, + search_engine="bing", + country="gb", + format="raw" +) +``` +#### `scrape()` +```python +# Simple single URL scrape +result = client.scrape("https://example.com") + +# Multiple URLs (parallel processing) with custom options +urls = ["https://example1.com", "https://example2.com", "https://example3.com"] +results = client.scrape( + "urls", + format="raw", + country="gb", + data_format="screenshot" +) +``` +#### `search_chatGPT()` +```python +result = client.search_chatGPT( + prompt="what day is it today?" 
+ # prompt=["What are the top 3 programming languages in 2024?", "Best hotels in New York", "Explain quantum computing"], + # additional_prompt=["Can you explain why?", "Are you sure?", ""] +) + +client.download_content(result) # In case of timeout error, your snapshot_id is presented and you will downloaded it using download_snapshot() +``` + +#### `search_linkedin.` +Available functions: +client.**`search_linkedin.posts()`**,client.**`search_linkedin.jobs()`**,client.**`search_linkedin.profiles()`** +```python +# Search LinkedIn profiles by name +first_names = ["James", "Idan"] +last_names = ["Smith", "Vilenski"] + +result = client.search_linkedin.profiles(first_names, last_names) # can also be changed to async +# will print the snapshot_id, which can be downloaded using the download_snapshot() function +``` + +#### `scrape_linkedin.` +Available functions + +client.**`scrape_linkedin.posts()`**,client.**`scrape_linkedin.jobs()`**,client.**`scrape_linkedin.profiles()`**,client.**`scrape_linkedin.companies()`** +```python +post_urls = [ + "https://www.linkedin.com/posts/orlenchner_scrapecon-activity-7180537307521769472-oSYN?trk=public_profile", + "https://www.linkedin.com/pulse/getting-value-out-sunburst-guillaume-de-b%C3%A9naz%C3%A9?trk=public_profile_article_view" +] + +results = client.scrape_linkedin.posts(post_urls) # can also be changed to async + +print(results) # will print the snapshot_id, which can be downloaded using the download_snapshot() function +``` + +#### `crawl()` +```python +# Single URL crawl with filters +result = client.crawl( + url="https://example.com/", + depth=2, + filter="/product/", # Only crawl URLs containing "/product/" + exclude_filter="/ads/", # Exclude URLs containing "/ads/" + custom_output_fields=["markdown", "url", "page_title"] +) +print(f"Crawl initiated. 
Snapshot ID: {result['snapshot_id']}") + +# Download crawl results +data = client.download_snapshot(result['snapshot_id']) +``` + +#### `parse_content()` +```python +# Parse scraping results +scraped_data = client.scrape("https://example.com") +parsed = client.parse_content( + scraped_data, + extract_text=True, + extract_links=True, + extract_images=True +) +print(f"Title: {parsed['title']}") +print(f"Text length: {len(parsed['text'])}") +print(f"Found {len(parsed['links'])} links") +``` + +#### `extract()` +```python +# Basic extraction (URL in query) +result = client.extract("Extract news headlines from CNN.com") +print(result) + +# Using URL parameter with structured output +schema = { + "type": "object", + "properties": { + "headlines": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["headlines"] +} + +result = client.extract( + query="Extract main headlines", + url="https://cnn.com", + output_scheme=schema +) +print(result) # Returns structured JSON matching the schema +``` + +#### `connect_browser()` +```python +# For Playwright (default browser_type) +from playwright.sync_api import sync_playwright + +client = bdclient( + api_token="your_api_token", + browser_username="username-zone-browser_zone1", + browser_password="your_password" +) + +with sync_playwright() as playwright: + browser = playwright.chromium.connect_over_cdp(client.connect_browser()) + page = browser.new_page() + page.goto("https://example.com") + print(f"Title: {page.title()}") + browser.close() +``` + +**`download_content`** (for sync requests) +```python +data = client.scrape("https://example.com") +client.download_content(data) +``` +**`download_snapshot`** (for async requests) +```python +# Save this function to seperate file +client.download_snapshot("") # Insert your snapshot_id +``` + +> [!TIP] +> Hover over the "search" or each function in the package, to see all its available parameters. + +![Hover-Over1](https://github.com/user-attachments/assets/51324485-5769-48d5-8f13-0b534385142e) + +## Function Parameters +
+ 🔍 Search(...) + +Searches using the SERP API. Accepts the same arguments as scrape(), plus: + +```python +- `query`: Search query string or list of queries +- `search_engine`: "google", "bing", or "yandex" +- Other parameters same as scrape() +``` + +
+
+ 🔗 scrape(...) + +Scrapes a single URL or list of URLs using the Web Unlocker. + +```python +- `url`: Single URL string or list of URLs +- `zone`: Zone identifier (auto-configured if None) +- `format`: "json" or "raw" +- `method`: HTTP method +- `country`: Two-letter country code +- `data_format`: "markdown", "screenshot", etc. +- `async_request`: Enable async processing +- `max_workers`: Max parallel workers (default: 10) +- `timeout`: Request timeout in seconds (default: 30) +``` + +
+
+ 🕷️ crawl(...) + +Discover and scrape multiple pages from websites with advanced filtering. + +```python +- `url`: Single URL string or list of URLs to crawl (required) +- `ignore_sitemap`: Ignore sitemap when crawling (optional) +- `depth`: Maximum crawl depth relative to entered URL (optional) +- `filter`: Regex to include only certain URLs (e.g. "/product/") +- `exclude_filter`: Regex to exclude certain URLs (e.g. "/ads/") +- `custom_output_fields`: List of output fields to include (optional) +- `include_errors`: Include errors in response (default: True) +``` + +
+
+ 🔍 parse_content(...) + +Extract and parse useful information from API responses. + +```python +- `data`: Response data from scrape(), search(), or crawl() methods +- `extract_text`: Extract clean text content (default: True) +- `extract_links`: Extract all links from content (default: False) +- `extract_images`: Extract image URLs from content (default: False) +``` + +
+
+ 🤖 extract(...) + +Extract specific information from websites using AI-powered natural language processing with OpenAI. + +```python +- `query`: Natural language query describing what to extract (required) +- `url`: Single URL or list of URLs to extract from (optional - if not provided, extracts URL from query) +- `output_scheme`: JSON Schema for OpenAI Structured Outputs (optional - enables reliable JSON responses) +- `llm_key`: OpenAI API key (optional - uses OPENAI_API_KEY env variable if not provided) + +# Returns: ExtractResult object (string-like with metadata attributes) +# Available attributes: .url, .query, .source_title, .token_usage, .content_length +``` + +
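For instance, the metadata attributes listed above can be read straight off the returned object. A minimal sketch (the query and URL are illustrative, and it assumes an OpenAI key is configured as described in the Environment Variables section):

```python
result = client.extract(
    query="Extract the main headline",
    url="https://example.com"
)

print(result)                 # string-like: the extracted content itself
print(result.source_title)    # metadata attribute: title of the source page
print(result.token_usage)     # metadata attribute: LLM token usage for the call
```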
+
+ 🌐 connect_browser(...) + +Get WebSocket endpoint for browser automation with Bright Data's scraping browser. + +```python +# Required client parameters: +- `browser_username`: Username for browser API (format: "username-zone-{zone_name}") +- `browser_password`: Password for browser API authentication +- `browser_type`: "playwright", "puppeteer", or "selenium" (default: "playwright") + +# Returns: WebSocket endpoint URL string +``` + +
+
+ 💾 download_content(...) + +Save content to a local file. + +```python +- `content`: Content to save +- `filename`: Output filename (auto-generated if None) +- `format`: File format ("json", "csv", "txt", etc.) +``` + +
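As a small usage sketch, the same parameters let you choose the output file name and format explicitly (the filename below is arbitrary):

```python
data = client.scrape("https://example.com")

# Explicit filename and format instead of the auto-generated defaults
client.download_content(data, filename="example_scrape.json", format="json")
```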
+
+ ⚙️ Configuration Constants + +

+ +| Constant | Default | Description | +| ---------------------- | ------- | ------------------------------- | +| `DEFAULT_MAX_WORKERS` | `10` | Max parallel tasks | +| `DEFAULT_TIMEOUT` | `30` | Request timeout (in seconds) | +| `CONNECTION_POOL_SIZE` | `20` | Max concurrent HTTP connections | +| `MAX_RETRIES` | `3` | Retry attempts on failure | +| `RETRY_BACKOFF_FACTOR` | `1.5` | Exponential backoff multiplier | + +
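These constants are client-wide defaults; per-call parameters such as `max_workers` and `timeout` (see `scrape(...)` above) override them for an individual request. A short sketch:

```python
client = bdclient(api_token="your_api_token")

# Override the client-wide defaults for this call only
results = client.scrape(
    ["https://example1.com", "https://example2.com"],
    max_workers=5,  # instead of DEFAULT_MAX_WORKERS (10)
    timeout=60      # instead of DEFAULT_TIMEOUT (30 seconds)
)
```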
+ +## Advanced Configuration + +
+ 🔧 Environment Variables + +Create a `.env` file in your project root: + +```env +BRIGHTDATA_API_TOKEN=your_bright_data_api_token +WEB_UNLOCKER_ZONE=your_web_unlocker_zone # Optional +SERP_ZONE=your_serp_zone # Optional +BROWSER_ZONE=your_browser_zone # Optional +BRIGHTDATA_BROWSER_USERNAME=username-zone-name # For browser automation +BRIGHTDATA_BROWSER_PASSWORD=your_browser_password # For browser automation +OPENAI_API_KEY=your_openai_api_key # For extract() function +``` + +
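With this `.env` file in place, the client can be created without passing the token explicitly, since the SDK loads environment variables on initialization (a minimal sketch):

```python
from brightdata import bdclient

# BRIGHTDATA_API_TOKEN is read from the environment / .env file
client = bdclient()

results = client.search("best selling shoes")
print(client.parse_content(results))
```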
+
+ 🌐 Manage Zones + +List all active zones + +```python +# List all active zones +zones = client.list_zones() +print(f"Found {len(zones)} zones") +``` + +Configure custom zone names + +```python +client = bdclient( + api_token="your_token", + auto_create_zones=False, # disable automatic zone creation (it is enabled by default) + web_unlocker_zone="custom_zone", + serp_zone="custom_serp_zone" +) + +``` + +
+
+ 👥 Client Management + +bdclient Class - Complete parameter list + +```python +bdclient( + api_token: str = None, # Your Bright Data API token (required) + auto_create_zones: bool = True, # Auto-create zones if they don't exist + web_unlocker_zone: str = None, # Custom web unlocker zone name + serp_zone: str = None, # Custom SERP zone name + browser_zone: str = None, # Custom browser zone name + browser_username: str = None, # Browser API username (format: "username-zone-{zone_name}") + browser_password: str = None, # Browser API password + browser_type: str = "playwright", # Browser automation tool: "playwright", "puppeteer", "selenium" + log_level: str = "INFO", # Logging level: "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL" + structured_logging: bool = True, # Use structured JSON logging + verbose: bool = None # Enable verbose logging (overrides log_level if True) +) +``` + +
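For example, the logging-related parameters can be combined when debugging (a sketch that uses only the parameters listed above):

```python
client = bdclient(
    api_token="your_api_token",
    log_level="DEBUG",         # show detailed SDK logs
    structured_logging=False   # disable structured JSON logging
)
```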
+
+ ⚠️ Error Handling + +The SDK includes built-in input validation and retry logic. + +In case of zone-related problems, use the **list_zones()** function to check your active zones, and check your [**account settings**](https://brightdata.com/cp/setting/users) to verify that your API key has **"admin permissions"**. + +
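A minimal error-handling sketch, assuming the exception classes are importable from the SDK's `exceptions` module (adjust the import path to match your installation if needed):

```python
from brightdata import bdclient
from brightdata.exceptions import ValidationError, APIError

client = bdclient(api_token="your_api_token")

try:
    result = client.scrape("https://example.com")
except ValidationError as e:
    print(f"Invalid input: {e}")    # e.g. an empty or malformed URL
except APIError as e:
    print(f"Request failed: {e}")   # the API call did not succeed
```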
+ +## Support + +For any issues, contact [Bright Data support](https://brightdata.com/contact), or open an issue in this repository. From c89460c6d9d64b2d652bd9e59b3e8470a2833dc5 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 11:43:29 +0200 Subject: [PATCH 25/70] Delete examples/tst --- examples/tst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 examples/tst diff --git a/examples/tst b/examples/tst deleted file mode 100644 index 8b13789..0000000 --- a/examples/tst +++ /dev/null @@ -1 +0,0 @@ - From 1feb6a75b0247da7e653a4c9732d151f5c3c2ec6 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:09:20 +0200 Subject: [PATCH 26/70] Delete src/schemas/tst --- src/schemas/tst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 src/schemas/tst diff --git a/src/schemas/tst b/src/schemas/tst deleted file mode 100644 index 8b13789..0000000 --- a/src/schemas/tst +++ /dev/null @@ -1 +0,0 @@ - From 270c3de201e4b53adc3aec67b97c12f84ee507f3 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:10:08 +0200 Subject: [PATCH 27/70] Create tst --- src/schemas/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/schemas/tst diff --git a/src/schemas/tst b/src/schemas/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/schemas/tst @@ -0,0 +1 @@ + From e60e6e5951884bc8661848ff1ef111d02a2cbde8 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:38:38 +0200 Subject: [PATCH 28/70] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 04fa2cc..b1c8b96 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ print(results) # will print the snapshot_id, which can be downloaded using the d result = client.crawl( url="https://example.com/", depth=2, - filter="/product/", # Only crawl URLs containing "/product/" + include_filter="/product/", # Only crawl URLs containing "/product/" exclude_filter="/ads/", # Exclude URLs containing "/ads/" custom_output_fields=["markdown", "url", "page_title"] ) From 5e39aed6ec8aa39924befabf534d37fda94c4a94 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:41:47 +0200 Subject: [PATCH 29/70] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b1c8b96..a9a71bb 100644 --- a/README.md +++ b/README.md @@ -251,7 +251,7 @@ Discover and scrape multiple pages from websites with advanced filtering. - `url`: Single URL string or list of URLs to crawl (required) - `ignore_sitemap`: Ignore sitemap when crawling (optional) - `depth`: Maximum crawl depth relative to entered URL (optional) -- `filter`: Regex to include only certain URLs (e.g. "/product/") +- `include_filter`: Regex to include only certain URLs (e.g. "/product/") - `exclude_filter`: Regex to exclude certain URLs (e.g. 
"/ads/") - `custom_output_fields`: List of output fields to include (optional) - `include_errors`: Include errors in response (default: True) From 51ca070ec1da6d0f5ed3b273bec45b151643c906 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:49:01 +0200 Subject: [PATCH 30/70] Update client.py --- src/client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/client.py b/src/client.py index b148792..f5fff73 100644 --- a/src/client.py +++ b/src/client.py @@ -648,7 +648,7 @@ def crawl( url: Union[str, List[str]], ignore_sitemap: bool = None, depth: int = None, - filter: str = None, + include_filter: str = None, exclude_filter: str = None, custom_output_fields: List[str] = None, include_errors: bool = True @@ -663,7 +663,7 @@ def crawl( - `url` (str | List[str]): Domain URL(s) to crawl (required) - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling - `depth` (int, optional): Maximum depth to crawl relative to the entered URL - - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/") + - `include_filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/") - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/") - `custom_output_fields` (List[str], optional): Custom output schema fields to include - `include_errors` (bool, optional): Include errors in response (default: True) @@ -681,7 +681,7 @@ def crawl( urls = ["https://example.com/", "https://example2.com/"] result = client.crawl( url=urls, - filter="/product/", + include_filter="/product/", exclude_filter="/ads/", depth=2, ignore_sitemap=True @@ -721,7 +721,7 @@ def crawl( url=url, ignore_sitemap=ignore_sitemap, depth=depth, - filter=filter, + include_filter=include_filter, exclude_filter=exclude_filter, custom_output_fields=custom_output_fields, include_errors=include_errors @@ -894,4 +894,4 @@ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: - `ValidationError`: Invalid query format, missing URL, or invalid LLM key - `APIError`: Web scraping failed or LLM processing error """ - return self.extract_api.extract(query, url, output_scheme, llm_key) \ No newline at end of file + return self.extract_api.extract(query, url, output_scheme, llm_key) From 89f7a66c80ef3ca99b465373cfa422e442109dc4 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:09:47 +0200 Subject: [PATCH 31/70] Update client.py --- src/client.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/client.py b/src/client.py index f5fff73..d2a27dd 100644 --- a/src/client.py +++ b/src/client.py @@ -247,6 +247,21 @@ def scrape( - `AuthenticationError`: Invalid API token or insufficient permissions - `APIError`: Request failed or server error """ + + # URL validation + + if not url: + raise ValueError("The 'url' parameter cannot be None or empty.") + + if isinstance(url, str): + if not url.strip(): + raise ValueError("The 'url' string cannot be empty or whitespace.") + elif isinstance(url, list): + if not all(isinstance(u, str) and u.strip() for u in url): + raise ValueError("All URLs in the list must be non-empty strings.") + else: + raise TypeError("The 'url' parameter must be a string or a list of strings.") + zone = zone or self.web_unlocker_zone max_workers = max_workers or self.DEFAULT_MAX_WORKERS From 93b7e653e95b014c17448379cc75cfd18fde2168 Mon 
Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:13:00 +0200 Subject: [PATCH 32/70] Update client.py Add query validation in search() --- src/client.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/client.py b/src/client.py index d2a27dd..6c2fdf7 100644 --- a/src/client.py +++ b/src/client.py @@ -335,6 +335,20 @@ def search( - `AuthenticationError`: Invalid API token or insufficient permissions - `APIError`: Request failed or server error """ + + # Query validation + if not query: + raise ValueError("The 'query' parameter cannot be None or empty.") + + if isinstance(query, str): + if not query.strip(): + raise ValueError("The 'query' string cannot be empty or whitespace.") + elif isinstance(query, list): + if not all(isinstance(q, str) and q.strip() for q in query): + raise ValueError("All queries in the list must be non-empty strings.") + else: + raise TypeError("The 'query' parameter must be a string or a list of strings.") + zone = zone or self.serp_zone max_workers = max_workers or self.DEFAULT_MAX_WORKERS From e06a3c8bb49c3bfb7529d3a897c5fda92963b693 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:16:45 +0200 Subject: [PATCH 33/70] Update client.py Add URL validation in crawl() --- src/client.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/client.py b/src/client.py index 6c2fdf7..2f614f4 100644 --- a/src/client.py +++ b/src/client.py @@ -746,6 +746,21 @@ def crawl( - `AuthenticationError`: Invalid API token or insufficient permissions - `APIError`: Request failed or server error """ + + # URL validation + + if not url: + raise ValueError("The 'url' parameter cannot be None or empty.") + + if isinstance(url, str): + if not url.strip(): + raise ValueError("The 'url' string cannot be empty or whitespace.") + elif isinstance(url, list): + if not all(isinstance(u, str) and u.strip() for u in url): + raise ValueError("All URLs in the list must be non-empty strings.") + else: + raise TypeError("The 'url' parameter must be a string or a list of strings.") + return self.crawl_api.crawl( url=url, ignore_sitemap=ignore_sitemap, From c240a966576657a18c7dc23878aecb6a01394d04 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:40:47 +0200 Subject: [PATCH 34/70] Update client.py Add depth parameter validation in crawl() --- src/client.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/client.py b/src/client.py index 2f614f4..ca0d857 100644 --- a/src/client.py +++ b/src/client.py @@ -760,7 +760,15 @@ def crawl( raise ValueError("All URLs in the list must be non-empty strings.") else: raise TypeError("The 'url' parameter must be a string or a list of strings.") - + + # Depth validation + + if depth is not None: + if not isinstance(depth, int): + raise TypeError("The 'depth' parameter must be an integer.") + if depth <= 0: + raise ValueError("The 'depth' parameter must be a positive integer.") + return self.crawl_api.crawl( url=url, ignore_sitemap=ignore_sitemap, From 816985a00c2900f0c6894c1183d64fd349e28451 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:05:31 +0200 Subject: [PATCH 35/70] Update client.py Add snapshot_id format validation in download_snapshot() --- src/client.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git 
a/src/client.py b/src/client.py index ca0d857..c5e40b3 100644 --- a/src/client.py +++ b/src/client.py @@ -596,6 +596,20 @@ def download_snapshot( - `AuthenticationError`: Invalid API token or insufficient permissions - `APIError`: Request failed, snapshot not found, or server error """ + + # snapshot_id validation + + if not snapshot_id or not isinstance(snapshot_id, str): + raise ValueError("The 'snapshot_id' parameter must be a non-empty string.") + + # format validation + + allowed_formats = {"json", "ndjson", "jsonl", "csv"} + if format not in allowed_formats: + raise ValueError( + f"Invalid 'format' value: '{format}'. Must be one of {sorted(allowed_formats)}." + ) + return self.download_api.download_snapshot(snapshot_id, format, compress, batch_size, part) From fb6ec7cdebe83bce3bb320cdf38daf368176b279 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:13:56 +0200 Subject: [PATCH 36/70] Update client.py Add security warning in connect_browser() docstring --- src/client.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/client.py b/src/client.py index c5e40b3..057fc1d 100644 --- a/src/client.py +++ b/src/client.py @@ -628,7 +628,11 @@ def connect_browser(self) -> str: Returns the WebSocket endpoint URL that can be used with Playwright or Selenium to connect to Bright Data's scraping browser service. - + + ** Security Warning:** The returned URL includes your browser credentials in plain text. + Avoid logging, sharing, or exposing this URL in publicly accessible places. Treat it as sensitive information. + + ### Returns: WebSocket endpoint URL string for browser connection From 47eb72a0fb09b6970119caf5fe24a02ece785eff Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:19:50 +0200 Subject: [PATCH 37/70] Update client.py Better error message for missing LLM key in extract() --- src/client.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/client.py b/src/client.py index 057fc1d..47602de 100644 --- a/src/client.py +++ b/src/client.py @@ -857,7 +857,10 @@ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Combines web scraping with OpenAI's language models to extract targeted information from web pages based on natural language queries. Automatically parses URLs and optimizes content for efficient LLM processing. - + + ** LLM Key Notice:** If `llm_key` is not provided, the method will attempt to read + the OpenAI API key from the `OPENAI_API_KEY` environment variable. Ensure it is set. + ### Parameters: - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, this becomes the pure extraction query. If `url` is not provided, this should include @@ -964,4 +967,16 @@ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: - `ValidationError`: Invalid query format, missing URL, or invalid LLM key - `APIError`: Web scraping failed or LLM processing error """ + + # Validate LLM key + if llm_key is None: + import os + llm_key = os.getenv("OPENAI_API_KEY") + if not llm_key: + raise ValidationError( + "Missing OpenAI API key. Provide it via the `llm_key` parameter or set the " + "`OPENAI_API_KEY` environment variable. 
Example:\n\n" + "export OPENAI_API_KEY='your-openai-api-key'" + ) + return self.extract_api.extract(query, url, output_scheme, llm_key) From c9387236d5367227e27654aa3c93df47e2c8dd68 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:23:34 +0200 Subject: [PATCH 38/70] Update client.py change the constant DEFAULT_TIMEOUT to 30 --- src/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client.py b/src/client.py index 47602de..2bab9d4 100644 --- a/src/client.py +++ b/src/client.py @@ -35,7 +35,7 @@ class bdclient: """Main client for the Bright Data SDK""" DEFAULT_MAX_WORKERS = 10 - DEFAULT_TIMEOUT = 65 + DEFAULT_TIMEOUT = 30 CONNECTION_POOL_SIZE = 20 MAX_RETRIES = 3 RETRY_BACKOFF_FACTOR = 1.5 From b480df35772a51584818a149f2286f3ff8ca7dab Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:27:26 +0200 Subject: [PATCH 39/70] Update client.py Add clear validation with helpful error message pointing to env variable --- src/client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/client.py b/src/client.py index 2bab9d4..6af6a2d 100644 --- a/src/client.py +++ b/src/client.py @@ -859,7 +859,7 @@ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: optimizes content for efficient LLM processing. ** LLM Key Notice:** If `llm_key` is not provided, the method will attempt to read - the OpenAI API key from the `OPENAI_API_KEY` environment variable. Ensure it is set. + the BRIGHTDATA API key from the `BRIGHTDATA_API_TOKEN` environment variable. Ensure it is set. ### Parameters: - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, @@ -971,12 +971,12 @@ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: # Validate LLM key if llm_key is None: import os - llm_key = os.getenv("OPENAI_API_KEY") + llm_key = os.getenv("BRIGHTDATA_API_TOKEN") if not llm_key: raise ValidationError( - "Missing OpenAI API key. Provide it via the `llm_key` parameter or set the " - "`OPENAI_API_KEY` environment variable. Example:\n\n" - "export OPENAI_API_KEY='your-openai-api-key'" + "Missing API key. Provide it via the `llm_key` parameter or set the " + "`BRIGHTDATA_API_TOKEN` environment variable. 
Example:\n\n" + "export BRIGHTDATA_API_TOKEN='your-openai-api-key'" ) return self.extract_api.extract(query, url, output_scheme, llm_key) From 10632e08d654062367d22dbf589548b1655e9fb3 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Tue, 28 Oct 2025 14:32:39 +0200 Subject: [PATCH 40/70] Update README.md Change client.download_snapshot("") to have placeholder comment --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a9a71bb..99f56bb 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,9 @@ client.download_content(data) **`download_snapshot`** (for async requests) ```python # Save this function to seperate file -client.download_snapshot("") # Insert your snapshot_id +# Download snapshot +snapshot_id = "your_snapshot_id_here" # <-- Replace with your actual snapshot ID +client.download_snapshot(snapshot_id) # Insert your snapshot_id ``` > [!TIP] From fcafb4185bb5bbc9854c2b694f1a6053d246c9f5 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:45:43 +0200 Subject: [PATCH 41/70] Create tst --- src/exceptions/tst | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/exceptions/tst diff --git a/src/exceptions/tst b/src/exceptions/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/exceptions/tst @@ -0,0 +1 @@ + From cf040c67f9e74a868f94ca632223c0fb37797699 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:46:51 +0200 Subject: [PATCH 42/70] Add files via upload --- src/exceptions/__init__.py | 17 +++++++++++++++++ src/exceptions/errors.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 src/exceptions/__init__.py create mode 100644 src/exceptions/errors.py diff --git a/src/exceptions/__init__.py b/src/exceptions/__init__.py new file mode 100644 index 0000000..6554555 --- /dev/null +++ b/src/exceptions/__init__.py @@ -0,0 +1,17 @@ +from .errors import ( + BrightDataError, + ValidationError, + AuthenticationError, + ZoneError, + NetworkError, + APIError +) + +__all__ = [ + 'BrightDataError', + 'ValidationError', + 'AuthenticationError', + 'ZoneError', + 'NetworkError', + 'APIError' +] \ No newline at end of file diff --git a/src/exceptions/errors.py b/src/exceptions/errors.py new file mode 100644 index 0000000..1cf4425 --- /dev/null +++ b/src/exceptions/errors.py @@ -0,0 +1,31 @@ +class BrightDataError(Exception): + """Base exception for all Bright Data SDK errors""" + pass + + +class ValidationError(BrightDataError): + """Raised when input validation fails""" + pass + + +class AuthenticationError(BrightDataError): + """Raised when API authentication fails""" + pass + + +class ZoneError(BrightDataError): + """Raised when zone operations fail""" + pass + + +class NetworkError(BrightDataError): + """Raised when network operations fail""" + pass + + +class APIError(BrightDataError): + """Raised when API requests fail""" + def __init__(self, message, status_code=None, response_text=None): + super().__init__(message) + self.status_code = status_code + self.response_text = response_text \ No newline at end of file From 50969f5c817a461ca13af95a3ce1e3fb415991ac Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:47:07 +0200 Subject: [PATCH 43/70] Delete src/exceptions/tst --- src/exceptions/tst | 1 - 1 file changed, 1 deletion(-) 
delete mode 100644 src/exceptions/tst diff --git a/src/exceptions/tst b/src/exceptions/tst deleted file mode 100644 index 8b13789..0000000 --- a/src/exceptions/tst +++ /dev/null @@ -1 +0,0 @@ - From a965fa8486aedb7a1fa1fc072d10cc38a42951f5 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:48:42 +0200 Subject: [PATCH 44/70] Add files via upload --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1a22bad --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Bright Data + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file From 9d368a16352d369e18bc67bef17d740905dfa766 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:49:39 +0200 Subject: [PATCH 45/70] Add files via upload --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..625eed3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +requests>=2.25.0 +python-dotenv>=0.19.0 +aiohttp>=3.8.0 +beautifulsoup4>=4.9.0 +openai>=1.0.0 \ No newline at end of file From 99f3b79d9a1438c168e5f6cb23347faf2245115a Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 11:10:26 +0200 Subject: [PATCH 46/70] Update client.py update and improve client.search_GPT --- src/client.py | 78 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/src/client.py b/src/client.py index 6af6a2d..0933ade 100644 --- a/src/client.py +++ b/src/client.py @@ -376,11 +376,13 @@ def download_content(self, content: Union[Dict, str], filename: str = None, form def search_chatGPT( self, prompt: Union[str, List[str]], - country: Union[str, List[str]] = "", - additional_prompt: Union[str, List[str]] = "", + country: Union[str, List[str]] = None, + secondary_prompt: Union[str, List[str]] = None, web_search: Union[bool, List[bool]] = False, - sync: bool = True + sync: bool = True, + timeout: int = None ) -> Dict[str, Any]: + """ ## Search ChatGPT responses using Bright Data's ChatGPT dataset API @@ -424,48 +426,62 @@ def search_chatGPT( """ if isinstance(prompt, str): prompts = [prompt] - else: + elif isinstance(prompt, list) and all(isinstance(p, str) for p in prompt): 
prompts = prompt - - if not prompts or len(prompts) == 0: - raise ValidationError("At least one prompt is required") + else: + raise ValidationError("Invalid prompt input: must be a non-empty string or list of strings.") + + if not prompts: + raise ValidationError("At least one prompt is required.") for p in prompts: if not p or not isinstance(p, str): raise ValidationError("All prompts must be non-empty strings") - def normalize_param(param, param_name): + def normalize_param(param, name): + if param is None: + return [None] * len(prompts) if isinstance(param, list): if len(param) != len(prompts): - raise ValidationError(f"{param_name} list must have same length as prompts list") + raise ValidationError(f"{name} list must have the same length as prompts.") return param - else: - return [param] * len(prompts) + return [param] * len(prompts) countries = normalize_param(country, "country") - additional_prompts = normalize_param(additional_prompt, "additional_prompt") + secondary_prompts = normalize_param(secondary_prompt, "secondary_prompt") web_searches = normalize_param(web_search, "web_search") for c in countries: - if not isinstance(c, str): - raise ValidationError("All countries must be strings") - - for ap in additional_prompts: - if not isinstance(ap, str): - raise ValidationError("All additional_prompts must be strings") - - for ws in web_searches: - if not isinstance(ws, bool): - raise ValidationError("All web_search values must be booleans") - - return self.chatgpt_api.scrape_chatgpt( - prompts, - countries, - additional_prompts, - web_searches, - sync, - self.DEFAULT_TIMEOUT - ) + if c and not re.match(r"^[A-Z]{2}$", c): + raise ValidationError(f"Invalid country code '{c}'. Must be 2 uppercase letters.") + for s in secondary_prompts: + if s is not None and not isinstance(s, str): + raise ValidationError("Secondary prompts must be strings.") + for w in web_searches: + if not isinstance(w, bool): + raise ValidationError("Web search flags must be boolean.") + if timeout is not None and (not isinstance(timeout, int) or timeout <= 0): + raise ValidationError("Timeout must be a positive integer.") + + timeout = timeout or (65 if sync else 30) + + # Retry logic + max_retries = 3 + for attempt in range(max_retries): + try: + return self.chatgpt_api.scrape_chatgpt( + prompts=prompts, + countries=countries, + secondary_prompts=secondary_prompts, + web_searches=web_searches, + sync=sync, + timeout=timeout + ) + except APIError as e: + if attempt < max_retries - 1: + time.sleep(2) + continue + raise e @property def scrape_linkedin(self): From 58b6d3ed880c42d43e2f1e3c866d18e1b7df7be1 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 11:11:40 +0200 Subject: [PATCH 47/70] Update README.md update search_gpt example --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 99f56bb..7010efe 100644 --- a/README.md +++ b/README.md @@ -83,10 +83,10 @@ results = client.scrape( ``` #### `search_chatGPT()` ```python -result = client.search_chatGPT( - prompt="what day is it today?" 
- # prompt=["What are the top 3 programming languages in 2024?", "Best hotels in New York", "Explain quantum computing"], - # additional_prompt=["Can you explain why?", "Are you sure?", ""] +result = client.search.chatGPT( + prompt="Top startups in Tel Aviv", + country="IL", + web_search=True ) client.download_content(result) # In case of timeout error, your snapshot_id is presented and you will downloaded it using download_snapshot() From 23f9f30f3e08d0b13dbb92c61a1d042211566a29 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 11:23:52 +0200 Subject: [PATCH 48/70] Update README.md fix spelling errors --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7010efe..1e2925a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ sdk-banner(1) -

Python SDK by Bright Data, Easy-to-use scalable methods for web search & scraping

+

Python SDK by Bright Data, Easy to use scalable methods for web search & scraping

## Installation @@ -10,11 +10,11 @@ To install the package, open your terminal: ```python pip install brightdata-sdk ``` -> If using macOS, first open a virtual environment for your project +> If using macOS, first open a virtual environment for your project. ## Quick Start -Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key +Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key. ### Initialize the Client @@ -25,7 +25,7 @@ client = bdclient(api_token="your_api_token_here") # can also be defined as BRIG ``` ### Launch first request -Add to your code a serp function +Add to your code a serp function. ```python results = client.search("best selling shoes") @@ -51,7 +51,7 @@ print(client.parse_content(results)) | **Client class** | `bdclient` | Handles authentication, automatic zone creation and managment, and options for robust error handling | **Parallel processing** | **all functions** | All functions use Concurrent processing for multiple URLs or queries, and support multiple Output Formats -### Try usig one of the functions +### Try using one of the functions #### `Search()` ```python @@ -202,14 +202,14 @@ client.download_content(data) ``` **`download_snapshot`** (for async requests) ```python -# Save this function to seperate file +# Save this function to a seperate file # Download snapshot snapshot_id = "your_snapshot_id_here" # <-- Replace with your actual snapshot ID client.download_snapshot(snapshot_id) # Insert your snapshot_id ``` > [!TIP] -> Hover over the "search" or each function in the package, to see all its available parameters. +> Hover over the "search" or each function in the package to see all its available parameters. ![Hover-Over1](https://github.com/user-attachments/assets/51324485-5769-48d5-8f13-0b534385142e) From 83780839fd27f06b28bae545f8f25b6c3a01e09e Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 11:31:05 +0200 Subject: [PATCH 49/70] Delete tests/__init__.py --- tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 From 367522928b6cd13c034726467a4ea93bc04a3a60 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 12:23:58 +0200 Subject: [PATCH 50/70] Update test_client.py add class TestClientSearchGPT --- tests/test_client.py | 120 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/tests/test_client.py b/tests/test_client.py index 51b1315..3618124 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -116,6 +116,124 @@ def mock_post(*args, **kwargs): request_data = captured_request.get('json', {}) assert "&brd_json=1" in request_data["url"] +class TestClientSearchGPT: + """tests for the client.search_gpt() function""" + + @pytest.fixture + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def client(self, mock_zones): + """Create a test client with mocked validation""" + with patch.dict(os.environ, {}, clear=True): + from brightdata import bdclient + client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False) + return client + + # VALIDATION TESTS + + def test_prompt_required(self, client): + """Ensure ValidationError is raised when prompt is missing""" + with pytest.raises(ValidationError, 
match="prompt is required"): + client.search_gpt(prompt=None) + + def test_invalid_country_format(self, client): + """Reject invalid country codes""" + with pytest.raises(ValidationError, match="must be 2-letter code"): + client.search_gpt(prompt="hi", country="USA") + + def test_websearch_bool_validation(self, client): + """Reject invalid webSearch parameter type""" + with pytest.raises(ValidationError, match="must be a boolean or list of booleans"): + client.search_gpt(prompt="hi", webSearch="yes") + + # PARAMETER NORMALIZATION + + def test_normalizes_single_values_to_list(self, client): + """Convert single parameters to list form""" + result = client.search_gpt( + prompt="hello", + country="US", + secondaryPrompt="follow up", + webSearch=False, + sync=True + ) + # The internal normalized payload should contain lists + assert isinstance(result["prompt"], list) + assert isinstance(result["country"], list) + assert isinstance(result["secondaryPrompt"], list) + assert isinstance(result["webSearch"], list) + + # MOCKED API CALL TESTS + + def test_sync_request_success(self, client, monkeypatch): + """Ensure sync request returns expected payload""" + mock_response = {"status": "ok", "data": "result"} + captured_payload = {} + + def mock_post(url, json=None, timeout=None): + captured_payload.update(json or {}) + from unittest.mock import Mock + r = Mock() + r.status_code = 200 + r.json.return_value = mock_response + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + + response = client.search_gpt(prompt="Hello", country="US", sync=True) + assert response == mock_response + assert captured_payload["url"] == "https://chatgpt.com" + assert "prompt" in captured_payload + assert "country" in captured_payload + + def test_async_request_timeout(self, client, monkeypatch): + """Ensure async mode uses correct timeout""" + captured_args = {} + + def mock_post(url, json=None, timeout=None): + captured_args["timeout"] = timeout + from unittest.mock import Mock + r = Mock() + r.status_code = 200 + r.json.return_value = {"ok": True} + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + + client.search_gpt(prompt="Async test", sync=False) + assert captured_args["timeout"] == 30 # default async timeout + + # ERROR AND RETRY HANDLING + + def test_retry_on_failure(self, client, monkeypatch): + """Test that request is retried on temporary failure""" + call_count = {"n": 0} + + def mock_post(url, json=None, timeout=None): + call_count["n"] += 1 + from unittest.mock import Mock + r = Mock() + r.status_code = 500 if call_count["n"] == 1 else 200 + r.json.return_value = {"ok": True} + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + result = client.search_gpt(prompt="retry", sync=True) + assert result["ok"] is True + assert call_count["n"] == 2 # retried once + + def test_raises_error_after_max_retries(self, client, monkeypatch): + """Ensure error is raised after exceeding retries""" + def mock_post(url, json=None, timeout=None): + from unittest.mock import Mock + r = Mock() + r.status_code = 500 + r.json.return_value = {"error": "server error"} + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + with pytest.raises(RuntimeError, match="Failed after retries"): + client.search_gpt(prompt="fail test", sync=True) + if __name__ == "__main__": - pytest.main([__file__]) \ No newline at end of file + pytest.main([__file__]) From b96e80911d5c29e95bc4ff6c9e5bb3f1af567551 Mon Sep 17 00:00:00 2001 From: Nadav 
Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 17:30:45 +0200 Subject: [PATCH 51/70] Update client.py add snapshot_id validation --- src/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/client.py b/src/client.py index 0933ade..eb03785 100644 --- a/src/client.py +++ b/src/client.py @@ -617,6 +617,9 @@ def download_snapshot( if not snapshot_id or not isinstance(snapshot_id, str): raise ValueError("The 'snapshot_id' parameter must be a non-empty string.") + + if not snapshot_id.startswith("s_"): + raise ValueError("Invalid 'snapshot_id' format. Expected an ID starting with 's_' (e.g., 's_m4x7enmven8djfqak').") # format validation From 65d11b684e5c52728fd8cf78af124bb587b4fff3 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Wed, 29 Oct 2025 17:45:33 +0200 Subject: [PATCH 52/70] Add files via upload --- pyproject.toml | 137 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 70 +++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 pyproject.toml create mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0991d9f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,137 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "brightdata-sdk" +version = "1.1.3" +description = "Python SDK for Bright Data Web Scraping and SERP APIs" +authors = [ + {name = "Bright Data", email = "support@brightdata.com"} +] +maintainers = [ + {name = "Bright Data", email = "idanv@brightdata.com"} +] +readme = "README.md" +license = {text = "MIT"} +keywords = ["brightdata", "web scraping", "proxy", "serp", "search", "data extraction"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", +] +requires-python = ">=3.8" +dependencies = [ + "requests>=2.25.0", + "python-dotenv>=0.19.0", + "aiohttp>=3.8.0", + "beautifulsoup4>=4.9.0", + "openai>=1.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=6.0.0", + "pytest-cov>=2.10.0", + "black>=21.0.0", + "isort>=5.0.0", + "flake8>=3.8.0", + "mypy>=0.900", +] +test = [ + "pytest>=6.0.0", + "pytest-cov>=2.10.0", +] + +[project.urls] +Homepage = "https://github.com/brightdata/bright-data-sdk-python" +Documentation = "https://github.com/brightdata/bright-data-sdk-python#readme" +Repository = "https://github.com/brightdata/bright-data-sdk-python" +"Bug Reports" = "https://github.com/brightdata/bright-data-sdk-python/issues" +Changelog = "https://github.com/brightdata/bright-data-sdk-python/blob/main/CHANGELOG.md" + +[tool.setuptools.packages.find] +include = ["brightdata*"] +exclude = ["tests*"] + +[tool.black] +line-length = 100 +target-version = ['py38', 'py39', 'py310', 'py311', 'py312'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 
+multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true + +[tool.flake8] +max-line-length = 100 +extend-ignore = ["E203", "W503"] +exclude = [ + ".git", + "__pycache__", + ".venv", + "venv", + "build", + "dist", + "*.egg-info" +] + +[tool.mypy] +python_version = "3.8" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +disallow_untyped_decorators = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +warn_unreachable = true +strict_equality = true + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = [ + "--strict-markers", + "--strict-config", + "--cov=brightdata", + "--cov-report=term-missing", + "--cov-report=html", + "--cov-report=xml", +] +testpaths = ["tests"] +filterwarnings = [ + "error", + "ignore::UserWarning", + "ignore::DeprecationWarning", +] \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a662168 --- /dev/null +++ b/setup.py @@ -0,0 +1,70 @@ +""" +Setup script for Bright Data SDK + +This file provides backward compatibility for tools that don't support pyproject.toml. +The main configuration is in pyproject.toml following modern Python packaging standards. +""" + +from setuptools import setup, find_packages +import os + +# Read the README file +def read_readme(): + with open("README.md", "r", encoding="utf-8") as fh: + return fh.read() + +# Read version from __init__.py +def read_version(): + with open(os.path.join("brightdata", "__init__.py"), "r", encoding="utf-8") as fh: + for line in fh: + if line.startswith("__version__"): + return line.split('"')[1] + return "1.0.0" + +setup( + name="brightdata-sdk", + version=read_version(), + author="Bright Data", + author_email="support@brightdata.com", + description="Python SDK for Bright Data Web Scraping and SERP APIs", + long_description=read_readme(), + long_description_content_type="text/markdown", + url="https://github.com/brightdata/brightdata-sdk-python", + packages=find_packages(), + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", + ], + python_requires=">=3.7", + install_requires=[ + "requests>=2.25.0", + "python-dotenv>=0.19.0", + ], + extras_require={ + "dev": [ + "pytest>=6.0.0", + "pytest-cov>=2.10.0", + "black>=21.0.0", + "isort>=5.0.0", + "flake8>=3.8.0", + ], + }, + keywords="brightdata, web scraping, proxy, serp, api, data extraction", + project_urls={ + "Bug Reports": "https://github.com/brightdata/brightdata-sdk-python/issues", + "Documentation": "https://github.com/brightdata/brightdata-sdk-python#readme", + "Source": "https://github.com/brightdata/brightdata-sdk-python", + }, +) \ No newline at end of file From d2383551bcab07deda4c3f7576cdfd72d8173b54 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 
Oct 2025 12:37:48 +0200 Subject: [PATCH 53/70] Update client.py fix imports --- src/client.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/client.py b/src/client.py index eb03785..14e988a 100644 --- a/src/client.py +++ b/src/client.py @@ -1,17 +1,18 @@ import os +import re +import time import json import requests from datetime import datetime -from typing import Union, Dict, Any, List - -from .api import WebScraper, SearchAPI -from .api.chatgpt import ChatGPTAPI -from .api.linkedin import LinkedInAPI, LinkedInScraper, LinkedInSearcher -from .api.download import DownloadAPI from .api.crawl import CrawlAPI +from .api.chatgpt import ChatGPTAPI from .api.extract import ExtractAPI -from .utils import ZoneManager, setup_logging, get_logger, parse_content +from .api.download import DownloadAPI +from .api import WebScraper, SearchAPI +from typing import Union, Dict, Any, List from .exceptions import ValidationError, AuthenticationError, APIError +from .api.linkedin import LinkedInAPI, LinkedInScraper, LinkedInSearcher +from .utils import ZoneManager, setup_logging, get_logger, parse_content def _get_version(): """Get version from __init__.py, cached at module import time.""" From eef631cf062a94967e6f246cbb324503c8467a0e Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:39:25 +0200 Subject: [PATCH 54/70] Update client.py --- src/client.py | 596 +++++++++++++++++--------------------------------- 1 file changed, 196 insertions(+), 400 deletions(-) diff --git a/src/client.py b/src/client.py index 14e988a..322b729 100644 --- a/src/client.py +++ b/src/client.py @@ -17,7 +17,6 @@ def _get_version(): """Get version from __init__.py, cached at module import time.""" try: - import os init_file = os.path.join(os.path.dirname(__file__), '__init__.py') with open(init_file, 'r', encoding='utf-8') as f: for line in f: @@ -76,6 +75,7 @@ def __init__( verbose: Enable verbose logging (default: False). Can also be set via BRIGHTDATA_VERBOSE env var. When False, only shows WARNING and above. When True, shows all logs per log_level. """ + try: from dotenv import load_dotenv load_dotenv() @@ -209,7 +209,8 @@ def scrape( """ ## Unlock and scrape websites using Bright Data Web Unlocker API - Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass. + Scrapes one or multiple websites using Bright Data's Web Unlocker and proxy network. + Automatically handles bot-detection, CAPTCHAs, and retries. 
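A minimal usage sketch of the call described here (the API token and URLs are placeholders; `zone` falls back to the client's default Web Unlocker zone when omitted):

```python
from brightdata import bdclient

client = bdclient(api_token="your-api-token")  # placeholder token

# Single URL, JSON response
result = client.scrape("https://example.com", response_format="json")

# Several URLs scraped concurrently, raw HTML back
results = client.scrape(
    url=["https://example.com/a", "https://example.com/b"],
    response_format="raw",
    max_workers=5,
)
```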
### Parameters: - `url` (str | List[str]): Single URL string or list of URLs to scrape @@ -226,50 +227,30 @@ def scrape( - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL - ### Example Usage: - ```python - # Single URL scraping - result = client.scrape( - url="https://example.com", - response_format="json" - ) - - # Multiple URLs scraping - urls = ["https://site1.com", "https://site2.com"] - results = client.scrape( - url=urls, - response_format="raw", - max_workers=5 - ) - ``` - ### Raises: - - `ValidationError`: Invalid URL format or empty URL list - - `AuthenticationError`: Invalid API token or insufficient permissions - - `APIError`: Request failed or server error + - `ValidationError`: Invalid URL or parameters + - `APIError`: Scraping failed (non-2xx response or server error) """ # URL validation if not url: - raise ValueError("The 'url' parameter cannot be None or empty.") + raise ValidationError("The 'url' parameter cannot be None or empty.") if isinstance(url, str): if not url.strip(): - raise ValueError("The 'url' string cannot be empty or whitespace.") + raise ValidationError("The 'url' string cannot be empty or whitespace.") elif isinstance(url, list): - if not all(isinstance(u, str) and u.strip() for u in url): - raise ValueError("All URLs in the list must be non-empty strings.") - else: - raise TypeError("The 'url' parameter must be a string or a list of strings.") - - zone = zone or self.web_unlocker_zone - max_workers = max_workers or self.DEFAULT_MAX_WORKERS - - return self.web_scraper.scrape( - url, zone, response_format, method, country, data_format, - async_request, max_workers, timeout + if len(url) == 0: + raise ValidationError("URL list cannot be empty") + if any((not isinstance(u, str) or not u.strip()) for u in url): + raise ValidationError("All URLs in the list must be non-empty strings") + + result = self.web_scraper.scrape( + url, zone or self.web_unlocker_zone, response_format, method, country, + data_format, async_request, max_workers or self.DEFAULT_MAX_WORKERS, timeout or self.DEFAULT_TIMEOUT ) + return result def search( self, @@ -286,10 +267,7 @@ def search( parse: bool = False ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: """ - ## Search the web using Bright Data SERP API - - Performs web searches through major search engines using Bright Data's proxy network - for reliable, bot-detection-free results. 
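A comparable sketch for the SERP call documented below (queries are placeholders; per the validation below, only google, bing, and yandex are accepted as engines):

```python
from brightdata import bdclient

client = bdclient(api_token="your-api-token")  # placeholder token

# Single query against the default engine (Google)
result = client.search("best laptops 2025")

# Batch of queries on an explicit engine; an unknown engine raises ValidationError
results = client.search(
    query=["python tutorials", "web scraping basics"],
    search_engine="bing",
    response_format="json",
    max_workers=3,
)
```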
+ ## Perform web search using Bright Data's SERP ### Parameters: - `query` (str | List[str]): Search query string or list of search queries @@ -308,55 +286,37 @@ def search( - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query - ### Example Usage: - ```python - # Single search query - result = client.search( - query="best laptops 2024", - search_engine="google", - response_format="json" - ) - - # Multiple search queries - queries = ["python tutorials", "machine learning courses", "web development"] - results = client.search( - query=queries, - search_engine="bing", - max_workers=3 - ) - ``` - - ### Supported Search Engines: - - `"google"` - Google Search - - `"bing"` - Microsoft Bing - - `"yandex"` - Yandex Search - ### Raises: - - `ValidationError`: Invalid search engine, empty query, or validation errors - - `AuthenticationError`: Invalid API token or insufficient permissions - - `APIError`: Request failed or server error + - `ValidationError`: Query is missing or invalid + - `APIError`: Search request failed or returned an error """ # Query validation + if not query: - raise ValueError("The 'query' parameter cannot be None or empty.") - + raise ValidationError("The 'query' parameter cannot be None or empty.") if isinstance(query, str): if not query.strip(): - raise ValueError("The 'query' string cannot be empty or whitespace.") + raise ValidationError("Search query cannot be empty or whitespace") elif isinstance(query, list): - if not all(isinstance(q, str) and q.strip() for q in query): - raise ValueError("All queries in the list must be non-empty strings.") - else: - raise TypeError("The 'query' parameter must be a string or a list of strings.") + if len(query) == 0: + raise ValidationError("Query list cannot be empty") + for q in query: + if not isinstance(q, str) or not q.strip(): + raise ValidationError("All queries in the list must be non-empty strings") + + # Validate search engine + + search_engine = (search_engine or "google").strip().lower() + valid_engines = ["google", "bing", "yandex"] + if search_engine not in valid_engines: + raise ValidationError(f"Invalid search engine '{search_engine}'. Valid options: {', '.join(valid_engines)}") zone = zone or self.serp_zone max_workers = max_workers or self.DEFAULT_MAX_WORKERS - return self.search_api.search( - query, search_engine, zone, response_format, method, country, - data_format, async_request, max_workers, timeout, parse - ) + result = self.search_api.search(query, search_engine, zone or self.serp_zone, response_format, method, parse, timeout or self.DEFAULT_TIMEOUT) + return result def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: """ @@ -369,19 +329,22 @@ def download_content(self, content: Union[Dict, str], filename: str = None, form parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False) ### Returns: - Path to the downloaded file + The file path of the saved file. 
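A short sketch of persisting a response to disk (the filename is a placeholder; the `format` argument follows the signature shown above):

```python
from brightdata import bdclient

client = bdclient(api_token="your-api-token")  # placeholder token

data = client.scrape("https://example.com", response_format="json")

# Write the result to a local file and get back the saved path
path = client.download_content(data, filename="example_scrape.json", format="json")
print(f"Saved to {path}")
```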
""" - return self.download_api.download_content(content, filename, format, parse) + if not content: + raise ValidationError("Content is empty or None") + return self.download_api.download_content(content, filename, response_format, parse) - def search_chatGPT( + def search_gpt( self, prompt: Union[str, List[str]], country: Union[str, List[str]] = None, - secondary_prompt: Union[str, List[str]] = None, + additional_prompt: Union[str, List[str]] = None, web_search: Union[bool, List[bool]] = False, sync: bool = True, - timeout: int = None + timeout: int = None, + **kwargs ) -> Dict[str, Any]: """ @@ -400,43 +363,34 @@ def search_chatGPT( ### Returns: - `Dict[str, Any]`: If sync=True, returns ChatGPT response data directly. If sync=False, returns response with snapshot_id for async processing - ### Example Usage: - ```python - # Single prompt (synchronous - returns data immediately) - result = client.search_chatGPT(prompt="Top hotels in New York") - - # Multiple prompts (synchronous - returns data immediately) - result = client.search_chatGPT( - prompt=["Top hotels in New York", "Best restaurants in Paris", "Tourist attractions in Tokyo"], - additional_prompt=["Are you sure?", "", "What about hidden gems?"] - ) - - # Asynchronous with web search enabled (returns snapshot_id) - result = client.search_chatGPT( - prompt="Latest AI developments", - web_search=True, - sync=False - ) - # Snapshot ID is automatically printed for async requests - ``` - ### Raises: - `ValidationError`: Invalid prompt or parameters - `AuthenticationError`: Invalid API token or insufficient permissions - `APIError`: Request failed or server error """ - if isinstance(prompt, str): - prompts = [prompt] - elif isinstance(prompt, list) and all(isinstance(p, str) for p in prompt): - prompts = prompt - else: - raise ValidationError("Invalid prompt input: must be a non-empty string or list of strings.") - - if not prompts: - raise ValidationError("At least one prompt is required.") + + # Handle alternate parameter names from kwargs + + if 'secondaryPrompt' in kwargs: + additional_prompt = kwargs.pop('secondaryPrompt') + if 'additionalPrompt' in kwargs: + additional_prompt = kwargs.pop('additionalPrompt') + if 'webSearch' in kwargs: + web_search = kwargs.pop('webSearch') + + # Validate prompt input + + if (isinstance(prompt, list) and len(prompt) == 0) or prompt is None: + raise ValidationError("prompt is required") + # Ensure prompts list + + prompts = prompt if isinstance(prompt, list) else [prompt] + + # Validate each prompt is a non-empty string + for p in prompts: - if not p or not isinstance(p, str): + if not isinstance(p, str) or not p.strip(): raise ValidationError("All prompts must be non-empty strings") def normalize_param(param, name): @@ -444,45 +398,95 @@ def normalize_param(param, name): return [None] * len(prompts) if isinstance(param, list): if len(param) != len(prompts): - raise ValidationError(f"{name} list must have the same length as prompts.") + raise ValidationError(f"Length of {name} list must match number of prompts") return param return [param] * len(prompts) countries = normalize_param(country, "country") - secondary_prompts = normalize_param(secondary_prompt, "secondary_prompt") + followups = normalize_param(additional_prompt, "additional_prompt") web_searches = normalize_param(web_search, "web_search") - for c in countries: - if c and not re.match(r"^[A-Z]{2}$", c): - raise ValidationError(f"Invalid country code '{c}'. 
Must be 2 uppercase letters.") - for s in secondary_prompts: - if s is not None and not isinstance(s, str): - raise ValidationError("Secondary prompts must be strings.") - for w in web_searches: - if not isinstance(w, bool): - raise ValidationError("Web search flags must be boolean.") - if timeout is not None and (not isinstance(timeout, int) or timeout <= 0): - raise ValidationError("Timeout must be a positive integer.") + # Validate country codes + for i, c in enumerate(countries): + if c is None or str(c).strip() == "": + countries[i] = "" + else: + if not isinstance(c, str) or len(c.strip()) != 2 or not c.strip().isalpha(): + raise ValidationError("Country code must be a 2-letter code (ISO 3166-1 alpha-2)") + countries[i] = c.strip().lower() + # Validate follow-up prompts + for i, f in enumerate(followups): + if f is None: + followups[i] = "" + elif not isinstance(f, str): + raise ValidationError("All follow-up prompts must be strings") + else: + followups[i] = f.strip() + # Validate web_search flags + for i, w in enumerate(web_searches): + if w is None: + web_searches[i] = False + elif not isinstance(w, bool): + raise ValidationError("must be a boolean or list of booleans") - timeout = timeout or (65 if sync else 30) - - # Retry logic - max_retries = 3 - for attempt in range(max_retries): + timeout_value = timeout if timeout is not None else (65 if sync else 30) + + if timeout is not None: + if not isinstance(timeout, int): + raise ValidationError("Timeout must be an integer") + if timeout <= 0: + raise ValidationError("Timeout must be greater than 0 seconds") + if timeout > 300: + raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)") + # Prepare request payload + tasks = [] + for i in range(len(prompts)): + task = { + "url": "https://chatgpt.com", + "prompt": prompts[i].strip(), + "country": countries[i] or "", + "additional_prompt": followups[i] or "", + "web_search": bool(web_searches[i]) + } + tasks.append(task) + payload_data = tasks[0] if len(tasks) == 1 else tasks + # Make API request with retries + endpoint = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger" + params = { + "dataset_id": "gd_m7aof0k82r803d5bjm", + "include_errors": "true" + } + last_exception = None + for attempt in range(self.MAX_RETRIES + 1): try: - return self.chatgpt_api.scrape_chatgpt( - prompts=prompts, - countries=countries, - secondary_prompts=secondary_prompts, - web_searches=web_searches, - sync=sync, - timeout=timeout - ) - except APIError as e: - if attempt < max_retries - 1: - time.sleep(2) - continue - raise e + response = self.session.post(endpoint, params=params, json=payload_data, timeout=timeout_value) + except requests.exceptions.RequestException as e: + last_exception = e + if attempt >= self.MAX_RETRIES: + raise NetworkError(f"Network error: {e}") + # Retry on network errors + time.sleep(self.RETRY_BACKOFF_FACTOR ** attempt) + continue + if response.status_code == 401: + raise AuthenticationError("Invalid API token or unauthorized") + if response.status_code in self.RETRY_STATUSES: + if attempt >= self.MAX_RETRIES: + raise RuntimeError("Failed after retries") + time.sleep(self.RETRY_BACKOFF_FACTOR ** attempt) + continue + if response.status_code != 200: + raise APIError(f"ChatGPT search failed with status {response.status_code}: {response.text}", status_code=response.status_code, response_text=getattr(response, 'text', '')) + # Success + result_data = response.json() + if sync: + return result_data + snapshot_id = 
result_data.get("snapshot_id") or result_data.get("id") + if snapshot_id: + print(f"Snapshot ID: {snapshot_id}") + return {"snapshot_id": snapshot_id} + else: + raise APIError("Failed to retrieve snapshot ID from response", status_code=response.status_code, response_text=response.text) + @property def scrape_linkedin(self): @@ -569,7 +573,7 @@ def search_linkedin(self): def download_snapshot( self, snapshot_id: str, - format: str = "json", + response_format: str = "json", compress: bool = False, batch_size: int = None, part: int = None @@ -582,7 +586,7 @@ def download_snapshot( ### Parameters: - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required) - - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json") + - `response_format` (str, optional): Format of the output data: "json", "csv", "ndjson", "jsonl" (default: "json") - `compress` (bool, optional): Whether the result should be compressed (default: False) - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000) - `part` (int, optional): If batch_size provided, specify which part to download @@ -591,36 +595,18 @@ def download_snapshot( - `Union[Dict, List, str]`: Snapshot data in the requested format, OR - `Dict`: Status response if snapshot is not ready yet (status="not_ready") - ### Example Usage: - ```python - # Download complete snapshot - result = client.download_snapshot("s_m4x7enmven8djfqak") - - # Check if snapshot is ready - if isinstance(result, dict) and result.get('status') == 'not_ready': - print(f"Not ready: {result['message']}") - # Try again later - else: - # Snapshot data is ready - data = result - - # Download as CSV format - csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv") - ``` - + ### Raises: - `ValidationError`: Invalid parameters or snapshot_id format - - `AuthenticationError`: Invalid API token or insufficient permissions - `APIError`: Request failed, snapshot not found, or server error """ # snapshot_id validation if not snapshot_id or not isinstance(snapshot_id, str): - raise ValueError("The 'snapshot_id' parameter must be a non-empty string.") - + raise ValidationError("The 'snapshot_id' parameter must be a non-empty string.") if not snapshot_id.startswith("s_"): - raise ValueError("Invalid 'snapshot_id' format. Expected an ID starting with 's_' (e.g., 's_m4x7enmven8djfqak').") + raise ValidationError("Invalid 'snapshot_id' format. Expected an ID starting with 's_' (e.g., 's_m4x7enmven8djfqak').") # format validation @@ -630,7 +616,7 @@ def download_snapshot( f"Invalid 'format' value: '{format}'. Must be one of {sorted(allowed_formats)}." ) - return self.download_api.download_snapshot(snapshot_id, format, compress, batch_size, part) + return self.download_api.download_snapshot(snapshot_id, response_format, compress, batch_size, part) def list_zones(self) -> List[Dict[str, Any]]: @@ -649,38 +635,16 @@ def connect_browser(self) -> str: Returns the WebSocket endpoint URL that can be used with Playwright or Selenium to connect to Bright Data's scraping browser service. - ** Security Warning:** The returned URL includes your browser credentials in plain text. - Avoid logging, sharing, or exposing this URL in publicly accessible places. Treat it as sensitive information. - + **Security Warning:** The returned URL contains authentication credentials. Do not share this URL or expose it publicly. 
### Returns: - WebSocket endpoint URL string for browser connection - - ### Example Usage: - ```python - # For Playwright (default) - client = bdclient( - api_token="your_token", - browser_username="username-zone-browser_zone1", - browser_password="your_password", - browser_type="playwright" # Playwright/ Puppeteer (default) - ) - endpoint_url = client.connect_browser() # Returns: wss://...@brd.superproxy.io:9222 - - # For Selenium - client = bdclient( - api_token="your_token", - browser_username="username-zone-browser_zone1", - browser_password="your_password", - browser_type="selenium" - ) - endpoint_url = client.connect_browser() # Returns: https://...@brd.superproxy.io:9515 - ``` + WebSocket URL (str) for connecting to the browser (contains one-time token) ### Raises: - - `ValidationError`: Browser credentials not provided or invalid - - `AuthenticationError`: Invalid browser credentials + - `AuthenticationError`: If the API token or browser zone credentials are invalid + - `APIError`: If retrieving the browser endpoint fails """ + if not self.browser_username or not self.browser_password: logger.error("Browser credentials not configured") raise ValidationError( @@ -727,95 +691,45 @@ def crawl( starting from the specified URL(s). Returns a snapshot_id for tracking the crawl progress. ### Parameters: - - `url` (str | List[str]): Domain URL(s) to crawl (required) - - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling - - `depth` (int, optional): Maximum depth to crawl relative to the entered URL - - `include_filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/") - - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/") - - `custom_output_fields` (List[str], optional): Custom output schema fields to include - - `include_errors` (bool, optional): Include errors in response (default: True) + - `url` (str | List[str]): Starting URL or URLs to crawl + - `ignore_sitemap` (bool, optional): If True, ignore site's sitemap (default: False) + - `depth` (int, optional): Maximum crawl depth (number of hops from start URL) + - `include_filter` (str, optional): Only crawl URLs that include this substring (default: None) + - `exclude_filter` (str, optional): Do not crawl URLs that include this substring + - `custom_output_fields` (List[str], optional): Additional data fields to return (e.g., ["markdown","text","title"]) + - `include_errors` (bool, optional): If True, include pages that errored in results (default: True) ### Returns: - - `Dict[str, Any]`: Crawl response with snapshot_id for tracking - - ### Example Usage: - ```python - # Single URL crawl - result = client.crawl("https://example.com/") - snapshot_id = result['snapshot_id'] - - # Multiple URLs with filters - urls = ["https://example.com/", "https://example2.com/"] - result = client.crawl( - url=urls, - include_filter="/product/", - exclude_filter="/ads/", - depth=2, - ignore_sitemap=True - ) - - # Custom output schema - result = client.crawl( - url="https://example.com/", - custom_output_fields=["markdown", "url", "page_title"] - ) - - # Download results using snapshot_id - data = client.download_snapshot(result['snapshot_id']) - ``` - - ### Available Output Fields: - - `markdown` - Page content in markdown format - - `url` - Page URL - - `html2text` - Page content as plain text - - `page_html` - Raw HTML content - - `ld_json` - Structured data (JSON-LD) - - `page_title` - Page title - - `timestamp` - Crawl timestamp - - `input` - Input parameters used 
- - `discovery_input` - Discovery parameters - - `error` - Error information (if any) - - `error_code` - Error code (if any) - - `warning` - Warning information (if any) - - `warning_code` - Warning code (if any) + - A dict containing the crawl job details, including a `snapshot_id` to retrieve results via download_snapshot() ### Raises: - - `ValidationError`: Invalid URL or parameters - - `AuthenticationError`: Invalid API token or insufficient permissions - - `APIError`: Request failed or server error + - `ValidationError`: Missing URL or invalid parameters + - `APIError`: Crawl request failed """ # URL validation if not url: - raise ValueError("The 'url' parameter cannot be None or empty.") - + raise ValidationError("The 'url' parameter cannot be None or empty.") if isinstance(url, str): if not url.strip(): - raise ValueError("The 'url' string cannot be empty or whitespace.") + raise ValidationError("The 'url' string cannot be empty or whitespace.") elif isinstance(url, list): - if not all(isinstance(u, str) and u.strip() for u in url): - raise ValueError("All URLs in the list must be non-empty strings.") - else: - raise TypeError("The 'url' parameter must be a string or a list of strings.") - - # Depth validation - + if len(url) == 0: + raise ValidationError("URL list cannot be empty") + for u in url: + if not isinstance(u, str) or not u.strip(): + raise ValidationError("All URLs in the list must be non-empty strings") if depth is not None: if not isinstance(depth, int): - raise TypeError("The 'depth' parameter must be an integer.") + raise ValidationError("Depth must be an integer") if depth <= 0: - raise ValueError("The 'depth' parameter must be a positive integer.") - - return self.crawl_api.crawl( - url=url, - ignore_sitemap=ignore_sitemap, - depth=depth, - include_filter=include_filter, - exclude_filter=exclude_filter, - custom_output_fields=custom_output_fields, - include_errors=include_errors + raise ValidationError("The 'depth' parameter must be a positive integer.") + + result = self.crawl_api.crawl( + url, ignore_sitemap, depth, include_filter, exclude_filter, custom_output_fields, include_errors ) + return result def parse_content( self, @@ -839,30 +753,8 @@ def parse_content( ### Returns: - `Dict[str, Any]`: Parsed content for single results - `List[Dict[str, Any]]`: List of parsed content for multiple results (auto-detected) - - ### Example Usage: - ```python - # Parse single URL results - scraped_data = client.scrape("https://example.com") - parsed = client.parse_content(scraped_data, extract_text=True, extract_links=True) - print(f"Title: {parsed['title']}") - - # Parse multiple URL results (auto-detected) - scraped_data = client.scrape(["https://example1.com", "https://example2.com"]) - parsed_list = client.parse_content(scraped_data, extract_text=True) - for result in parsed_list: - print(f"Title: {result['title']}") - ``` - - ### Available Fields in Each Result: - - `type`: 'json' or 'html' - indicates the source data type - - `text`: Cleaned text content (if extract_text=True) - - `links`: List of {'url': str, 'text': str} objects (if extract_links=True) - - `images`: List of {'url': str, 'alt': str} objects (if extract_images=True) - - `title`: Page title (if available) - - `raw_length`: Length of original content - - `structured_data`: Original JSON data (if type='json') """ + return parse_content( data=data, extract_text=extract_text, @@ -878,125 +770,29 @@ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: from web pages based on 
natural language queries. Automatically parses URLs and optimizes content for efficient LLM processing. - ** LLM Key Notice:** If `llm_key` is not provided, the method will attempt to read - the BRIGHTDATA API key from the `BRIGHTDATA_API_TOKEN` environment variable. Ensure it is set. + **LLM Key Notice:** If `llm_key` is not provided, the method will attempt to read + the OpenAI API key from the `OPENAI_API_KEY` environment variable. Ensure it is set. ### Parameters: - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, - this becomes the pure extraction query. If `url` is not provided, this should include - the URL (e.g. "extract the most recent news from cnn.com") - - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction - from query and sends these URLs to the web unlocker API - - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response. - Uses OpenAI's Structured Outputs for reliable type-safe responses. - Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"]} - - `llm_key` (str, optional): OpenAI API key. If not provided, uses OPENAI_API_KEY env variable - + the extraction will run on that URL. Otherwise, a prior scrape result should be provided. + - `url` (str | List[str], optional): Target page URL(s) to extract information from. Can be omitted if using a prior result. + - `output_scheme` (Dict, optional): JSON schema defining the structure of desired output (keys and value types) + - `llm_key` (str, optional): OpenAI API key for LLM usage (if not provided, will use environment variable) + ### Returns: - - `str`: Extracted content (also provides access to metadata via attributes) - - ### Example Usage: - ```python - # Using URL parameter with structured output (new) - result = client.extract( - query="extract the most recent news headlines", - url="https://cnn.com", - output_scheme={ - "type": "object", - "properties": { - "headlines": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "date": {"type": "string"} - }, - "required": ["title", "date"] - } - } - }, - "required": ["headlines"] - } - ) - print(result) # Prints the extracted news content - - # Using URL in query (original behavior) - result = client.extract("extract the most recent news from cnn.com") - - # Multiple URLs with structured schema - result = client.extract( - query="extract main headlines", - url=["https://cnn.com", "https://bbc.com"], - output_scheme={ - "type": "object", - "properties": { - "sources": { - "type": "array", - "items": { - "type": "object", - "properties": { - "source_name": {"type": "string"}, - "headlines": {"type": "array", "items": {"type": "string"}} - }, - "required": ["source_name", "headlines"] - } - } - }, - "required": ["sources"] - } - ) - - # Access metadata attributes - print(f"Source: {result.url}") - print(f"Title: {result.source_title}") - print(f"Tokens used: {result.token_usage['total_tokens']}") - - # Use with custom OpenAI key - result = client.extract( - query="get the price and description", - url="https://amazon.com/dp/B079QHML21", - llm_key="your-openai-api-key" - ) - ``` - - ### Environment Variable Setup: - ```bash - # Set in .env file - OPENAI_API_KEY=your-openai-api-key - ``` - - ### Available Attributes: - ```python - result = client.extract("extract news from cnn.com") - - # String value (default behavior) 
- str(result) # Extracted content - - # Metadata attributes - result.query # 'extract news' - result.url # 'https://www.cnn.com' - result.source_title # 'CNN - Breaking News...' - result.content_length # 1234 - result.token_usage # {'total_tokens': 2998, ...} - result.success # True - result.metadata # Full metadata dictionary - ``` + - `str`: The extracted information as a text string (may contain JSON or markdown depending on query and output_scheme) ### Raises: - - `ValidationError`: Invalid query format, missing URL, or invalid LLM key - - `APIError`: Web scraping failed or LLM processing error + - `ValidationError`: Missing query, missing URL, or invalid LLM key """ # Validate LLM key - if llm_key is None: - import os - llm_key = os.getenv("BRIGHTDATA_API_TOKEN") - if not llm_key: - raise ValidationError( - "Missing API key. Provide it via the `llm_key` parameter or set the " - "`BRIGHTDATA_API_TOKEN` environment variable. Example:\n\n" - "export BRIGHTDATA_API_TOKEN='your-openai-api-key'" - ) + if not llm_key: + raise ValidationError( + "Missing API key. Provide it via the `llm_key` parameter or set the " + "`BRIGHTDATA_API_TOKEN` environment variable. Example:\n\n" + "export BRIGHTDATA_API_TOKEN='your-openai-api-key'" + ) return self.extract_api.extract(query, url, output_scheme, llm_key) From 23216b4e94569625e40fb84199a0f9d660abfaa3 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:42:54 +0200 Subject: [PATCH 55/70] Update test_client.py --- tests/test_client.py | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index 3618124..fb57809 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -15,11 +15,10 @@ - Testing validation logic and error messages """ -import pytest import os -from unittest.mock import patch - +import pytest from brightdata import bdclient +from unittest.mock import patch from brightdata.exceptions import ValidationError @@ -117,18 +116,17 @@ def mock_post(*args, **kwargs): assert "&brd_json=1" in request_data["url"] class TestClientSearchGPT: - """tests for the client.search_gpt() function""" + """Tests for the client.search_gpt() function""" @pytest.fixture @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') def client(self, mock_zones): """Create a test client with mocked validation""" with patch.dict(os.environ, {}, clear=True): - from brightdata import bdclient client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False) return client - # VALIDATION TESTS + # VALIDATION TESTS def test_prompt_required(self, client): """Ensure ValidationError is raised when prompt is missing""" @@ -147,8 +145,32 @@ def test_websearch_bool_validation(self, client): # PARAMETER NORMALIZATION - def test_normalizes_single_values_to_list(self, client): + def test_normalizes_single_values_to_list(self, client, monkeypatch): """Convert single parameters to list form""" + # Mock the session.post to intercept and provide dummy response + def dummy_post(url, json=None, timeout=None): + from unittest.mock import Mock + r = Mock() + r.status_code = 200 + # Build dummy response with normalized lists + if isinstance(json, list): + prompts = [item.get('prompt', '') for item in json] + countries = [item.get('country', '') for item in json] + sec_prompts = [item.get('additional_prompt', '') for item in json] + web_searches = [item.get('web_search', False) for item in 
json] + else: + prompts = [json.get('prompt', '')] + countries = [json.get('country', '')] + sec_prompts = [json.get('additional_prompt', '')] + web_searches = [json.get('web_search', False)] + r.json.return_value = { + 'prompt': prompts, + 'country': countries, + 'secondaryPrompt': sec_prompts, + 'webSearch': web_searches + } + return r + monkeypatch.setattr(client.search_api.session, 'post', dummy_post) result = client.search_gpt( prompt="hello", country="US", @@ -194,7 +216,7 @@ def mock_post(url, json=None, timeout=None): from unittest.mock import Mock r = Mock() r.status_code = 200 - r.json.return_value = {"ok": True} + r.json.return_value = {"id": "s_testid"} return r monkeypatch.setattr(client.search_api.session, "post", mock_post) @@ -233,7 +255,3 @@ def mock_post(url, json=None, timeout=None): monkeypatch.setattr(client.search_api.session, "post", mock_post) with pytest.raises(RuntimeError, match="Failed after retries"): client.search_gpt(prompt="fail test", sync=True) - - -if __name__ == "__main__": - pytest.main([__file__]) From 8b3bb165d0b9c2b8789516701b102238e6ad2e7e Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:51:22 +0200 Subject: [PATCH 56/70] Update README.md --- README.md | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1e2925a..e6f09aa 100644 --- a/README.md +++ b/README.md @@ -38,10 +38,10 @@ print(client.parse_content(results)) | Feature | Functions | Description |--------------------------|-----------------------------|------------------------------------- -| **Scrape every website** | `scrape` | Scrape every website using Bright's scraping and unti bot-detection capabilities -| **Web search** | `search` | Search google and other search engines by query (supports batch searches) +| **Scrape any website** | `scrape` | Scrape every website using Bright's scraping and unti bot-detection capabilities +| **Web search(SERP)** | `search` | Search google and other search engines by query (supports batch searches) | **Web crawling** | `crawl` | Discover and scrape multiple pages from websites with advanced filtering and depth control -| **AI-powered extraction** | `extract` | Extract specific information from websites using natural language queries and OpenAI +| **AI extraction** | `extract` | Extract specific information from websites using natural language queries and OpenAI | **Content parsing** | `parse_content` | Extract text, links, images and structured data from API responses (JSON or HTML) | **Browser automation** | `connect_browser` | Get WebSocket endpoint for Playwright/Selenium integration with Bright Data's scraping browser | **Search chatGPT** | `search_chatGPT` | Prompt chatGPT and scrape its answers, support multiple inputs and follow-up prompts @@ -55,7 +55,7 @@ print(client.parse_content(results)) #### `Search()` ```python -# Simple single query search +# Single query search result = client.search("pizza restaurants") # Try using multiple queries (parallel processing), with custom configuration @@ -69,7 +69,7 @@ results = client.search( ``` #### `scrape()` ```python -# Simple single URL scrape +# Single URL scrape result = client.scrape("https://example.com") # Multiple URLs (parallel processing) with custom options @@ -83,13 +83,20 @@ results = client.scrape( ``` #### `search_chatGPT()` ```python -result = client.search.chatGPT( +# Sync mode (immediate result) +result = client.search_gpt( prompt="Top startups in Tel 
Aviv", country="IL", web_search=True ) +print(result) -client.download_content(result) # In case of timeout error, your snapshot_id is presented and you will downloaded it using download_snapshot() +# Async mode (retrieve snapshot later) +result = client.search_gpt( + prompt="Top startups in 2024", + sync=False +) +print(result["snapshot_id"]) ``` #### `search_linkedin.` From 4e376a035ceef10a0acc41faa39fe35019db955d Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:04:54 +0200 Subject: [PATCH 57/70] Update setup.py --- setup.py | 136 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/setup.py b/setup.py index a662168..319fb0b 100644 --- a/setup.py +++ b/setup.py @@ -1,70 +1,86 @@ -""" -Setup script for Bright Data SDK +# """ +# Setup script for Bright Data SDK -This file provides backward compatibility for tools that don't support pyproject.toml. -The main configuration is in pyproject.toml following modern Python packaging standards. -""" +# This file provides backward compatibility for tools that don't support pyproject.toml. +# The main configuration is in pyproject.toml following modern Python packaging standards. +# """ -from setuptools import setup, find_packages -import os +# from setuptools import setup, find_packages +# import os + +# # Read the README file +# def read_readme(): +# with open("README.md", "r", encoding="utf-8") as fh: +# return fh.read() -# Read the README file -def read_readme(): - with open("README.md", "r", encoding="utf-8") as fh: - return fh.read() +# # Read version from __init__.py +# def read_version(): +# with open(os.path.join("brightdata", "__init__.py"), "r", encoding="utf-8") as fh: +# for line in fh: +# if line.startswith("__version__"): +# return line.split('"')[1] +# return "1.0.0" -# Read version from __init__.py -def read_version(): - with open(os.path.join("brightdata", "__init__.py"), "r", encoding="utf-8") as fh: - for line in fh: - if line.startswith("__version__"): - return line.split('"')[1] - return "1.0.0" +# setup( +# name="brightdata-sdk", +# version=read_version(), +# author="Bright Data", +# author_email="support@brightdata.com", +# description="Python SDK for Bright Data Web Scraping and SERP APIs", +# long_description=read_readme(), +# long_description_content_type="text/markdown", +# url="https://github.com/brightdata/brightdata-sdk-python", +# packages=find_packages(), +# classifiers=[ +# "Development Status :: 4 - Beta", +# "Intended Audience :: Developers", +# "License :: OSI Approved :: MIT License", +# "Operating System :: OS Independent", +# "Programming Language :: Python :: 3", +# "Programming Language :: Python :: 3.7", +# "Programming Language :: Python :: 3.8", +# "Programming Language :: Python :: 3.9", +# "Programming Language :: Python :: 3.10", +# "Programming Language :: Python :: 3.11", +# "Programming Language :: Python :: 3.12", +# "Topic :: Internet :: WWW/HTTP", +# "Topic :: Software Development :: Libraries :: Python Modules", +# "Topic :: Internet :: WWW/HTTP :: Indexing/Search", +# ], +# python_requires=">=3.7", +# install_requires=[ +# "requests>=2.25.0", +# "python-dotenv>=0.19.0", +# ], +# extras_require={ +# "dev": [ +# "pytest>=6.0.0", +# "pytest-cov>=2.10.0", +# "black>=21.0.0", +# "isort>=5.0.0", +# "flake8>=3.8.0", +# ], +# }, +# keywords="brightdata, web scraping, proxy, serp, api, data extraction", +# project_urls={ +# "Bug Reports": 
"https://github.com/brightdata/brightdata-sdk-python/issues", +# "Documentation": "https://github.com/brightdata/brightdata-sdk-python#readme", +# "Source": "https://github.com/brightdata/brightdata-sdk-python", +# }, +# ) + +from setuptools import setup, find_packages setup( - name="brightdata-sdk", - version=read_version(), - author="Bright Data", - author_email="support@brightdata.com", - description="Python SDK for Bright Data Web Scraping and SERP APIs", - long_description=read_readme(), - long_description_content_type="text/markdown", - url="https://github.com/brightdata/brightdata-sdk-python", - packages=find_packages(), - classifiers=[ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Internet :: WWW/HTTP", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Internet :: WWW/HTTP :: Indexing/Search", - ], - python_requires=">=3.7", + name="brightdata", + version="0.1.0", + package_dir={"": "src"}, + packages=find_packages(where="src"), install_requires=[ "requests>=2.25.0", "python-dotenv>=0.19.0", + "aiohttp>=3.8.0", + "beautifulsoup4>=4.9.0", + "openai>=1.0.0" ], - extras_require={ - "dev": [ - "pytest>=6.0.0", - "pytest-cov>=2.10.0", - "black>=21.0.0", - "isort>=5.0.0", - "flake8>=3.8.0", - ], - }, - keywords="brightdata, web scraping, proxy, serp, api, data extraction", - project_urls={ - "Bug Reports": "https://github.com/brightdata/brightdata-sdk-python/issues", - "Documentation": "https://github.com/brightdata/brightdata-sdk-python#readme", - "Source": "https://github.com/brightdata/brightdata-sdk-python", - }, -) \ No newline at end of file +) From db9aee3ebadcc565d30b429976c244e5ffe8466c Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:07:45 +0200 Subject: [PATCH 58/70] Update client.py --- src/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client.py b/src/client.py index 322b729..891cb14 100644 --- a/src/client.py +++ b/src/client.py @@ -315,7 +315,7 @@ def search( zone = zone or self.serp_zone max_workers = max_workers or self.DEFAULT_MAX_WORKERS - result = self.search_api.search(query, search_engine, zone or self.serp_zone, response_format, method, parse, timeout or self.DEFAULT_TIMEOUT) + result = self.search_api.search(query, search_engine, zone or self.serp_zone, response_format, method, parse, timeout or self.DEFAULT_TIMEOUT) return result def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: From 96ae5275e2a35159bbd3c347bf25ac83df7183f1 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:22:29 +0200 Subject: [PATCH 59/70] Update client.py --- src/client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/client.py b/src/client.py index 891cb14..b354a83 100644 --- a/src/client.py +++ b/src/client.py @@ -294,7 +294,7 @@ def search( # Query validation if not query: - raise ValidationError("The 'query' parameter cannot be None or empty.") + raise 
ValidationError("query cannot be empty") if isinstance(query, str): if not query.strip(): raise ValidationError("Search query cannot be empty or whitespace") @@ -315,7 +315,9 @@ def search( zone = zone or self.serp_zone max_workers = max_workers or self.DEFAULT_MAX_WORKERS - result = self.search_api.search(query, search_engine, zone or self.serp_zone, response_format, method, parse, timeout or self.DEFAULT_TIMEOUT) + result = self.search_api.search(query, search_engine, zone or self.serp_zone, + response_format, method, parse, timeout or self.DEFAULT_TIMEOUT) + return result def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: @@ -412,7 +414,7 @@ def normalize_param(param, name): countries[i] = "" else: if not isinstance(c, str) or len(c.strip()) != 2 or not c.strip().isalpha(): - raise ValidationError("Country code must be a 2-letter code (ISO 3166-1 alpha-2)") + raise ValidationError("must be 2-letter code") countries[i] = c.strip().lower() # Validate follow-up prompts for i, f in enumerate(followups): @@ -459,7 +461,7 @@ def normalize_param(param, name): last_exception = None for attempt in range(self.MAX_RETRIES + 1): try: - response = self.session.post(endpoint, params=params, json=payload_data, timeout=timeout_value) + response = self.session.post(endpoint, json=payload_data, timeout=timeout_value) except requests.exceptions.RequestException as e: last_exception = e if attempt >= self.MAX_RETRIES: From b60f9942cbe8c7d27c611dc92db1902cff657d82 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:26:10 +0200 Subject: [PATCH 60/70] Update client.py --- src/client.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/client.py b/src/client.py index b354a83..fcd581f 100644 --- a/src/client.py +++ b/src/client.py @@ -315,8 +315,19 @@ def search( zone = zone or self.serp_zone max_workers = max_workers or self.DEFAULT_MAX_WORKERS - result = self.search_api.search(query, search_engine, zone or self.serp_zone, - response_format, method, parse, timeout or self.DEFAULT_TIMEOUT) + result = self.search_api.search( + query=query, + search_engine=search_engine, + zone=zone or self.serp_zone, + response_format=response_format, + method=method, + country=country, + data_format=data_format, + async_request=async_request, + max_workers=max_workers, + timeout=timeout or self.DEFAULT_TIMEOUT, + parse=parse, + ) return result From 6929beb5cac09713559e470a129a304d24424cd5 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 16:48:15 +0200 Subject: [PATCH 61/70] Create search.py --- src/search.py | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 src/search.py diff --git a/src/search.py b/src/search.py new file mode 100644 index 0000000..92aa93f --- /dev/null +++ b/src/search.py @@ -0,0 +1,146 @@ +from typing import Union, List, Dict, Any +import re +import time + +from .exceptions import ValidationError, APIError + + +class Search: + + def __init__(self, client) -> None: + # Hold a reference to the root client for shared config / APIs. 
+ self._c = client + + # ---------- GPT ---------- + def gpt( + self, + prompt: Union[str, List[str]], + country: Union[str, List[str]] = None, + secondaryPrompt: Union[str, List[str]] = None, + webSearch: Union[bool, List[bool]] = False, + sync: bool = True, + timeout: int = None, + ) -> Dict[str, Any]: + """ + Query ChatGPT via Bright Data's dataset API. + + Parameters: + + - prompt : str | list[str] - Prompt(s) to send. + - country : str | list[str], optional - 2-letter uppercase ISO code per prompt (e.g., "US"). May be None. + - secondaryPrompt : str | list[str], optional - Follow-up prompt(s) per item. + - webSearch : bool | list[bool], default False - Enable ChatGPT web search (per prompt if list). + - sync : bool, default True - return results immediately. False: return a snapshot_id to poll later. + - timeout : int, optional - Default 65s (sync) / 30s (async). + + Returns: dict | list + + """ + + # normalize prompts + if isinstance(prompt, str): + prompts = [prompt] + elif isinstance(prompt, list) and all(isinstance(p, str) for p in prompt): + prompts = prompt + else: + raise ValidationError("Invalid prompt input: must be a non-empty string or list of strings.") + + if not prompts: + raise ValidationError("At least one prompt is required.") + + # helper for normalization + def _normalize(param, name): + if param is None: + return [None] * len(prompts) + if isinstance(param, list): + if len(param) != len(prompts): + raise ValidationError(f"{name} list must have the same length as prompts.") + return param + return [param] * len(prompts) + + countries = _normalize(country, "country") + secondary_prompts = _normalize(secondaryPrompt, "secondary_prompt") + web_searches = _normalize(webSearch, "web_search") + + # validation + for c in countries: + if c and not re.match(r"^[A-Z]{2}$", c): + raise ValidationError(f"Invalid country code '{c}'. 
Must be 2 uppercase letters.") + for s in secondary_prompts: + if s is not None and not isinstance(s, str): + raise ValidationError("Secondary prompts must be strings.") + for w in web_searches: + if not isinstance(w, bool): + raise ValidationError("Web search flags must be boolean.") + if timeout is not None and (not isinstance(timeout, int) or timeout <= 0): + raise ValidationError("Timeout must be a positive integer.") + + timeout = timeout or (65 if sync else 30) + + # retry loop (API-level transient failures) + max_retries = 3 + for attempt in range(max_retries): + try: + # Delegate to the existing ChatGPT API client + return self._c.chatgpt_api.scrape_chatgpt( + prompts=prompts, + countries=countries, + additional_prompts=secondary_prompts, + web_searches=web_searches, + sync=sync, + timeout=timeout, + ) + except APIError as e: + if attempt < max_retries - 1: + time.sleep(2) + continue + raise e + + # Web (SERP) + def web( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None, + parse: bool = False, + ): + + zone = zone or self._c.serp_zone + max_workers = max_workers or self._c.DEFAULT_MAX_WORKERS + # Basic validation borrowed from client.search() + if not query: + raise ValueError("The 'query' parameter cannot be None or empty.") + if isinstance(query, str): + if not query.strip(): + raise ValueError("The 'query' string cannot be empty or whitespace.") + elif isinstance(query, list): + if not all(isinstance(q, str) and q.strip() for q in query): + raise ValueError("All queries in the list must be non-empty strings.") + else: + raise TypeError("The 'query' parameter must be a string or a list of strings.") + + return self._c.search_api.search( + query, search_engine, zone, response_format, method, country, + data_format, async_request, max_workers, timeout, parse + ) + + # LinkedIn + @property + def linkedin(self): + """ + Namespaced LinkedIn search helpers. + + Example: + client.search.linkedin.posts(...) + client.search.linkedin.jobs(...) + client.search.linkedin.profiles(...) 
+ """ + + return self._c.search_linkedin From b7f7676457e754ad158eb4a788782f0cd83a2353 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 16:48:37 +0200 Subject: [PATCH 62/70] Update client.py --- src/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/client.py b/src/client.py index fcd581f..f889ad8 100644 --- a/src/client.py +++ b/src/client.py @@ -3,6 +3,7 @@ import time import json import requests +from .search import Search from datetime import datetime from .api.crawl import CrawlAPI from .api.chatgpt import ChatGPTAPI From d691094e343ba4928013dacfd20ef2cf3bacf56b Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 16:49:47 +0200 Subject: [PATCH 63/70] Update client.py --- src/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client.py b/src/client.py index f889ad8..109f2a8 100644 --- a/src/client.py +++ b/src/client.py @@ -114,7 +114,7 @@ def __init__( self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') - + self.search = Search(self) valid_browser_types = ["playwright", "puppeteer", "selenium"] if browser_type not in valid_browser_types: From a3b8830b56997d1ed971849ba74588f80f579982 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 17:33:10 +0200 Subject: [PATCH 64/70] Update pyproject.toml --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0991d9f..d1c2c48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,9 @@ Repository = "https://github.com/brightdata/bright-data-sdk-python" "Bug Reports" = "https://github.com/brightdata/bright-data-sdk-python/issues" Changelog = "https://github.com/brightdata/bright-data-sdk-python/blob/main/CHANGELOG.md" +[tool.setuptools] +package-dir = {"" = "src"} + [tool.setuptools.packages.find] include = ["brightdata*"] exclude = ["tests*"] @@ -134,4 +137,4 @@ filterwarnings = [ "error", "ignore::UserWarning", "ignore::DeprecationWarning", -] \ No newline at end of file +] From f0fbefdf031930ab6255d1641e2e0d2f21bf1890 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 17:35:14 +0200 Subject: [PATCH 65/70] Update setup.py --- setup.py | 123 ++++++++++++++++++++++--------------------------------- 1 file changed, 48 insertions(+), 75 deletions(-) diff --git a/setup.py b/setup.py index 319fb0b..bd4ca8a 100644 --- a/setup.py +++ b/setup.py @@ -1,86 +1,59 @@ -# """ -# Setup script for Bright Data SDK - -# This file provides backward compatibility for tools that don't support pyproject.toml. -# The main configuration is in pyproject.toml following modern Python packaging standards. 
-# """ - -# from setuptools import setup, find_packages -# import os - -# # Read the README file -# def read_readme(): -# with open("README.md", "r", encoding="utf-8") as fh: -# return fh.read() - -# # Read version from __init__.py -# def read_version(): -# with open(os.path.join("brightdata", "__init__.py"), "r", encoding="utf-8") as fh: -# for line in fh: -# if line.startswith("__version__"): -# return line.split('"')[1] -# return "1.0.0" - -# setup( -# name="brightdata-sdk", -# version=read_version(), -# author="Bright Data", -# author_email="support@brightdata.com", -# description="Python SDK for Bright Data Web Scraping and SERP APIs", -# long_description=read_readme(), -# long_description_content_type="text/markdown", -# url="https://github.com/brightdata/brightdata-sdk-python", -# packages=find_packages(), -# classifiers=[ -# "Development Status :: 4 - Beta", -# "Intended Audience :: Developers", -# "License :: OSI Approved :: MIT License", -# "Operating System :: OS Independent", -# "Programming Language :: Python :: 3", -# "Programming Language :: Python :: 3.7", -# "Programming Language :: Python :: 3.8", -# "Programming Language :: Python :: 3.9", -# "Programming Language :: Python :: 3.10", -# "Programming Language :: Python :: 3.11", -# "Programming Language :: Python :: 3.12", -# "Topic :: Internet :: WWW/HTTP", -# "Topic :: Software Development :: Libraries :: Python Modules", -# "Topic :: Internet :: WWW/HTTP :: Indexing/Search", -# ], -# python_requires=">=3.7", -# install_requires=[ -# "requests>=2.25.0", -# "python-dotenv>=0.19.0", -# ], -# extras_require={ -# "dev": [ -# "pytest>=6.0.0", -# "pytest-cov>=2.10.0", -# "black>=21.0.0", -# "isort>=5.0.0", -# "flake8>=3.8.0", -# ], -# }, -# keywords="brightdata, web scraping, proxy, serp, api, data extraction", -# project_urls={ -# "Bug Reports": "https://github.com/brightdata/brightdata-sdk-python/issues", -# "Documentation": "https://github.com/brightdata/brightdata-sdk-python#readme", -# "Source": "https://github.com/brightdata/brightdata-sdk-python", -# }, -# ) - from setuptools import setup, find_packages setup( - name="brightdata", - version="0.1.0", - package_dir={"": "src"}, + name="brightdata-sdk", + version="1.1.3", + description="Python SDK for Bright Data Web Scraping and SERP APIs", + author="Bright Data", + author_email="support@brightdata.com", + maintainer="Bright Data", + maintainer_email="idanv@brightdata.com", + license="MIT", packages=find_packages(where="src"), + package_dir={"": "src"}, + include_package_data=True, + python_requires=">=3.8", install_requires=[ "requests>=2.25.0", "python-dotenv>=0.19.0", "aiohttp>=3.8.0", "beautifulsoup4>=4.9.0", - "openai>=1.0.0" + "openai>=1.0.0", + ], + extras_require={ + "dev": [ + "pytest>=6.0.0", + "pytest-cov>=2.10.0", + "black>=21.0.0", + "isort>=5.0.0", + "flake8>=3.8.0", + "mypy>=0.900", + ], + "test": [ + "pytest>=6.0.0", + "pytest-cov>=2.10.0", + ], + }, + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", ], + project_urls={ + 
"Homepage": "https://github.com/brightdata/bright-data-sdk-python", + "Documentation": "https://github.com/brightdata/bright-data-sdk-python#readme", + "Repository": "https://github.com/brightdata/bright-data-sdk-python", + "Bug Reports": "https://github.com/brightdata/bright-data-sdk-python/issues", + "Changelog": "https://github.com/brightdata/bright-data-sdk-python/blob/main/CHANGELOG.md", + }, ) From 4114c221592817af9cbeec349c80828c08e68afa Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 18:19:36 +0200 Subject: [PATCH 66/70] Update search.py --- src/search.py | 240 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 199 insertions(+), 41 deletions(-) diff --git a/src/search.py b/src/search.py index 92aa93f..d56bbfd 100644 --- a/src/search.py +++ b/src/search.py @@ -1,55 +1,179 @@ -from typing import Union, List, Dict, Any import re import time - +from __future__ import annotations from .exceptions import ValidationError, APIError +from typing import Any, Dict, List, Optional, Union + + +class SearchGPTResult: + """ + Wrapper for GPT search results. + + Args + + raw : Any - The raw API response object (dict / list / text). + text : str (property) - Best-effort extraction of the final answer text (T1). + prompt : Optional[str] - The original prompt (for single result). + country : Optional[str] - Country code used for the request (single result). + usage : Optional[Dict[str, Any]] - Token/usage metadata if available. + snapshot_id : Optional[str] - Present when sync=False (async job queued). + """ + + def __init__( + self, + raw: Any, + prompt: Optional[str] = None, + country: Optional[str] = None, + usage: Optional[Dict[str, Any]] = None, + snapshot_id: Optional[str] = None, + ) -> None: + self.raw = raw + self.prompt = prompt + self.country = country + self.usage = usage + self.snapshot_id = snapshot_id + + # helpers + @staticmethod + def _coalesce(*vals) -> Optional[str]: + for v in vals: + if isinstance(v, str) and v.strip(): + return v + return None + + @staticmethod + def _dig(d: Any, *keys) -> Any: + cur = d + for k in keys: + if isinstance(cur, dict) and k in cur: + cur = cur[k] + else: + return None + return cur + + @property + def text(self) -> Optional[str]: + """ + Best-effort extraction of ONLY the final answer text. + Tries common fields/paths seen in ChatGPT-like payloads. + Returns None if not found. 
+ """ + raw = self.raw + + # If API returned a plain string + if isinstance(raw, str): + return raw.strip() or None + + t = self._dig(raw, "answer") + if isinstance(t, str): + return t.strip() or None + + t = self._dig(raw, "data", "answer") + if isinstance(t, str): + return t.strip() or None + + t = self._dig(raw, "message", "content") + if isinstance(t, str): + return t.strip() or None + + choices = self._dig(raw, "choices") + if isinstance(choices, list) and choices: + content = self._dig(choices[0], "message", "content") + if isinstance(content, str): + return content.strip() or None + + content = choices[0].get("text") if isinstance(choices[0], dict) else None + if isinstance(content, str): + return content.strip() or None + + t = self._dig(raw, "result") + if isinstance(t, str): + return t.strip() or None + t = self._dig(raw, "output") + if isinstance(t, str): + return t.strip() or None + + for key in ("content", "text", "final", "final_text"): + v = self._dig(raw, key) + if isinstance(v, str): + return v.strip() or None + + return None + + def to_dict(self) -> Dict[str, Any]: + return { + "prompt": self.prompt, + "country": self.country, + "usage": self.usage, + "snapshot_id": self.snapshot_id, + "raw": self.raw, + "text": self.text, + } class Search: + """ + Namespaced search interface. + """ def __init__(self, client) -> None: - # Hold a reference to the root client for shared config / APIs. - self._c = client + self._c = client # root client (reuses session, APIs, zones) + + def __call__( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None, + parse: bool = False, + ): + return self.web( + query=query, + search_engine=search_engine, + zone=zone, + response_format=response_format, + method=method, + country=country, + data_format=data_format, + async_request=async_request, + max_workers=max_workers, + timeout=timeout, + parse=parse, + ) - # ---------- GPT ---------- + # GPT def gpt( self, prompt: Union[str, List[str]], country: Union[str, List[str]] = None, - secondaryPrompt: Union[str, List[str]] = None, - webSearch: Union[bool, List[bool]] = False, + secondary_prompt: Union[str, List[str]] = None, + web_search: Union[bool, List[bool]] = False, sync: bool = True, timeout: int = None, - ) -> Dict[str, Any]: + ) -> Union[SearchGPTResult, List[SearchGPTResult]]: """ Query ChatGPT via Bright Data's dataset API. - Parameters: - - - prompt : str | list[str] - Prompt(s) to send. - - country : str | list[str], optional - 2-letter uppercase ISO code per prompt (e.g., "US"). May be None. - - secondaryPrompt : str | list[str], optional - Follow-up prompt(s) per item. - - webSearch : bool | list[bool], default False - Enable ChatGPT web search (per prompt if list). - - sync : bool, default True - return results immediately. False: return a snapshot_id to poll later. - - timeout : int, optional - Default 65s (sync) / 30s (async). - - Returns: dict | list - + Returns - Single object for single prompt, list for multiple prompts (M2). 
""" - - # normalize prompts + prompts: List[str] if isinstance(prompt, str): prompts = [prompt] elif isinstance(prompt, list) and all(isinstance(p, str) for p in prompt): prompts = prompt else: raise ValidationError("Invalid prompt input: must be a non-empty string or list of strings.") - if not prompts: raise ValidationError("At least one prompt is required.") - # helper for normalization - def _normalize(param, name): + # normalization helper + def _norm(param, name): if param is None: return [None] * len(prompts) if isinstance(param, list): @@ -58,9 +182,9 @@ def _normalize(param, name): return param return [param] * len(prompts) - countries = _normalize(country, "country") - secondary_prompts = _normalize(secondaryPrompt, "secondary_prompt") - web_searches = _normalize(webSearch, "web_search") + countries = _norm(country, "country") + secondary_prompts = _norm(secondary_prompt, "secondary_prompt") + web_searches = _norm(web_search, "web_search") # validation for c in countries: @@ -77,12 +201,12 @@ def _normalize(param, name): timeout = timeout or (65 if sync else 30) - # retry loop (API-level transient failures) + # retries around API call max_retries = 3 + last_err = None for attempt in range(max_retries): try: - # Delegate to the existing ChatGPT API client - return self._c.chatgpt_api.scrape_chatgpt( + result = self._c.chatgpt_api.scrape_chatgpt( prompts=prompts, countries=countries, additional_prompts=secondary_prompts, @@ -90,11 +214,49 @@ def _normalize(param, name): sync=sync, timeout=timeout, ) + # Wrap result(s) + if not sync: + # Async: expect {"snapshot_id": "...", ...} + snapshot_id = result.get("snapshot_id") if isinstance(result, dict) else None + return SearchGPTResult(raw=result, snapshot_id=snapshot_id) + + if isinstance(result, list): + out: List[SearchGPTResult] = [] + if len(result) == len(prompts): + for i, item in enumerate(result): + out.append( + SearchGPTResult( + raw=item, + prompt=prompts[i], + country=countries[i], + usage=None, + ) + ) + else: + for item in result: + out.append(SearchGPTResult(raw=item)) + return out[0] if len(prompts) == 1 and len(out) == 1 else out + + return SearchGPTResult(raw=result, prompt=prompts[0] if len(prompts) == 1 else None) + except APIError as e: + last_err = e if attempt < max_retries - 1: time.sleep(2) continue raise e + except Exception as e: + if isinstance(e, (ValidationError, APIError)): + raise + last_err = e + if attempt < max_retries - 1: + time.sleep(2) + continue + raise APIError(f"Unexpected error in search.gpt: {e}") from e + + if last_err: + raise last_err + raise APIError("Unknown error in search.gpt") # Web (SERP) def web( @@ -111,10 +273,9 @@ def web( timeout: int = None, parse: bool = False, ): - - zone = zone or self._c.serp_zone - max_workers = max_workers or self._c.DEFAULT_MAX_WORKERS - # Basic validation borrowed from client.search() + """ + Web/SERP search wrapper. Thin pass-through to SearchAPI with validation. + """ if not query: raise ValueError("The 'query' parameter cannot be None or empty.") if isinstance(query, str): @@ -126,21 +287,18 @@ def web( else: raise TypeError("The 'query' parameter must be a string or a list of strings.") + zone = zone or self._c.serp_zone + max_workers = max_workers or self._c.DEFAULT_MAX_WORKERS + return self._c.search_api.search( query, search_engine, zone, response_format, method, country, data_format, async_request, max_workers, timeout, parse ) - # LinkedIn + # LinkedIn @property def linkedin(self): """ Namespaced LinkedIn search helpers. 
- - Example: - client.search.linkedin.posts(...) - client.search.linkedin.jobs(...) - client.search.linkedin.profiles(...) """ - return self._c.search_linkedin From b868ca39f32360c2dfb4677daf5896bf2e25862e Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Sun, 2 Nov 2025 18:25:17 +0200 Subject: [PATCH 67/70] Update client.py --- src/client.py | 276 ++++++++++++++++++++++++++------------------------ 1 file changed, 141 insertions(+), 135 deletions(-) diff --git a/src/client.py b/src/client.py index 109f2a8..c5ed140 100644 --- a/src/client.py +++ b/src/client.py @@ -56,144 +56,150 @@ def __init__( structured_logging: bool = True, verbose: bool = None ): - """ - Initialize the Bright Data client with your API token - - Create an account at https://brightdata.com/ to get your API token. - Go to settings > API keys , and verify that your API key have "Admin" permissions. - - Args: - api_token: Your Bright Data API token (can also be set via BRIGHTDATA_API_TOKEN env var) - auto_create_zones: Automatically create required zones if they don't exist (default: True) - web_unlocker_zone: Custom zone name for web unlocker (default: from env or 'sdk_unlocker') - serp_zone: Custom zone name for SERP API (default: from env or 'sdk_serp') - browser_zone: Custom zone name for Browser API (default: from env or 'sdk_browser') - browser_username: Username for Browser API in format "username-zone-{zone_name}" (can also be set via BRIGHTDATA_BROWSER_USERNAME env var) - browser_password: Password for Browser API authentication (can also be set via BRIGHTDATA_BROWSER_PASSWORD env var) - browser_type: Browser automation tool type - "playwright", "puppeteer", or "selenium" (default: "playwright") - log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) - structured_logging: Whether to use structured JSON logging (default: True) - verbose: Enable verbose logging (default: False). Can also be set via BRIGHTDATA_VERBOSE env var. - When False, only shows WARNING and above. When True, shows all logs per log_level. - """ - - try: - from dotenv import load_dotenv - load_dotenv() - except ImportError: - pass - - if verbose is None: - env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower() - verbose = env_verbose in ('true', '1', 'yes', 'on') - - setup_logging(log_level, structured_logging, verbose) - logger.info("Initializing Bright Data SDK client") - - self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN') - if not self.api_token: - logger.error("API token not provided") - raise ValidationError("API token is required. 
Provide it as parameter or set BRIGHTDATA_API_TOKEN environment variable") - - if not isinstance(self.api_token, str): - logger.error("API token must be a string") - raise ValidationError("API token must be a string") - - if len(self.api_token.strip()) < 10: - logger.error("API token appears to be invalid (too short)") - raise ValidationError("API token appears to be invalid") - - token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" if len(self.api_token) > 8 else "***" - logger.info(f"API token validated successfully: {token_preview}") - - self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker') - self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp') - self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser') - self.auto_create_zones = auto_create_zones - - self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') - self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') - - self.search = Search(self) - - valid_browser_types = ["playwright", "puppeteer", "selenium"] - if browser_type not in valid_browser_types: - raise ValidationError(f"Invalid browser_type '{browser_type}'. Must be one of: {valid_browser_types}") - self.browser_type = browser_type - - if self.browser_username and self.browser_password: - browser_preview = f"{self.browser_username[:3]}***" - logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})") - elif self.browser_username or self.browser_password: - logger.warning("Incomplete browser credentials: both username and password are required for browser API") - else: - logger.debug("No browser credentials provided - browser API will not be available") - - self.session = requests.Session() - - auth_header = f'Bearer {self.api_token}' - self.session.headers.update({ - 'Authorization': auth_header, - 'Content-Type': 'application/json', - 'User-Agent': f'brightdata-sdk/{__version__}' - }) - - logger.info("HTTP session configured with secure headers") - - adapter = requests.adapters.HTTPAdapter( - pool_connections=self.CONNECTION_POOL_SIZE, - pool_maxsize=self.CONNECTION_POOL_SIZE, - max_retries=0 - ) - self.session.mount('https://', adapter) - self.session.mount('http://', adapter) - - self.zone_manager = ZoneManager(self.session) - self.web_scraper = WebScraper( - self.session, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - self.search_api = SearchAPI( - self.session, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - self.chatgpt_api = ChatGPTAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - self.linkedin_api = LinkedInAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR + """ + Initialize the Bright Data client with your API token. + + Create an account at https://brightdata.com/ to get your API token. + Go to Settings > API Keys and verify that your key has "Admin" permissions. 
+ + Args: + api_token: Your Bright Data API token (or set BRIGHTDATA_API_TOKEN env var) + auto_create_zones: Auto-create required zones if missing (default: True) + web_unlocker_zone: Custom Web Unlocker zone name (default: 'sdk_unlocker') + serp_zone: Custom SERP zone name (default: 'sdk_serp') + browser_zone: Custom Browser zone name (default: 'sdk_browser') + browser_username: Browser API username ("username-zone-{zone_name}") + browser_password: Browser API password + browser_type: "playwright", "puppeteer", or "selenium" (default: "playwright") + log_level: Logging level + structured_logging: Enable structured JSON logging + verbose: When True, show all logs per log_level. Can also use BRIGHTDATA_VERBOSE env var. + """ + + try: + from dotenv import load_dotenv + load_dotenv() + except ImportError: + pass + + if verbose is None: + env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower() + verbose = env_verbose in ('true', '1', 'yes', 'on') + + setup_logging(log_level, structured_logging, verbose) + logger.info("Initializing Bright Data SDK client") + + # API Token Validation + self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN') + if not self.api_token: + logger.error("API token not provided") + raise ValidationError( + "API token is required. Pass api_token or set BRIGHTDATA_API_TOKEN env var." ) - self.download_api = DownloadAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT + + if not isinstance(self.api_token, str): + logger.error("API token must be a string") + raise ValidationError("API token must be a string") + + if len(self.api_token.strip()) < 10: + logger.error("API token appears to be invalid (too short)") + raise ValidationError("API token appears to be invalid") + + token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" + logger.info(f"API token validated successfully: {token_preview}") + + self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker') + self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp') + self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser') + self.auto_create_zones = auto_create_zones + + self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') + self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') + + valid_browser_types = ["playwright", "puppeteer", "selenium"] + if browser_type not in valid_browser_types: + raise ValidationError( + f"Invalid browser_type '{browser_type}'. 
Must be one of: {valid_browser_types}" ) - self.crawl_api = CrawlAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR + self.browser_type = browser_type + + if self.browser_username and self.browser_password: + browser_preview = f"{self.browser_username[:3]}***" + logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})") + elif self.browser_username or self.browser_password: + logger.warning("Incomplete browser credentials: both username and password are required.") + else: + logger.debug("No browser credentials provided - browser API will not be available.") + + self.session = requests.Session() + self.session.headers.update({ + 'Authorization': f'Bearer {self.api_token}', + 'Content-Type': 'application/json', + 'User-Agent': f'brightdata-sdk/{__version__}' + }) + logger.info("HTTP session configured with secure headers") + + adapter = requests.adapters.HTTPAdapter( + pool_connections=self.CONNECTION_POOL_SIZE, + pool_maxsize=self.CONNECTION_POOL_SIZE, + max_retries=0 + ) + self.session.mount('https://', adapter) + self.session.mount('http://', adapter) + + self.zone_manager = ZoneManager(self.session) + + self.web_scraper = WebScraper( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.search_api = SearchAPI( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.chatgpt_api = ChatGPTAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.linkedin_api = LinkedInAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.download_api = DownloadAPI(self.session, self.api_token, self.DEFAULT_TIMEOUT) + + self.crawl_api = CrawlAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.extract_api = ExtractAPI(self) + + from .search import Search + self.search = Search(self) + + if self.auto_create_zones: + self.zone_manager.ensure_required_zones( + self.web_unlocker_zone, + self.serp_zone ) - self.extract_api = ExtractAPI(self) - - if self.auto_create_zones: - self.zone_manager.ensure_required_zones( - self.web_unlocker_zone, - self.serp_zone - ) + def scrape( self, From fc79dedbf398e67a03fb77b8102a1201b671c461 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:27:36 +0200 Subject: [PATCH 68/70] Update client.py --- src/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/client.py b/src/client.py index c5ed140..8814f12 100644 --- a/src/client.py +++ b/src/client.py @@ -191,7 +191,6 @@ def __init__( self.extract_api = ExtractAPI(self) - from .search import Search self.search = Search(self) if self.auto_create_zones: From 4857448c6a1ef2631fd238361290c159ccf64fe5 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:36:06 +0200 Subject: [PATCH 69/70] Update client.py --- src/client.py | 278 +++++++++++++++++++++++++------------------------- 1 file changed, 139 insertions(+), 139 deletions(-) diff --git a/src/client.py b/src/client.py index 8814f12..1d4f794 100644 --- a/src/client.py +++ b/src/client.py @@ -55,149 +55,149 @@ def __init__( log_level: str = "INFO", structured_logging: bool = True, verbose: bool = None - ): - """ - Initialize the Bright Data client with your 
API token. - - Create an account at https://brightdata.com/ to get your API token. - Go to Settings > API Keys and verify that your key has "Admin" permissions. - - Args: - api_token: Your Bright Data API token (or set BRIGHTDATA_API_TOKEN env var) - auto_create_zones: Auto-create required zones if missing (default: True) - web_unlocker_zone: Custom Web Unlocker zone name (default: 'sdk_unlocker') - serp_zone: Custom SERP zone name (default: 'sdk_serp') - browser_zone: Custom Browser zone name (default: 'sdk_browser') - browser_username: Browser API username ("username-zone-{zone_name}") - browser_password: Browser API password - browser_type: "playwright", "puppeteer", or "selenium" (default: "playwright") - log_level: Logging level - structured_logging: Enable structured JSON logging - verbose: When True, show all logs per log_level. Can also use BRIGHTDATA_VERBOSE env var. - """ - - try: - from dotenv import load_dotenv - load_dotenv() - except ImportError: - pass - - if verbose is None: - env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower() - verbose = env_verbose in ('true', '1', 'yes', 'on') - - setup_logging(log_level, structured_logging, verbose) - logger.info("Initializing Bright Data SDK client") + ): + """ + Initialize the Bright Data client with your API token. + + Create an account at https://brightdata.com/ to get your API token. + Go to Settings > API Keys and verify that your key has "Admin" permissions. + + Args: + api_token: Your Bright Data API token (or set BRIGHTDATA_API_TOKEN env var) + auto_create_zones: Auto-create required zones if missing (default: True) + web_unlocker_zone: Custom Web Unlocker zone name (default: 'sdk_unlocker') + serp_zone: Custom SERP zone name (default: 'sdk_serp') + browser_zone: Custom Browser zone name (default: 'sdk_browser') + browser_username: Browser API username ("username-zone-{zone_name}") + browser_password: Browser API password + browser_type: "playwright", "puppeteer", or "selenium" (default: "playwright") + log_level: Logging level + structured_logging: Enable structured JSON logging + verbose: When True, show all logs per log_level. Can also use BRIGHTDATA_VERBOSE env var. + """ - # API Token Validation - self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN') - if not self.api_token: - logger.error("API token not provided") - raise ValidationError( - "API token is required. Pass api_token or set BRIGHTDATA_API_TOKEN env var." + try: + from dotenv import load_dotenv + load_dotenv() + except ImportError: + pass + + if verbose is None: + env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower() + verbose = env_verbose in ('true', '1', 'yes', 'on') + + setup_logging(log_level, structured_logging, verbose) + logger.info("Initializing Bright Data SDK client") + + # API Token Validation + self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN') + if not self.api_token: + logger.error("API token not provided") + raise ValidationError( + "API token is required. Pass api_token or set BRIGHTDATA_API_TOKEN env var." 
+ ) + + if not isinstance(self.api_token, str): + logger.error("API token must be a string") + raise ValidationError("API token must be a string") + + if len(self.api_token.strip()) < 10: + logger.error("API token appears to be invalid (too short)") + raise ValidationError("API token appears to be invalid") + + token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" + logger.info(f"API token validated successfully: {token_preview}") + + self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker') + self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp') + self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser') + self.auto_create_zones = auto_create_zones + + self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') + self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') + + valid_browser_types = ["playwright", "puppeteer", "selenium"] + if browser_type not in valid_browser_types: + raise ValidationError( + f"Invalid browser_type '{browser_type}'. Must be one of: {valid_browser_types}" + ) + self.browser_type = browser_type + + if self.browser_username and self.browser_password: + browser_preview = f"{self.browser_username[:3]}***" + logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})") + elif self.browser_username or self.browser_password: + logger.warning("Incomplete browser credentials: both username and password are required.") + else: + logger.debug("No browser credentials provided - browser API will not be available.") + + self.session = requests.Session() + self.session.headers.update({ + 'Authorization': f'Bearer {self.api_token}', + 'Content-Type': 'application/json', + 'User-Agent': f'brightdata-sdk/{__version__}' + }) + logger.info("HTTP session configured with secure headers") + + adapter = requests.adapters.HTTPAdapter( + pool_connections=self.CONNECTION_POOL_SIZE, + pool_maxsize=self.CONNECTION_POOL_SIZE, + max_retries=0 ) - - if not isinstance(self.api_token, str): - logger.error("API token must be a string") - raise ValidationError("API token must be a string") - - if len(self.api_token.strip()) < 10: - logger.error("API token appears to be invalid (too short)") - raise ValidationError("API token appears to be invalid") - - token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" - logger.info(f"API token validated successfully: {token_preview}") - - self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker') - self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp') - self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser') - self.auto_create_zones = auto_create_zones - - self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') - self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') - - valid_browser_types = ["playwright", "puppeteer", "selenium"] - if browser_type not in valid_browser_types: - raise ValidationError( - f"Invalid browser_type '{browser_type}'. 
Must be one of: {valid_browser_types}" + self.session.mount('https://', adapter) + self.session.mount('http://', adapter) + + self.zone_manager = ZoneManager(self.session) + + self.web_scraper = WebScraper( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR ) - self.browser_type = browser_type - - if self.browser_username and self.browser_password: - browser_preview = f"{self.browser_username[:3]}***" - logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})") - elif self.browser_username or self.browser_password: - logger.warning("Incomplete browser credentials: both username and password are required.") - else: - logger.debug("No browser credentials provided - browser API will not be available.") - - self.session = requests.Session() - self.session.headers.update({ - 'Authorization': f'Bearer {self.api_token}', - 'Content-Type': 'application/json', - 'User-Agent': f'brightdata-sdk/{__version__}' - }) - logger.info("HTTP session configured with secure headers") - - adapter = requests.adapters.HTTPAdapter( - pool_connections=self.CONNECTION_POOL_SIZE, - pool_maxsize=self.CONNECTION_POOL_SIZE, - max_retries=0 - ) - self.session.mount('https://', adapter) - self.session.mount('http://', adapter) - - self.zone_manager = ZoneManager(self.session) - - self.web_scraper = WebScraper( - self.session, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - - self.search_api = SearchAPI( - self.session, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - - self.chatgpt_api = ChatGPTAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - - self.linkedin_api = LinkedInAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - - self.download_api = DownloadAPI(self.session, self.api_token, self.DEFAULT_TIMEOUT) - - self.crawl_api = CrawlAPI( - self.session, - self.api_token, - self.DEFAULT_TIMEOUT, - self.MAX_RETRIES, - self.RETRY_BACKOFF_FACTOR - ) - - self.extract_api = ExtractAPI(self) - - self.search = Search(self) - - if self.auto_create_zones: - self.zone_manager.ensure_required_zones( - self.web_unlocker_zone, - self.serp_zone + + self.search_api = SearchAPI( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.chatgpt_api = ChatGPTAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.linkedin_api = LinkedInAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR ) + + self.download_api = DownloadAPI(self.session, self.api_token, self.DEFAULT_TIMEOUT) + + self.crawl_api = CrawlAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.extract_api = ExtractAPI(self) + + self.search = Search(self) + + if self.auto_create_zones: + self.zone_manager.ensure_required_zones( + self.web_unlocker_zone, + self.serp_zone + ) def scrape( From db8c569e9babd2ada94933b4acdb03c349102f29 Mon Sep 17 00:00:00 2001 From: Nadav Toledo <136907680+NadavToledo1@users.noreply.github.com> Date: Mon, 3 Nov 2025 09:38:05 +0200 Subject: [PATCH 70/70] Update search.py --- src/search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/search.py b/src/search.py index d56bbfd..215e65d 100644 --- a/src/search.py +++ b/src/search.py @@ 
-1,6 +1,7 @@ +from __future__ import annotations + import re import time -from __future__ import annotations from .exceptions import ValidationError, APIError from typing import Any, Dict, List, Optional, Union
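
The patches above wire a namespaced `client.search` interface into `bdclient`: patches 62-63 and 67 attach `Search(self)` during client initialization, patch 66 adds `Search.web`, `Search.gpt`, and the `SearchGPTResult` wrapper, and patch 70 moves `from __future__ import annotations` to the top of `src/search.py`, where Python requires `__future__` imports to appear. The sketch below shows how the new interface is meant to be called, based on the signatures in these diffs; it assumes the package installs under the `brightdata` name with `bdclient` exported, that `BRIGHTDATA_API_TOKEN` is set, and that the default zones exist or are auto-created. Treat the query strings and parameter values as illustrative only.

```python
from brightdata import bdclient

# Reads BRIGHTDATA_API_TOKEN from the environment; api_token= can also be passed explicitly.
client = bdclient()

# SERP search: client.search(...) and client.search.web(...) are equivalent,
# because Search.__call__ delegates to Search.web.
serp = client.search("bright data sdk", search_engine="google", country="us")

# ChatGPT search: a single prompt returns one SearchGPTResult; a list of prompts
# returns a list. Country codes are validated as two uppercase letters.
answer = client.search.gpt("What is the tallest building in the world?", country="US")
print(answer.text)       # best-effort extraction of the final answer text
print(answer.to_dict())  # prompt, country, usage, snapshot_id, raw, text
```

`Search.web` falls back to the client's `serp_zone` and `DEFAULT_MAX_WORKERS` when `zone` and `max_workers` are not supplied, so the namespaced call keeps the same defaults as the zones configured on `bdclient`.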
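
For asynchronous ChatGPT jobs, `Search.gpt` with `sync=False` wraps the queued job's `snapshot_id` instead of an answer. Continuing from the sketch above, and assuming the snapshot is later retrieved through the client's download API (the exact polling call is not part of these patches):

```python
# Queue the job without waiting for the answer; the default timeout drops to 30s in async mode.
job = client.search.gpt("Summarize the latest developments in web scraping", sync=False)

if job.snapshot_id:
    # Poll or download this snapshot later via the client's download API.
    print(f"Snapshot queued: {job.snapshot_id}")
else:
    # The wrapper keeps the raw payload for inspection when no snapshot_id is present.
    print("Unexpected async response:", job.raw)
```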