diff --git a/README.md b/README.md
index 04fa2cc..4252525 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,11 @@
 sdk-banner(1)
+<<<<<<< HEAD

Python SDK by Bright Data, Easy-to-use scalable methods for web search & scraping

+=======
+

Python SDK by Bright Data, Easy to use scalable methods for web search & scraping

+>>>>>>> mywork/main

## Installation

@@ -10,11 +14,19 @@ To install the package, open your terminal:
```python
pip install brightdata-sdk
```
+<<<<<<< HEAD
> If using macOS, first open a virtual environment for your project

## Quick Start

Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key
+=======
+> If using macOS, first open a virtual environment for your project.
+
+## Quick Start
+
+Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key.
+>>>>>>> mywork/main

### Initialize the Client

@@ -25,7 +37,11 @@ client = bdclient(api_token="your_api_token_here") # can also be defined as BRIG
```

### Launch first request
+<<<<<<< HEAD
Add a SERP function to your code
+=======
+Add a SERP function to your code.
+>>>>>>> mywork/main

```python
results = client.search("best selling shoes")

@@ -38,10 +54,17 @@ print(client.parse_content(results))
| Feature | Functions | Description
|--------------------------|-----------------------------|-------------------------------------
+<<<<<<< HEAD
| **Scrape every website** | `scrape` | Scrape every website using Bright Data's scraping and anti-bot detection capabilities
| **Web search** | `search` | Search Google and other search engines by query (supports batch searches)
| **Web crawling** | `crawl` | Discover and scrape multiple pages from websites with advanced filtering and depth control
| **AI-powered extraction** | `extract` | Extract specific information from websites using natural language queries and OpenAI
+=======
+| **Scrape any website** | `scrape` | Scrape any website using Bright Data's scraping and anti-bot detection capabilities
+| **Web search (SERP)** | `search` | Search Google and other search engines by query (supports batch searches)
+| **Web crawling** | `crawl` | Discover and scrape multiple pages from websites with advanced filtering and depth control
+| **AI extraction** | `extract` | Extract specific information from websites using natural language queries and OpenAI
+>>>>>>> mywork/main
| **Content parsing** | `parse_content` | Extract text, links, images and structured data from API responses (JSON or HTML)
| **Browser automation** | `connect_browser` | Get WebSocket endpoint for Playwright/Selenium integration with Bright Data's scraping browser
| **Search chatGPT** | `search_chatGPT` | Prompt chatGPT and scrape its answers; supports multiple inputs and follow-up prompts
@@ -51,11 +74,19 @@ print(client.parse_content(results))
| **Client class** | `bdclient` | Handles authentication, automatic zone creation and management, and options for robust error handling
| **Parallel processing** | **all functions** | All functions use concurrent processing for multiple URLs or queries, and support multiple output formats
+<<<<<<< HEAD
### Try using one of the functions

#### `Search()`
```python
# Simple single query search
+=======
+### Try using one of the functions
+
+#### `Search()`
+```python
+# Single query search
+>>>>>>> mywork/main
result = client.search("pizza restaurants")

# Try using multiple queries (parallel processing), with custom configuration
results = client.search(
@@ -69,7 +100,11 @@
```
#### `scrape()`
```python
+<<<<<<< HEAD
# Simple single URL scrape
+=======
+# Single URL scrape
+>>>>>>> mywork/main
result = client.scrape("https://example.com")

# Multiple URLs (parallel processing) with custom options
results = client.scrape(
@@ -83,6 +118,7 @@
```
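Either result can be post-processed with the `parse_content` and `download_content` helpers listed in the feature table above; a minimal sketch, assuming the `results` object returned by the scrape example and the default JSON output:

```python
# Minimal follow-up sketch: post-process the `results` returned by
# client.scrape() or client.search() above.
parsed = client.parse_content(results)          # text, links and images from the raw response
saved_path = client.download_content(results)   # writes a timestamped JSON file by default
print(saved_path)
```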
#### `search_chatGPT()`
```python
+<<<<<<< HEAD
result = client.search_chatGPT(
    prompt="what day is it today?"
    # prompt=["What are the top 3 programming languages in 2024?", "Best hotels in New York", "Explain quantum computing"],
@@ -90,6 +126,22 @@ result = client.search_chatGPT(
)
client.download_content(result) # In case of timeout error, your snapshot_id is presented and you can download it using download_snapshot()
+=======
+# Sync mode (immediate result)
+result = client.search_gpt(
+    prompt="Top startups in Tel Aviv",
+    country="IL",
+    web_search=True
+)
+print(result)
+
+# Async mode (retrieve snapshot later)
+result = client.search_gpt(
+    prompt="Top startups in 2024",
+    sync=False
+)
+print(result["snapshot_id"])
+>>>>>>> mywork/main
```
#### `search_linkedin.`
@@ -125,7 +177,11 @@ print(results) # will print the snapshot_id, which can be downloaded using the d
result = client.crawl(
    url="https://example.com/",
    depth=2,
+<<<<<<< HEAD
    filter="/product/", # Only crawl URLs containing "/product/"
+=======
+    include_filter="/product/", # Only crawl URLs containing "/product/"
+>>>>>>> mywork/main
    exclude_filter="/ads/", # Exclude URLs containing "/ads/"
    custom_output_fields=["markdown", "url", "page_title"]
)
@@ -202,12 +258,23 @@ client.download_content(data)
```
**`download_snapshot`** (for async requests)
```python
+<<<<<<< HEAD
# Save this function to a separate file
client.download_snapshot("") # Insert your snapshot_id
```

> [!TIP]
> Hover over the "search" or each function in the package, to see all its available parameters.
+=======
+# Save this function to a separate file
+# Download snapshot
+snapshot_id = "your_snapshot_id_here" # <-- Replace with your actual snapshot ID
+client.download_snapshot(snapshot_id) # Insert your snapshot_id
+```
+
+> [!TIP]
+> Hover over the "search" or each function in the package to see all its available parameters.
+>>>>>>> mywork/main

![Hover-Over1](https://github.com/user-attachments/assets/51324485-5769-48d5-8f13-0b534385142e)

@@ -251,7 +318,11 @@ Discover and scrape multiple pages from websites with advanced filtering.
- `url`: Single URL string or list of URLs to crawl (required)
- `ignore_sitemap`: Ignore sitemap when crawling (optional)
- `depth`: Maximum crawl depth relative to entered URL (optional)
+<<<<<<< HEAD
- `filter`: Regex to include only certain URLs (e.g. "/product/")
+=======
+- `include_filter`: Regex to include only certain URLs (e.g. "/product/")
+>>>>>>> mywork/main
- `exclude_filter`: Regex to exclude certain URLs (e.g.
"/ads/") - `custom_output_fields`: List of output fields to include (optional) - `include_errors`: Include errors in response (default: True) diff --git a/pyproject.toml b/pyproject.toml index 0991d9f..d1c2c48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,9 @@ Repository = "https://github.com/brightdata/bright-data-sdk-python" "Bug Reports" = "https://github.com/brightdata/bright-data-sdk-python/issues" Changelog = "https://github.com/brightdata/bright-data-sdk-python/blob/main/CHANGELOG.md" +[tool.setuptools] +package-dir = {"" = "src"} + [tool.setuptools.packages.find] include = ["brightdata*"] exclude = ["tests*"] @@ -134,4 +137,4 @@ filterwarnings = [ "error", "ignore::UserWarning", "ignore::DeprecationWarning", -] \ No newline at end of file +] diff --git a/setup.py b/setup.py index a662168..502f386 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD """ Setup script for Bright Data SDK @@ -51,6 +52,29 @@ def read_version(): install_requires=[ "requests>=2.25.0", "python-dotenv>=0.19.0", +======= +from setuptools import setup, find_packages + +setup( + name="brightdata-sdk", + version="1.1.3", + description="Python SDK for Bright Data Web Scraping and SERP APIs", + author="Bright Data", + author_email="support@brightdata.com", + maintainer="Bright Data", + maintainer_email="idanv@brightdata.com", + license="MIT", + packages=find_packages(where="src"), + package_dir={"": "src"}, + include_package_data=True, + python_requires=">=3.8", + install_requires=[ + "requests>=2.25.0", + "python-dotenv>=0.19.0", + "aiohttp>=3.8.0", + "beautifulsoup4>=4.9.0", + "openai>=1.0.0", +>>>>>>> mywork/main ], extras_require={ "dev": [ @@ -59,6 +83,7 @@ def read_version(): "black>=21.0.0", "isort>=5.0.0", "flake8>=3.8.0", +<<<<<<< HEAD ], }, keywords="brightdata, web scraping, proxy, serp, api, data extraction", @@ -67,4 +92,36 @@ def read_version(): "Documentation": "https://github.com/brightdata/brightdata-sdk-python#readme", "Source": "https://github.com/brightdata/brightdata-sdk-python", }, -) \ No newline at end of file +) +======= + "mypy>=0.900", + ], + "test": [ + "pytest>=6.0.0", + "pytest-cov>=2.10.0", + ], + }, + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Internet :: WWW/HTTP :: Indexing/Search", + ], + project_urls={ + "Homepage": "https://github.com/brightdata/bright-data-sdk-python", + "Documentation": "https://github.com/brightdata/bright-data-sdk-python#readme", + "Repository": "https://github.com/brightdata/bright-data-sdk-python", + "Bug Reports": "https://github.com/brightdata/bright-data-sdk-python/issues", + "Changelog": "https://github.com/brightdata/bright-data-sdk-python/blob/main/CHANGELOG.md", + }, +) +>>>>>>> mywork/main diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..c815a6c --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,82 @@ +""" +## Bright Data SDK for Python + +A comprehensive SDK for Bright Data's Web Scraping and SERP APIs, providing +easy-to-use methods for web scraping, search engine result parsing, and data 
management.
+## Functions:
+First import the package and create a client:
+```python
+from brightdata import bdclient
+client = bdclient("your-api-key")
+```
+Then use the client to call the desired functions:
+#### scrape()
+- Scrapes a website using Bright Data Web Unlocker API with proxy support (or multiple websites sequentially)
+- syntax: `results = client.scrape(url, country, max_workers, ...)`
+#### .scrape_linkedin. class
+- Scrapes LinkedIn data including posts, jobs, companies, and profiles, returning structured data as a result
+- syntax: `results = client.scrape_linkedin.posts()/jobs()/companies()/profiles() # insert parameters per function`
+#### search()
+- Performs web searches using Bright Data SERP API with customizable search engines (or multiple search queries sequentially)
+- syntax: `results = client.search(query, search_engine, country, ...)`
+#### .search_linkedin. class
+- Searches LinkedIn data for specific posts, jobs, and profiles, returning the relevant data as a result
+- syntax: `results = client.search_linkedin.posts()/jobs()/profiles() # insert parameters per function`
+#### search_chatGPT()
+- Interacts with ChatGPT using Bright Data's ChatGPT API, sending prompts and receiving responses
+- syntax: `results = client.search_chatGPT(prompt, additional_prompt, max_workers, ...)`
+#### download_content() / download_snapshot()
+- Saves the scraped content to local files in various formats (JSON, CSV, etc.)
+- syntax: `client.download_content(results)`
+- syntax: `client.download_snapshot(results)`
+#### connect_browser()
+- Gets a WebSocket endpoint for connecting to Bright Data's scraping browser with Playwright/Selenium
+- syntax: `endpoint_url = client.connect_browser()` then use with browser automation tools
+#### crawl()
+- Crawls websites to discover and scrape multiple pages using Bright Data's Web Crawl API
+- syntax: `result = client.crawl(url, filter, exclude_filter, depth, ...)`
+#### parse_content()
+- Parses and extracts useful information from API responses (JSON or HTML)
+- syntax: `parsed = client.parse_content(data, extract_text=True, extract_links=True)`
+
+### Features:
+- Web Scraping: Scrape websites using Bright Data Web Unlocker API with proxy support
+- Search Engine Results: Perform web searches using Bright Data SERP API
+- Web Crawling: Discover and scrape multiple pages from websites with advanced filtering
+- Content Parsing: Extract text, links, images, and structured data from API responses
+- Browser Automation: Simple authentication for Bright Data's scraping browser with Playwright/Selenium
+- Multiple Search Engines: Support for Google, Bing, and Yandex
+- Parallel Processing: Concurrent processing for multiple URLs or queries
+- Robust Error Handling: Comprehensive error handling with retry logic
+- Input Validation: Automatic validation of URLs, zone names, and parameters
+- Zone Management: Automatic zone creation and management
+- Multiple Output Formats: JSON, raw HTML, markdown, and more
+"""
+
+from .client import bdclient
+from .exceptions import (
+    BrightDataError,
+    ValidationError,
+    AuthenticationError,
+    ZoneError,
+    NetworkError,
+    APIError
+)
+from .utils import parse_content, parse_multiple, extract_structured_data
+
+__version__ = "1.1.3"
+__author__ = "Bright Data"
+__email__ = "support@brightdata.com"
+
+__all__ = [
+    'bdclient',
+    'BrightDataError',
+    'ValidationError',
+    'AuthenticationError',
+    'ZoneError',
+    'NetworkError',
+    'APIError',
+    'parse_content',
+    'parse_multiple',
+    'extract_structured_data'
+]
\ No
newline at end of file diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..a79c0fd --- /dev/null +++ b/src/api/__init__.py @@ -0,0 +1,13 @@ +from .scraper import WebScraper +from .search import SearchAPI +from .chatgpt import ChatGPTAPI +from .linkedin import LinkedInAPI +from .crawl import CrawlAPI + +__all__ = [ + 'WebScraper', + 'SearchAPI', + 'ChatGPTAPI', + 'LinkedInAPI', + 'CrawlAPI' +] \ No newline at end of file diff --git a/src/api/chatgpt.py b/src/api/chatgpt.py new file mode 100644 index 0000000..e9edb90 --- /dev/null +++ b/src/api/chatgpt.py @@ -0,0 +1,126 @@ +import json +import requests +from typing import Union, Dict, Any, List + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.chatgpt') + + +class ChatGPTAPI: + """Handles ChatGPT scraping operations using Bright Data's ChatGPT dataset API""" + + def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def scrape_chatgpt( + self, + prompts: List[str], + countries: List[str], + additional_prompts: List[str], + web_searches: List[bool], + sync: bool = True, + timeout: int = None + ) -> Dict[str, Any]: + """ + Internal method to handle ChatGPT scraping API requests + + Parameters: + - prompts: List of prompts to send to ChatGPT + - countries: List of country codes matching prompts + - additional_prompts: List of follow-up prompts matching prompts + - web_searches: List of web_search flags matching prompts + - sync: If True, uses synchronous API for immediate results + - timeout: Request timeout in seconds + + Returns: + - Dict containing response with snapshot_id or direct data (if sync=True) + """ + url = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger" + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": "gd_m7aof0k82r803d5bjm", + "include_errors": "true" + } + + data = [ + { + "url": "https://chatgpt.com/", + "prompt": prompts[i], + "country": countries[i], + "additional_prompt": additional_prompts[i], + "web_search": web_searches[i] + } + for i in range(len(prompts)) + ] + + try: + response = self.session.post( + url, + headers=headers, + params=params, + json=data, + timeout=timeout or (65 if sync else self.default_timeout) + ) + + if response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code != 200: + raise APIError(f"ChatGPT scraping request failed with status {response.status_code}: {response.text}") + + if sync: + response_text = response.text + if '\n{' in response_text and response_text.strip().startswith('{'): + json_objects = [] + for line in response_text.strip().split('\n'): + if line.strip(): + try: + json_objects.append(json.loads(line)) + except json.JSONDecodeError: + continue + result = json_objects + else: + try: + result = response.json() + except json.JSONDecodeError: + result = response_text + + logger.info(f"ChatGPT data retrieved synchronously for {len(prompts)} prompt(s)") + print(f"Retrieved {len(result) if isinstance(result, list) else 1} ChatGPT response(s)") + else: + result = response.json() + snapshot_id = result.get('snapshot_id') + if snapshot_id: + logger.info(f"ChatGPT scraping job initiated successfully for {len(prompts)} prompt(s)") + print("") + print("Snapshot ID:") + print(snapshot_id) + print("") + + return result + + except requests.exceptions.Timeout: + raise APIError("Timeout while initiating ChatGPT scraping") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during ChatGPT scraping: {str(e)}") + except json.JSONDecodeError as e: + raise APIError(f"Failed to parse ChatGPT scraping response: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during ChatGPT scraping: {str(e)}") \ No newline at end of file diff --git a/src/api/crawl.py b/src/api/crawl.py new file mode 100644 index 0000000..4fe047a --- /dev/null +++ b/src/api/crawl.py @@ -0,0 +1,175 @@ +import json +from typing import Union, Dict, Any, List, Optional +from ..utils import get_logger, validate_url +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.crawl') + + +class CrawlAPI: + """Handles crawl operations using Bright Data's Web Crawl API""" + + CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc" + + AVAILABLE_OUTPUT_FIELDS = [ + "markdown", "url", "html2text", "page_html", "ld_json", + "page_title", "timestamp", "input", "discovery_input", + "error", "error_code", "warning", "warning_code" + ] + + def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def crawl( + self, + url: Union[str, List[str]], + ignore_sitemap: Optional[bool] = None, + depth: Optional[int] = None, + filter: Optional[str] = None, + exclude_filter: Optional[str] = None, + 
custom_output_fields: Optional[List[str]] = None, + include_errors: bool = True + ) -> Dict[str, Any]: + """ + ## Crawl websites using Bright Data's Web Crawl API + + Performs web crawling to discover and scrape multiple pages from a website + starting from the specified URL(s). + + ### Parameters: + - `url` (str | List[str]): Domain URL(s) to crawl (required) + - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling + - `depth` (int, optional): Maximum depth to crawl relative to the entered URL + - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/") + - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/") + - `custom_output_fields` (List[str], optional): Custom output schema fields to include + - `include_errors` (bool, optional): Include errors in response (default: True) + + ### Returns: + - `Dict[str, Any]`: Crawl response with snapshot_id for tracking + + ### Example Usage: + ```python + # Single URL crawl + result = client.crawl("https://example.com/") + + # Multiple URLs with filters + urls = ["https://example.com/", "https://example2.com/"] + result = client.crawl( + url=urls, + filter="/product/", + exclude_filter="/ads/", + depth=2, + ignore_sitemap=True + ) + + # Custom output schema + result = client.crawl( + url="https://example.com/", + custom_output_fields=["markdown", "url", "page_title"] + ) + ``` + + ### Raises: + - `ValidationError`: Invalid URL or parameters + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + if isinstance(url, str): + urls = [url] + elif isinstance(url, list): + urls = url + else: + raise ValidationError("URL must be a string or list of strings") + + if not urls: + raise ValidationError("At least one URL is required") + + for u in urls: + if not isinstance(u, str) or not u.strip(): + raise ValidationError("All URLs must be non-empty strings") + validate_url(u) + + if custom_output_fields is not None: + if not isinstance(custom_output_fields, list): + raise ValidationError("custom_output_fields must be a list") + + invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS] + if invalid_fields: + raise ValidationError(f"Invalid output fields: {invalid_fields}. 
Available fields: {self.AVAILABLE_OUTPUT_FIELDS}") + + crawl_inputs = [] + for u in urls: + crawl_input = {"url": u} + + if ignore_sitemap is not None: + crawl_input["ignore_sitemap"] = ignore_sitemap + if depth is not None: + crawl_input["depth"] = depth + if filter is not None: + crawl_input["filter"] = filter + if exclude_filter is not None: + crawl_input["exclude_filter"] = exclude_filter + + crawl_inputs.append(crawl_input) + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + params = { + "dataset_id": self.CRAWL_DATASET_ID, + "include_errors": str(include_errors).lower(), + "type": "discover_new", + "discover_by": "domain_url" + } + + if custom_output_fields: + payload = { + "input": crawl_inputs, + "custom_output_fields": custom_output_fields + } + else: + payload = crawl_inputs + + logger.info(f"Starting crawl for {len(urls)} URL(s)") + logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}") + + try: + response = self.session.post( + api_url, + params=params, + json=payload, + timeout=self.default_timeout + ) + + if response.status_code == 200: + result = response.json() + snapshot_id = result.get('snapshot_id') + logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}") + return result + + elif response.status_code == 401: + logger.error("Unauthorized (401): Check API token") + raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}") + elif response.status_code == 403: + logger.error("Forbidden (403): Insufficient permissions") + raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}") + elif response.status_code == 400: + logger.error(f"Bad request (400): {response.text}") + raise APIError(f"Bad request (400): {response.text}") + else: + logger.error(f"Crawl request failed ({response.status_code}): {response.text}") + raise APIError( + f"Crawl request failed ({response.status_code}): {response.text}", + status_code=response.status_code, + response_text=response.text + ) + + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + logger.error(f"Unexpected error during crawl: {e}") + raise APIError(f"Unexpected error during crawl: {str(e)}") \ No newline at end of file diff --git a/src/api/download.py b/src/api/download.py new file mode 100644 index 0000000..4bccdc0 --- /dev/null +++ b/src/api/download.py @@ -0,0 +1,265 @@ +import json +import requests +from datetime import datetime +from typing import Union, Dict, Any, List + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.download') + + +class DownloadAPI: + """Handles snapshot and content download operations using Bright Data's download API""" + + def __init__(self, session, api_token, default_timeout=30): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + + def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: + """ + ## Download content to a file based on its format + + ### Args: + content: The content to download (dict for JSON, string for other formats) + filename: Optional filename. 
If not provided, generates one with timestamp + format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt") + parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False) + + ### Returns: + Path to the downloaded file + """ + + if not filename: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"brightdata_results_{timestamp}.{format}" + + if not filename.endswith(f".{format}"): + filename = f"{filename}.{format}" + + if parse and isinstance(content, (list, dict)): + content = self._parse_body_json(content) + + try: + if format == "json": + with open(filename, 'w', encoding='utf-8') as f: + if isinstance(content, dict) or isinstance(content, list): + json.dump(content, f, indent=2, ensure_ascii=False) + else: + f.write(str(content)) + else: + with open(filename, 'w', encoding='utf-8') as f: + f.write(str(content)) + + logger.info(f"Content downloaded to: {filename}") + return filename + + except IOError as e: + raise APIError(f"Failed to write file {filename}: {str(e)}") + except Exception as e: + raise APIError(f"Failed to download content: {str(e)}") + + def download_snapshot( + self, + snapshot_id: str, + format: str = "json", + compress: bool = False, + batch_size: int = None, + part: int = None + ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]: + """ + ## Download snapshot content from Bright Data dataset API + + Downloads the snapshot content using the snapshot ID returned from scrape_chatGPT() + or other dataset collection triggers. + + ### Parameters: + - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required) + - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json") + - `compress` (bool, optional): Whether the result should be compressed (default: False) + - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000) + - `part` (int, optional): If batch_size provided, specify which part to download + + ### Returns: + - `Union[Dict, List, str]`: Snapshot data in the requested format + + ### Example Usage: + ```python + # Download complete snapshot + data = client.download_snapshot("s_m4x7enmven8djfqak") + + # Download as CSV format + csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv") + + # Download in batches + batch_data = client.download_snapshot( + "s_m4x7enmven8djfqak", + batch_size=1000, + part=1 + ) + ``` + + ### Raises: + - `ValidationError`: Invalid parameters or snapshot_id format + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed, snapshot not found, or server error + """ + if not snapshot_id or not isinstance(snapshot_id, str): + raise ValidationError("Snapshot ID is required and must be a non-empty string") + + if format not in ["json", "ndjson", "jsonl", "csv"]: + raise ValidationError("Format must be one of: json, ndjson, jsonl, csv") + + if not isinstance(compress, bool): + raise ValidationError("Compress must be a boolean") + + if batch_size is not None: + if not isinstance(batch_size, int) or batch_size < 1000: + raise ValidationError("Batch size must be an integer >= 1000") + + if part is not None: + if not isinstance(part, int) or part < 1: + raise ValidationError("Part must be a positive integer") + if batch_size is None: + raise ValidationError("Part parameter requires batch_size to be specified") + + url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}" + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.api_token}", + "Accept": "application/json", + "User-Agent": user_agent + } + params = { + "format": format + } + + if compress: + params["compress"] = "true" + + if batch_size is not None: + params["batch_size"] = batch_size + + if part is not None: + params["part"] = part + + try: + logger.info(f"Downloading snapshot {snapshot_id} in {format} format") + + response = self.session.get( + url, + headers=headers, + params=params, + timeout=self.default_timeout + ) + + if response.status_code == 200: + pass + elif response.status_code == 202: + try: + response_data = response.json() + message = response_data.get('message', 'Snapshot is not ready yet') + print("Snapshot is not ready yet, try again soon") + return {"status": "not_ready", "message": message, "snapshot_id": snapshot_id} + except json.JSONDecodeError: + print("Snapshot is not ready yet, try again soon") + return {"status": "not_ready", "message": "Snapshot is not ready yet, check again soon", "snapshot_id": snapshot_id} + elif response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code == 404: + raise APIError(f"Snapshot '{snapshot_id}' not found") + else: + raise APIError(f"Download request failed with status {response.status_code}: {response.text}") + + if format == "csv": + data = response.text + save_data = data + else: + response_text = response.text + if '\n{' in response_text and response_text.strip().startswith('{'): + json_objects = [] + for line in response_text.strip().split('\n'): + if line.strip(): + try: + json_objects.append(json.loads(line)) + except json.JSONDecodeError: + continue + data = json_objects + save_data = json_objects + else: + try: + data = response.json() + save_data = data + except json.JSONDecodeError: + data = response_text + save_data = response_text + + try: + output_file = f"snapshot_{snapshot_id}.{format}" + if format == "csv" or isinstance(save_data, str): + with open(output_file, 'w', encoding='utf-8') as f: + f.write(str(save_data)) + else: + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(save_data, f, indent=2, ensure_ascii=False) + logger.info(f"Data saved to: {output_file}") + except Exception: + pass + + logger.info(f"Successfully downloaded snapshot {snapshot_id}") + return data + + except requests.exceptions.Timeout: + raise APIError("Timeout while downloading snapshot") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during snapshot download: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during snapshot download: {str(e)}") + + def _parse_body_json(self, content: Union[Dict, List]) -> Union[Dict, List]: + """ + Parse JSON strings in 'body' fields to objects + + Args: + content: The content to process + + Returns: + Content with parsed body fields + """ + if content is None: + return content + + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and 'body' in item: + body = item['body'] + if isinstance(body, str): + try: + item['body'] = json.loads(body) + except (json.JSONDecodeError, TypeError): + pass + elif isinstance(item, (dict, list)): + self._parse_body_json(item) + + elif isinstance(content, dict): + if 'body' in content: + body = 
content['body'] + if isinstance(body, str): + try: + content['body'] = json.loads(body) + except (json.JSONDecodeError, TypeError): + pass + + for key, value in content.items(): + if isinstance(value, (dict, list)): + content[key] = self._parse_body_json(value) + + return content \ No newline at end of file diff --git a/src/api/extract.py b/src/api/extract.py new file mode 100644 index 0000000..1b04b84 --- /dev/null +++ b/src/api/extract.py @@ -0,0 +1,419 @@ +import os +import re +import json +import openai +from typing import Dict, Any, Tuple, Union, List +from urllib.parse import urlparse + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError + +logger = get_logger('api.extract') + + +class ExtractResult(str): + """ + Custom result class that behaves like a string (extracted content) + but also provides access to metadata attributes + """ + def __new__(cls, extracted_content, metadata): + obj = str.__new__(cls, extracted_content) + obj._metadata = metadata + return obj + + def __getattr__(self, name): + if name in self._metadata: + return self._metadata[name] + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + def __getitem__(self, key): + return self._metadata[key] + + def get(self, key, default=None): + return self._metadata.get(key, default) + + def keys(self): + return self._metadata.keys() + + def values(self): + return self._metadata.values() + + def items(self): + return self._metadata.items() + + @property + def metadata(self): + """Access full metadata dictionary""" + return self._metadata + + +class ExtractAPI: + """Handles content extraction using web scraping + LLM processing""" + + def __init__(self, client): + self.client = client + + def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> Dict[str, Any]: + """ + ## Extract specific information from websites using AI + + Combines web scraping with OpenAI's language models to extract targeted information + from web pages based on natural language queries. + + ### Parameters: + - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, + this becomes the pure extraction query. If `url` is not provided, this should include + the URL (e.g. "extract the most recent news from cnn.com") + - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction + from query and sends these URLs to the web unlocker API + - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response. + Uses OpenAI's Structured Outputs for reliable type-safe responses. + Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"]} + - `llm_key` (str, optional): OpenAI API key. 
If not provided, uses OPENAI_API_KEY env variable + + ### Returns: + - `ExtractResult`: String containing extracted content with metadata attributes access + + ### Example Usage: + ```python + # Using URL parameter with structured output + result = client.extract( + query="extract the most recent news headlines", + url="https://cnn.com", + output_scheme={ + "type": "object", + "properties": { + "headlines": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "date": {"type": "string"} + }, + "required": ["title", "date"] + } + } + }, + "required": ["headlines"] + } + ) + + # Using URL in query (original behavior) + result = client.extract( + query="extract the most recent news from cnn.com", + llm_key="your-openai-api-key" + ) + + # Multiple URLs with structured schema + result = client.extract( + query="extract main headlines", + url=["https://cnn.com", "https://bbc.com"], + output_scheme={ + "type": "object", + "properties": { + "sources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "source_name": {"type": "string"}, + "headlines": {"type": "array", "items": {"type": "string"}} + }, + "required": ["source_name", "headlines"] + } + } + }, + "required": ["sources"] + } + ) + ``` + + ### Raises: + - `ValidationError`: Invalid query format or missing LLM key + - `APIError`: Scraping failed or LLM processing error + """ + if not query or not isinstance(query, str): + raise ValidationError("Query must be a non-empty string") + + query = query.strip() + if len(query) > 10000: + raise ValidationError("Query is too long (maximum 10,000 characters)") + if len(query) < 5: + raise ValidationError("Query is too short (minimum 5 characters)") + + if not llm_key: + llm_key = os.getenv('OPENAI_API_KEY') + + if not llm_key or not isinstance(llm_key, str): + raise ValidationError("OpenAI API key is required. 
Provide it as parameter or set OPENAI_API_KEY environment variable") + + if output_scheme is not None: + if not isinstance(output_scheme, dict): + raise ValidationError("output_scheme must be a dict containing a valid JSON Schema") + if "type" not in output_scheme: + raise ValidationError("output_scheme must have a 'type' property") + + self._validate_structured_outputs_schema(output_scheme) + + logger.info(f"Processing extract query: {query[:50]}...") + + try: + if url is not None: + parsed_query = query.strip() + target_urls = url if isinstance(url, list) else [url] + logger.info(f"Using provided URL(s): {target_urls}") + else: + parsed_query, extracted_url = self._parse_query_and_url(query) + target_urls = [extracted_url] + logger.info(f"Parsed - Query: '{parsed_query}', URL: '{extracted_url}'") + + if len(target_urls) == 1: + scraped_content = self.client.scrape(target_urls[0], response_format="raw") + source_url = target_urls[0] + else: + scraped_content = self.client.scrape(target_urls, response_format="raw") + source_url = ', '.join(target_urls) + + logger.info(f"Scraped content from {len(target_urls)} URL(s)") + + if isinstance(scraped_content, list): + all_text = [] + all_titles = [] + for i, content in enumerate(scraped_content): + parsed = self.client.parse_content( + content, + extract_text=True, + extract_links=False, + extract_images=False + ) + all_text.append(f"--- Content from {target_urls[i]} ---\n{parsed.get('text', '')}") + all_titles.append(parsed.get('title', 'Unknown')) + + combined_text = "\n\n".join(all_text) + combined_title = " | ".join(all_titles) + parsed_content = {'text': combined_text, 'title': combined_title} + else: + parsed_content = self.client.parse_content( + scraped_content, + extract_text=True, + extract_links=False, + extract_images=False + ) + + logger.info(f"Parsed content - text length: {len(parsed_content.get('text', ''))}") + + extracted_info, token_usage = self._process_with_llm( + parsed_query, + parsed_content.get('text', ''), + llm_key, + source_url, + output_scheme + ) + + metadata = { + 'query': parsed_query, + 'url': source_url, + 'extracted_content': extracted_info, + 'source_title': parsed_content.get('title', 'Unknown'), + 'content_length': len(parsed_content.get('text', '')), + 'token_usage': token_usage, + 'success': True + } + + return ExtractResult(extracted_info, metadata) + + except Exception as e: + if isinstance(e, (ValidationError, APIError)): + raise + logger.error(f"Unexpected error during extraction: {e}") + raise APIError(f"Extraction failed: {str(e)}") + + def _parse_query_and_url(self, query: str) -> Tuple[str, str]: + """ + Parse natural language query to extract the task and URL + + Args: + query: Natural language query like "extract news from cnn.com" + + Returns: + Tuple of (parsed_query, full_url) + """ + query = query.strip() + + url_patterns = [ + r'from\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)', + r'on\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)', + r'at\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)', + r'((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)' + ] + + url = None + for pattern in url_patterns: + match = re.search(pattern, query, re.IGNORECASE) + if match: + url = match.group(1) + break + + if not url: + raise ValidationError("Could not extract URL from query. 
Please include a website URL.") + + full_url = self._build_full_url(url) + + extract_query = re.sub(r'\b(?:from|on|at)\s+(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', query, flags=re.IGNORECASE) + extract_query = re.sub(r'\b(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', extract_query, flags=re.IGNORECASE) + extract_query = re.sub(r'\s+', ' ', extract_query).strip() + + if not extract_query: + extract_query = "extract the main content" + + return extract_query, full_url + + def _build_full_url(self, url: str) -> str: + """ + Build a complete URL from potentially partial URL + + Args: + url: Potentially partial URL like "cnn.com" or "https://example.com" + + Returns: + Complete URL with https:// and www if needed + """ + url = url.strip() + + if not url.startswith(('http://', 'https://')): + if not url.startswith('www.'): + url = f'www.{url}' + url = f'https://{url}' + + parsed = urlparse(url) + if not parsed.netloc: + raise ValidationError(f"Invalid URL format: {url}") + + return url + + def _validate_structured_outputs_schema(self, schema: Dict[str, Any], path: str = "") -> None: + """ + Validate JSON Schema for OpenAI Structured Outputs compatibility + + Args: + schema: JSON Schema to validate + path: Current path in schema (for error reporting) + """ + if not isinstance(schema, dict): + return + + schema_type = schema.get("type") + + if schema_type == "object": + if "properties" not in schema: + raise ValidationError(f"Object schema at '{path}' must have 'properties' defined") + if "required" not in schema: + raise ValidationError(f"Object schema at '{path}' must have 'required' array (OpenAI Structured Outputs requirement)") + if "additionalProperties" not in schema or schema["additionalProperties"] is not False: + raise ValidationError(f"Object schema at '{path}' must have 'additionalProperties': false (OpenAI Structured Outputs requirement)") + + properties = set(schema["properties"].keys()) + required = set(schema["required"]) + if properties != required: + missing = properties - required + extra = required - properties + error_msg = f"OpenAI Structured Outputs requires ALL properties to be in 'required' array at '{path}'." + if missing: + error_msg += f" Missing from required: {list(missing)}" + if extra: + error_msg += f" Extra in required: {list(extra)}" + raise ValidationError(error_msg) + + for prop_name, prop_schema in schema["properties"].items(): + self._validate_structured_outputs_schema(prop_schema, f"{path}.{prop_name}") + + elif schema_type == "array": + if "items" in schema: + self._validate_structured_outputs_schema(schema["items"], f"{path}[]") + + def _process_with_llm(self, query: str, content: str, llm_key: str, source_url: str, output_scheme: Dict[str, Any] = None) -> Tuple[str, Dict[str, int]]: + """ + Process scraped content with OpenAI to extract requested information + + Args: + query: What to extract from the content + content: Scraped and parsed text content + llm_key: OpenAI API key + source_url: Source URL for context + output_scheme: JSON Schema dict for structured outputs (optional) + + Returns: + Tuple of (extracted information, token usage dict) + """ + if len(content) > 15000: + beginning = content[:8000] + end = content[-4000:] + content = f"{beginning}\n\n... [middle content truncated for token efficiency] ...\n\n{end}" + elif len(content) > 12000: + content = content[:12000] + "\n\n... 
[content truncated to optimize tokens]" + + client = openai.OpenAI(api_key=llm_key) + + system_prompt = f"""You are a precise web content extraction specialist. Your task: {query} + +SOURCE: {source_url} + +INSTRUCTIONS: +1. Extract ONLY the specific information requested +2. Include relevant details (dates, numbers, names) when available +3. If requested info isn't found, briefly state what content IS available +4. Keep response concise but complete +5. Be accurate and factual""" + + user_prompt = f"CONTENT TO ANALYZE:\n\n{content}\n\nEXTRACT: {query}" + + try: + call_params = { + "model": "gpt-4o-2024-08-06", + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + "max_tokens": 1000, + "temperature": 0.1 + } + + if output_scheme: + call_params["response_format"] = { + "type": "json_schema", + "json_schema": { + "name": "extracted_content", + "strict": True, + "schema": output_scheme + } + } + logger.info("Using OpenAI Structured Outputs with provided schema") + else: + logger.info("Using regular OpenAI completion (no structured schema provided)") + + response = client.chat.completions.create(**call_params) + + if not response.choices or not response.choices[0].message.content: + raise APIError("OpenAI returned empty response") + + extracted_content = response.choices[0].message.content.strip() + + if output_scheme: + logger.info("Received structured JSON response from OpenAI") + else: + logger.info("Received text response from OpenAI") + + token_usage = { + 'prompt_tokens': response.usage.prompt_tokens, + 'completion_tokens': response.usage.completion_tokens, + 'total_tokens': response.usage.total_tokens + } + + logger.info(f"OpenAI token usage: {token_usage['total_tokens']} total ({token_usage['prompt_tokens']} prompt + {token_usage['completion_tokens']} completion)") + + return extracted_content, token_usage + + except Exception as e: + logger.error(f"OpenAI API error: {e}") + raise APIError(f"Failed to process content with LLM: {str(e)}") \ No newline at end of file diff --git a/src/api/linkedin.py b/src/api/linkedin.py new file mode 100644 index 0000000..19ede6b --- /dev/null +++ b/src/api/linkedin.py @@ -0,0 +1,803 @@ +import json +import re +import requests +from typing import Union, Dict, Any, List + +from ..utils import get_logger +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.linkedin') + + +class LinkedInAPI: + """Handles LinkedIn data collection using Bright Data's collect API""" + + DATASET_IDS = { + 'profile': 'gd_l1viktl72bvl7bjuj0', + 'company': 'gd_l1vikfnt1wgvvqz95w', + 'job': 'gd_lpfll7v5hcqtkxl6l', + 'post': 'gd_lyy3tktm25m4avu764' + } + + URL_PATTERNS = { + 'profile': re.compile(r'linkedin\.com/in/[^/?]+/?(\?.*)?$'), + 'company': re.compile(r'linkedin\.com/(company|organization-guest/company)/[^/?]+/?(\?.*)?$'), + 'job': re.compile(r'linkedin\.com/jobs/view/[^/?]+/?(\?.*)?$'), + 'post': re.compile(r'linkedin\.com/(posts|pulse)/[^/?]+/?(\?.*)?$') + } + + def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.api_token = api_token + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def _identify_dataset_type(self, url: str) -> str: + """ + Identify LinkedIn dataset type based on URL pattern + + Args: + url: LinkedIn URL to analyze + + Returns: + Dataset type ('profile', 'company', 'job', 'post') + + Raises: + ValidationError: If 
URL doesn't match any known LinkedIn pattern + """ + if not url or not isinstance(url, str): + raise ValidationError("URL must be a non-empty string") + + url = url.strip().lower() + for dataset_type, pattern in self.URL_PATTERNS.items(): + if pattern.search(url): + logger.debug(f"URL '{url}' identified as LinkedIn {dataset_type}") + return dataset_type + + raise ValidationError(f"URL '{url}' does not match any supported LinkedIn data type") + + def _scrape_linkedin_dataset( + self, + urls: Union[str, List[str]], + dataset_id: str, + dataset_type: str, + sync: bool = True, + timeout: int = None + ) -> Dict[str, Any]: + """ + Internal method to scrape LinkedIn data using Bright Data's collect API + + Args: + urls: Single LinkedIn URL or list of LinkedIn URLs + dataset_id: Bright Data dataset ID for the specific LinkedIn data type + dataset_type: Type of LinkedIn data (for logging purposes) + sync: If True (default), uses synchronous API for immediate results + timeout: Request timeout in seconds + + Returns: + Dict containing response with snapshot_id or direct data (if sync=True) + + Raises: + ValidationError: Invalid URL format + AuthenticationError: Invalid API token or insufficient permissions + APIError: Request failed or server error + """ + if isinstance(urls, str): + url_list = [urls] + else: + url_list = urls + + if not url_list or len(url_list) == 0: + raise ValidationError("At least one URL is required") + for url in url_list: + if not url or not isinstance(url, str): + raise ValidationError("All URLs must be non-empty strings") + + logger.info(f"Processing {len(url_list)} LinkedIn {dataset_type} URL(s) {'synchronously' if sync else 'asynchronously'}") + + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + + if sync: + api_url = "https://api.brightdata.com/datasets/v3/scrape" + data = { + "input": [{"url": url} for url in url_list] + } + params = { + "dataset_id": dataset_id, + "notify": "false", + "include_errors": "true" + } + else: + api_url = "https://api.brightdata.com/datasets/v3/trigger" + data = [{"url": url} for url in url_list] + params = { + "dataset_id": dataset_id, + "include_errors": "true" + } + + try: + if sync: + response = self.session.post( + api_url, + headers=headers, + params=params, + json=data, + timeout=timeout or 65 + ) + else: + response = self.session.post( + api_url, + headers=headers, + params=params, + json=data, + timeout=timeout or self.default_timeout + ) + + if response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code not in [200, 202]: + raise APIError(f"LinkedIn data collection request failed with status {response.status_code}: {response.text}") + + if sync: + response_text = response.text + if '\n{' in response_text and response_text.strip().startswith('{'): + json_objects = [] + for line in response_text.strip().split('\n'): + if line.strip(): + try: + json_objects.append(json.loads(line)) + except json.JSONDecodeError: + continue + result = json_objects + else: + try: + result = response.json() + except json.JSONDecodeError: + result = response_text + + logger.info(f"LinkedIn {dataset_type} data retrieved synchronously for {len(url_list)} URL(s)") + print(f"Retrieved {len(result) if isinstance(result, list) else 1} LinkedIn {dataset_type} 
record(s)") + else: + result = response.json() + snapshot_id = result.get('snapshot_id') + if snapshot_id: + logger.info(f"LinkedIn {dataset_type} data collection job initiated successfully for {len(url_list)} URL(s)") + print("") + print("Snapshot ID:") + print(snapshot_id) + print("") + + return result + + except requests.exceptions.Timeout: + raise APIError("Timeout while initiating LinkedIn data collection") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during LinkedIn data collection: {str(e)}") + except json.JSONDecodeError as e: + raise APIError(f"Failed to parse LinkedIn data collection response: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during LinkedIn data collection: {str(e)}") + + +class LinkedInScraper: + """LinkedIn data scraping interface with specialized methods for different data types""" + + def __init__(self, linkedin_api): + self.linkedin_api = linkedin_api + + def profiles(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Profile Data + + Scrapes structured data from LinkedIn profiles using the profiles dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn profile URL or list of profile URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped profile data directly. If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/in/username/` + - `https://linkedin.com/in/first-last-123456/` + + ### Example Usage: + ```python + # Single profile (synchronous - returns data immediately) + result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/elad-moshe-05a90413/") + + # Multiple profiles (synchronous - returns data immediately) + profiles = [ + "https://www.linkedin.com/in/user1/", + "https://www.linkedin.com/in/user2/" + ] + result = client.scrape_linkedin.profiles(profiles) + + # Asynchronous processing (returns snapshot_id) + result = client.scrape_linkedin.profiles(profiles, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['profile'], + 'profile', + sync, + timeout + ) + + def companies(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Company Data + + Scrapes structured data from LinkedIn company pages using the companies dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn company URL or list of company URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped company data directly. 
If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/company/company-name/` + - `https://linkedin.com/company/bright-data/` + + ### Example Usage: + ```python + # Single company (synchronous) + result = client.scrape_linkedin.companies("https://www.linkedin.com/company/bright-data/") + + # Multiple companies (synchronous) + companies = [ + "https://www.linkedin.com/company/ibm/", + "https://www.linkedin.com/company/microsoft/" + ] + result = client.scrape_linkedin.companies(companies) + + # Asynchronous processing + result = client.scrape_linkedin.companies(companies, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['company'], + 'company', + sync, + timeout + ) + + def jobs(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Job Data + + Scrapes structured data from LinkedIn job listings using the jobs dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn job URL or list of job URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped job data directly. If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/jobs/view/1234567890/` + - `https://linkedin.com/jobs/view/job-id/` + + ### Example Usage: + ```python + # Single job listing (synchronous) + result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/1234567890/") + + # Multiple job listings (synchronous) + jobs = [ + "https://www.linkedin.com/jobs/view/1111111/", + "https://www.linkedin.com/jobs/view/2222222/" + ] + result = client.scrape_linkedin.jobs(jobs) + + # Asynchronous processing + result = client.scrape_linkedin.jobs(jobs, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['job'], + 'job', + sync, + timeout + ) + + def posts(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]: + """ + ## Scrape LinkedIn Post Data + + Scrapes structured data from LinkedIn posts and articles using the posts dataset. + + ### Parameters: + - `url` (str | List[str]): Single LinkedIn post URL or list of post URLs + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async) + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns scraped post data directly. 
If sync=False, returns response with snapshot_id for async processing + + ### Example URLs: + - `https://www.linkedin.com/posts/username-activity-123456/` + - `https://www.linkedin.com/pulse/article-title-author/` + + ### Example Usage: + ```python + # Single post (synchronous) + result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/") + + # Multiple posts (synchronous) + posts = [ + "https://www.linkedin.com/posts/user1-activity-111/", + "https://www.linkedin.com/pulse/article-author/" + ] + result = client.scrape_linkedin.posts(posts) + + # Asynchronous processing + result = client.scrape_linkedin.posts(posts, sync=False) + ``` + """ + return self.linkedin_api._scrape_linkedin_dataset( + url, + self.linkedin_api.DATASET_IDS['post'], + 'post', + sync, + timeout + ) + + +class LinkedInSearcher: + """LinkedIn search interface for discovering new LinkedIn data by various criteria""" + + def __init__(self, linkedin_api): + self.linkedin_api = linkedin_api + + def profiles( + self, + first_name: Union[str, List[str]], + last_name: Union[str, List[str]], + timeout: int = None + ) -> Dict[str, Any]: + """ + ## Search LinkedIn Profiles by Name + + Discovers LinkedIn profiles by searching for first and last names. + + ### Parameters: + - `first_name` (str | List[str]): Single first name or list of first names to search for + - `last_name` (str | List[str]): Single last name or list of last names to search for + - `timeout` (int, optional): Request timeout in seconds (default: 30) + + ### Returns: + - `Dict[str, Any]`: Response containing snapshot_id for async processing + + ### Example Usage: + ```python + # Single name search (returns snapshot_id) + result = client.search_linkedin.profiles("James", "Smith") + + # Multiple names search (returns snapshot_id) + first_names = ["James", "Idan"] + last_names = ["Smith", "Vilenski"] + result = client.search_linkedin.profiles(first_names, last_names) + ``` + """ + if isinstance(first_name, str): + first_names = [first_name] + else: + first_names = first_name + + if isinstance(last_name, str): + last_names = [last_name] + else: + last_names = last_name + + if len(first_names) != len(last_names): + raise ValidationError("first_name and last_name must have the same length") + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['profile'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "name" + } + + data = [ + { + "first_name": first_names[i], + "last_name": last_names[i] + } + for i in range(len(first_names)) + ] + + return self._make_request(api_url, headers, params, data, 'profile search', len(data), timeout) + + def jobs( + self, + url: Union[str, List[str]] = None, + location: Union[str, List[str]] = None, + keyword: Union[str, List[str]] = "", + country: Union[str, List[str]] = "", + time_range: Union[str, List[str]] = "", + job_type: Union[str, List[str]] = "", + experience_level: Union[str, List[str]] = "", + remote: Union[str, List[str]] = "", + company: Union[str, List[str]] = "", + location_radius: Union[str, List[str]] = "", + selective_search: Union[bool, List[bool]] = False, + timeout: int = None + ) -> Dict[str, Any]: + """ + ## Search LinkedIn Jobs by URL or Keywords + + Discovers LinkedIn jobs either by searching specific job search URLs or by keyword criteria. + + ### Parameters: + - `url` (str | List[str], optional): LinkedIn job search URLs to scrape + - `location` (str | List[str], optional): Job location(s) - required when searching by keyword + - `keyword` (str | List[str], optional): Job keyword(s) to search for (default: "") + - `country` (str | List[str], optional): Country code(s) (default: "") + - `time_range` (str | List[str], optional): Time range filter (default: "") + - `job_type` (str | List[str], optional): Job type filter (default: "") + - `experience_level` (str | List[str], optional): Experience level filter (default: "") + - `remote` (str | List[str], optional): Remote work filter (default: "") + - `company` (str | List[str], optional): Company name filter (default: "") + - `location_radius` (str | List[str], optional): Location radius filter (default: "") + - `selective_search` (bool | List[bool], optional): Enable selective search (default: False) + - `timeout` (int, optional): Request timeout in seconds (default: 30) + + ### Returns: + - `Dict[str, Any]`: Response containing snapshot_id for async processing + + ### Example Usage: + ```python + # Search by job URLs (returns snapshot_id) + job_urls = [ + "https://www.linkedin.com/jobs/search?keywords=Software&location=Tel%20Aviv-Yafo", + "https://www.linkedin.com/jobs/reddit-inc.-jobs-worldwide?f_C=150573" + ] + result = client.search_linkedin.jobs(url=job_urls) + + # Search by keyword (returns snapshot_id) + result = client.search_linkedin.jobs( + location="Paris", + keyword="product manager", + country="FR", + time_range="Past month", + job_type="Full-time" + ) + ``` + """ + if url is not None: + return self._search_jobs_by_url(url, timeout) + elif location is not None: + return self._search_jobs_by_keyword( + location, keyword, country, time_range, job_type, + experience_level, remote, company, location_radius, + selective_search, timeout + ) + else: + raise ValidationError("Either 'url' or 'location' parameter must be provided") + + def posts( + self, + profile_url: Union[str, List[str]] = None, + company_url: Union[str, List[str]] = None, + url: Union[str, List[str]] = None, + start_date: Union[str, List[str]] = "", + end_date: Union[str, List[str]] = "", + timeout: int 
= None + ) -> Dict[str, Any]: + """ + ## Search LinkedIn Posts by Profile, Company, or General URL + + Discovers LinkedIn posts using various search methods. + + ### Parameters: + - `profile_url` (str | List[str], optional): LinkedIn profile URL(s) to get posts from + - `company_url` (str | List[str], optional): LinkedIn company URL(s) to get posts from + - `url` (str | List[str], optional): General LinkedIn URL(s) for posts + - `start_date` (str | List[str], optional): Start date filter (ISO format, default: "") + - `end_date` (str | List[str], optional): End date filter (ISO format, default: "") + - `timeout` (int, optional): Request timeout in seconds (default: 30) + + ### Returns: + - `Dict[str, Any]`: Response containing snapshot_id for async processing + + ### Example Usage: + ```python + # Search posts by profile URL with date range (returns snapshot_id) + result = client.search_linkedin.posts( + profile_url="https://www.linkedin.com/in/bettywliu", + start_date="2018-04-25T00:00:00.000Z", + end_date="2021-05-25T00:00:00.000Z" + ) + + # Search posts by company URL (returns snapshot_id) + result = client.search_linkedin.posts( + company_url="https://www.linkedin.com/company/bright-data" + ) + + # Search posts by general URL (returns snapshot_id) + result = client.search_linkedin.posts( + url="https://www.linkedin.com/posts/activity-123456" + ) + ``` + """ + if profile_url is not None: + return self._search_posts_by_profile(profile_url, start_date, end_date, timeout) + elif company_url is not None: + return self._search_posts_by_company(company_url, timeout) + elif url is not None: + return self._search_posts_by_url(url, timeout) + else: + raise ValidationError("One of 'profile_url', 'company_url', or 'url' parameter must be provided") + + def _search_jobs_by_url(self, urls, timeout): + """Search jobs by LinkedIn job search URLs""" + if isinstance(urls, str): + url_list = [urls] + else: + url_list = urls + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['job'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "url" + } + + data = [{"url": url} for url in url_list] + return self._make_request(api_url, headers, params, data, 'job search by URL', len(data), timeout) + + def _search_jobs_by_keyword(self, location, keyword, country, time_range, job_type, experience_level, remote, company, location_radius, selective_search, timeout): + """Search jobs by keyword criteria""" + params_dict = { + 'location': location, 'keyword': keyword, 'country': country, + 'time_range': time_range, 'job_type': job_type, 'experience_level': experience_level, + 'remote': remote, 'company': company, 'location_radius': location_radius, + 'selective_search': selective_search + } + + max_length = 1 + for key, value in params_dict.items(): + if isinstance(value, list): + max_length = max(max_length, len(value)) + normalized_params = {} + for key, value in params_dict.items(): + if isinstance(value, list): + if len(value) != max_length and len(value) != 1: + raise ValidationError(f"Parameter '{key}' list length must be 1 or {max_length}") + normalized_params[key] = value * max_length if len(value) == 1 else value + else: + normalized_params[key] = [value] * max_length + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['job'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "keyword" + } + + data = [] + for i in range(max_length): + data.append({ + "location": normalized_params['location'][i], + "keyword": normalized_params['keyword'][i], + "country": normalized_params['country'][i], + "time_range": normalized_params['time_range'][i], + "job_type": normalized_params['job_type'][i], + "experience_level": normalized_params['experience_level'][i], + "remote": normalized_params['remote'][i], + "company": normalized_params['company'][i], + "location_radius": normalized_params['location_radius'][i], + "selective_search": normalized_params['selective_search'][i] + }) + + return self._make_request(api_url, headers, params, data, 'job search by keyword', len(data), timeout) + + def _search_posts_by_profile(self, profile_urls, start_dates, end_dates, timeout): + """Search posts by profile URL with optional date filtering""" + if isinstance(profile_urls, str): + url_list = [profile_urls] + else: + url_list = profile_urls + + if isinstance(start_dates, str): + start_list = [start_dates] * len(url_list) + else: + start_list = start_dates if len(start_dates) == len(url_list) else [start_dates[0]] * len(url_list) + + if isinstance(end_dates, str): + end_list = [end_dates] * len(url_list) + else: + end_list = end_dates if len(end_dates) == len(url_list) else [end_dates[0]] * len(url_list) + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['post'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "profile_url" + } + + data = [] + for i in range(len(url_list)): + item = {"url": url_list[i]} + if start_list[i]: + item["start_date"] = start_list[i] + if end_list[i]: + item["end_date"] = end_list[i] + data.append(item) + + return self._make_request(api_url, headers, params, data, 'post search by profile', len(data), timeout) + + def _search_posts_by_company(self, company_urls, timeout): + """Search posts by company URL""" + if isinstance(company_urls, str): + url_list = [company_urls] + else: + url_list = company_urls + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['post'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "company_url" + } + + data = [{"url": url} for url in url_list] + return self._make_request(api_url, headers, params, data, 'post search by company', len(data), timeout) + + def _search_posts_by_url(self, urls, timeout): + """Search posts by general URL""" + if isinstance(urls, str): + url_list = [urls] + else: + url_list = urls + + api_url = "https://api.brightdata.com/datasets/v3/trigger" + + try: + from .. 
import __version__ + user_agent = f"brightdata-sdk/{__version__}" + except ImportError: + user_agent = "brightdata-sdk/unknown" + + headers = { + "Authorization": f"Bearer {self.linkedin_api.api_token}", + "Content-Type": "application/json", + "User-Agent": user_agent + } + params = { + "dataset_id": self.linkedin_api.DATASET_IDS['post'], + "include_errors": "true", + "type": "discover_new", + "discover_by": "url" + } + + data = [{"url": url} for url in url_list] + return self._make_request(api_url, headers, params, data, 'post search by URL', len(data), timeout) + + def _make_request(self, api_url, headers, params, data, operation_type, count, timeout): + """Common method to make API requests (async only for search operations)""" + try: + response = self.linkedin_api.session.post( + api_url, + headers=headers, + params=params, + json=data, + timeout=timeout or self.linkedin_api.default_timeout + ) + + if response.status_code == 401: + raise AuthenticationError("Invalid API token or insufficient permissions") + elif response.status_code != 200: + raise APIError(f"LinkedIn {operation_type} request failed with status {response.status_code}: {response.text}") + + result = response.json() + snapshot_id = result.get('snapshot_id') + if snapshot_id: + logger.info(f"LinkedIn {operation_type} job initiated successfully for {count} item(s)") + print("") + print("Snapshot ID:") + print(snapshot_id) + print("") + + return result + + except requests.exceptions.Timeout: + raise APIError(f"Timeout while initiating LinkedIn {operation_type}") + except requests.exceptions.RequestException as e: + raise APIError(f"Network error during LinkedIn {operation_type}: {str(e)}") + except json.JSONDecodeError as e: + raise APIError(f"Failed to parse LinkedIn {operation_type} response: {str(e)}") + except Exception as e: + if isinstance(e, (ValidationError, AuthenticationError, APIError)): + raise + raise APIError(f"Unexpected error during LinkedIn {operation_type}: {str(e)}") \ No newline at end of file diff --git a/src/api/scraper.py b/src/api/scraper.py new file mode 100644 index 0000000..0d4fc31 --- /dev/null +++ b/src/api/scraper.py @@ -0,0 +1,205 @@ +import time +from typing import Union, Dict, Any, List +from concurrent.futures import ThreadPoolExecutor, as_completed + +from ..utils import ( + validate_url, validate_zone_name, validate_country_code, + validate_timeout, validate_max_workers, validate_url_list, + validate_response_format, validate_http_method, retry_request, + get_logger, log_request, safe_json_parse, validate_response_size +) +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.scraper') + + +class WebScraper: + """Handles web scraping operations using Bright Data Web Unlocker API""" + + def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def scrape( + self, + url: Union[str, List[str]], + zone: str, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "markdown", + async_request: bool = False, + max_workers: int = 10, + timeout: int = None + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + **Unlock and scrape websites using Bright Data Web Unlocker API** + + Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass. 
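+
+        Transient failures (429 and 5xx responses) are retried automatically with backoff; anything else
+        surfaces as one of the exceptions listed under **Raises:** below. A minimal handling sketch
+        (the import path is illustrative and `client` is assumed to be an initialized `bdclient`):
+
+        ```python
+        from brightdata.exceptions import ValidationError, AuthenticationError, APIError
+
+        try:
+            html = client.scrape("https://example.com", zone="your_zone_name")
+        except ValidationError as e:
+            print(f"Invalid input: {e}")
+        except AuthenticationError as e:
+            print(f"Check your API token or zone permissions: {e}")
+        except APIError as e:
+            print(f"Request failed ({e.status_code}): {e}")
+        ```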
+ + **Parameters:** + - `url` (str | List[str]): Single URL string or list of URLs to scrape + - `zone` (str): Your Bright Data zone identifier + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`) + - `data_format` (str, optional): Additional format transformation (default: `"html"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + + **Returns:** + - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL + + **Example Usage:** + ```python + # Single URL scraping + result = client.scrape( + url="https://example.com", + zone="your_zone_name", + response_format="json" + ) + + # Multiple URLs scraping + urls = ["https://site1.com", "https://site2.com"] + results = client.scrape( + url=urls, + zone="your_zone_name", + response_format="raw", + max_workers=5 + ) + ``` + + **Raises:** + - `ValidationError`: Invalid URL format or empty URL list + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + + timeout = timeout or self.default_timeout + validate_zone_name(zone) + validate_response_format(response_format) + validate_http_method(method) + validate_country_code(country) + validate_timeout(timeout) + validate_max_workers(max_workers) + + if isinstance(url, list): + validate_url_list(url) + effective_max_workers = min(len(url), max_workers or 10) + + results = [None] * len(url) + + with ThreadPoolExecutor(max_workers=effective_max_workers) as executor: + future_to_index = { + executor.submit( + self._perform_single_scrape, + single_url, zone, response_format, method, country, + data_format, async_request, timeout + ): i + for i, single_url in enumerate(url) + } + for future in as_completed(future_to_index): + index = future_to_index[future] + try: + result = future.result() + results[index] = result + except Exception as e: + raise APIError(f"Failed to scrape {url[index]}: {str(e)}") + + return results + else: + validate_url(url) + return self._perform_single_scrape( + url, zone, response_format, method, country, + data_format, async_request, timeout + ) + + def _perform_single_scrape( + self, + url: str, + zone: str, + response_format: str, + method: str, + country: str, + data_format: str, + async_request: bool, + timeout: int + ) -> Union[Dict[str, Any], str]: + """ + Perform a single scrape operation with comprehensive logging + """ + endpoint = "https://api.brightdata.com/request" + start_time = time.time() + + logger.info(f"Starting scrape request for URL: {url[:100]}{'...' 
if len(url) > 100 else ''}") + + payload = { + "zone": zone, + "url": url, + "format": response_format, + "method": method, + "data_format": data_format + } + + params = {} + if async_request: + params['async'] = 'true' + + @retry_request( + max_retries=self.max_retries, + backoff_factor=self.retry_backoff, + retry_statuses={429, 500, 502, 503, 504} + ) + def make_request(): + return self.session.post( + endpoint, + json=payload, + params=params, + timeout=timeout + ) + + try: + response = make_request() + response_time = (time.time() - start_time) * 1000 + + # Log request details + log_request(logger, 'POST', endpoint, response.status_code, response_time) + + if response.status_code == 200: + logger.info(f"Scrape completed successfully in {response_time:.2f}ms") + + validate_response_size(response.text) + + if response_format == "json": + result = safe_json_parse(response.text) + logger.debug(f"Processed response with {len(str(result))} characters") + return result + else: + logger.debug(f"Returning raw response with {len(response.text)} characters") + return response.text + + elif response.status_code == 400: + logger.error(f"Bad Request (400) for URL {url}: {response.text}") + raise ValidationError(f"Bad Request (400): {response.text}") + elif response.status_code == 401: + logger.error(f"Unauthorized (401) for URL {url}: Check API token") + raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}") + elif response.status_code == 403: + logger.error(f"Forbidden (403) for URL {url}: Insufficient permissions") + raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}") + elif response.status_code == 404: + logger.error(f"Not Found (404) for URL {url}: {response.text}") + raise APIError(f"Not Found (404): {response.text}") + else: + logger.error(f"API Error ({response.status_code}) for URL {url}: {response.text}") + raise APIError(f"API Error ({response.status_code}): {response.text}", + status_code=response.status_code, response_text=response.text) + + except Exception as e: + response_time = (time.time() - start_time) * 1000 + logger.error(f"Request failed after {response_time:.2f}ms for URL {url}: {str(e)}", exc_info=True) + raise \ No newline at end of file diff --git a/src/api/search.py b/src/api/search.py new file mode 100644 index 0000000..24e6365 --- /dev/null +++ b/src/api/search.py @@ -0,0 +1,212 @@ +import json +import time +from typing import Union, Dict, Any, List +from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import quote_plus + +from ..utils import ( + validate_zone_name, validate_country_code, validate_timeout, + validate_max_workers, validate_search_engine, validate_query, + validate_response_format, validate_http_method, retry_request, + get_logger, log_request, safe_json_parse, validate_response_size +) +from ..exceptions import ValidationError, APIError, AuthenticationError + +logger = get_logger('api.search') + + +class SearchAPI: + """Handles search operations using Bright Data SERP API""" + + def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5): + self.session = session + self.default_timeout = default_timeout + self.max_retries = max_retries + self.retry_backoff = retry_backoff + + def search( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "markdown", + async_request: bool = False, + max_workers: int = 
10, + timeout: int = None, + parse: bool = False + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + ## Search the web using Bright Data SERP API + + Performs web searches through major search engines using Bright Data's proxy network + for reliable, bot-detection-free results. + + ### Parameters: + - `query` (str | List[str]): Search query string or list of search queries + - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`) + - `zone` (str, optional): Your Bright Data zone identifier (default: `None`) + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`) + - `data_format` (str, optional): Additional format transformation (default: `"markdown"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`) + + ### Returns: + - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query + + ### Example Usage: + ```python + # Single search query + result = client.search( + query="best laptops 2024", + search_engine="google", + response_format="json" + ) + + # Multiple search queries + queries = ["python tutorials", "machine learning courses", "web development"] + results = client.search( + query=queries, + search_engine="bing", + zone="your_zone_name", + max_workers=3 + ) + ``` + + ### Supported Search Engines: + - `"google"` - Google Search + - `"bing"` - Microsoft Bing + - `"yandex"` - Yandex Search + + ### Raises: + - `ValidationError`: Invalid search engine, empty query, or validation errors + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + + timeout = timeout or self.default_timeout + validate_zone_name(zone) + validate_search_engine(search_engine) + validate_query(query) + validate_response_format(response_format) + validate_http_method(method) + validate_country_code(country) + validate_timeout(timeout) + validate_max_workers(max_workers) + + base_url_map = { + "google": "https://www.google.com/search?q=", + "bing": "https://www.bing.com/search?q=", + "yandex": "https://yandex.com/search/?text=" + } + + base_url = base_url_map[search_engine.lower()] + + if isinstance(query, list): + effective_max_workers = min(len(query), max_workers or 10) + results = [None] * len(query) + + with ThreadPoolExecutor(max_workers=effective_max_workers) as executor: + future_to_index = { + executor.submit( + self._perform_single_search, + single_query, zone, response_format, method, country, + data_format, async_request, base_url, timeout, parse + ): i + for i, single_query in enumerate(query) + } + + for future in as_completed(future_to_index): + index = future_to_index[future] + try: + result = future.result() + results[index] = result + except Exception as e: + raise APIError(f"Failed to search '{query[index]}': {str(e)}") + + return results + else: + return 
self._perform_single_search( + query, zone, response_format, method, country, + data_format, async_request, base_url, timeout, parse + ) + + def _perform_single_search( + self, + query: str, + zone: str, + response_format: str, + method: str, + country: str, + data_format: str, + async_request: bool, + base_url: str, + timeout: int, + parse: bool + ) -> Union[Dict[str, Any], str]: + """ + Perform a single search operation + """ + encoded_query = quote_plus(query) + url = f"{base_url}{encoded_query}" + + if parse: + url += "&brd_json=1" + + endpoint = "https://api.brightdata.com/request" + + payload = { + "zone": zone, + "url": url, + "format": response_format, + "method": method, + "data_format": data_format + } + + params = {} + if async_request: + params['async'] = 'true' + + @retry_request( + max_retries=self.max_retries, + backoff_factor=self.retry_backoff, + retry_statuses={429, 500, 502, 503, 504} + ) + def make_request(): + return self.session.post( + endpoint, + json=payload, + params=params, + timeout=timeout + ) + + response = make_request() + + if response.status_code == 200: + if response_format == "json": + try: + return response.json() + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON response: {e}") + return response.text + else: + return response.text + + elif response.status_code == 400: + raise ValidationError(f"Bad Request (400): {response.text}") + elif response.status_code == 401: + raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}") + elif response.status_code == 403: + raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}") + elif response.status_code == 404: + raise APIError(f"Not Found (404): {response.text}") + else: + raise APIError(f"API Error ({response.status_code}): {response.text}", + status_code=response.status_code, response_text=response.text) \ No newline at end of file diff --git a/src/client.py b/src/client.py new file mode 100644 index 0000000..1d4f794 --- /dev/null +++ b/src/client.py @@ -0,0 +1,817 @@ +import os +import re +import time +import json +import requests +from .search import Search +from datetime import datetime +from .api.crawl import CrawlAPI +from .api.chatgpt import ChatGPTAPI +from .api.extract import ExtractAPI +from .api.download import DownloadAPI +from .api import WebScraper, SearchAPI +from typing import Union, Dict, Any, List +from .exceptions import ValidationError, AuthenticationError, APIError +from .api.linkedin import LinkedInAPI, LinkedInScraper, LinkedInSearcher +from .utils import ZoneManager, setup_logging, get_logger, parse_content + +def _get_version(): + """Get version from __init__.py, cached at module import time.""" + try: + init_file = os.path.join(os.path.dirname(__file__), '__init__.py') + with open(init_file, 'r', encoding='utf-8') as f: + for line in f: + if line.startswith('__version__'): + return line.split('"')[1] + except (OSError, IndexError): + pass + return "unknown" + +__version__ = _get_version() + +logger = get_logger('client') + + +class bdclient: + """Main client for the Bright Data SDK""" + + DEFAULT_MAX_WORKERS = 10 + DEFAULT_TIMEOUT = 30 + CONNECTION_POOL_SIZE = 20 + MAX_RETRIES = 3 + RETRY_BACKOFF_FACTOR = 1.5 + RETRY_STATUSES = {429, 500, 502, 503, 504} + + def __init__( + self, + api_token: str = None, + auto_create_zones: bool = True, + web_unlocker_zone: str = None, + serp_zone: str = None, + browser_zone: str = None, + browser_username: str = None, + browser_password: str = None, + 
browser_type: str = "playwright", + log_level: str = "INFO", + structured_logging: bool = True, + verbose: bool = None + ): + """ + Initialize the Bright Data client with your API token. + + Create an account at https://brightdata.com/ to get your API token. + Go to Settings > API Keys and verify that your key has "Admin" permissions. + + Args: + api_token: Your Bright Data API token (or set BRIGHTDATA_API_TOKEN env var) + auto_create_zones: Auto-create required zones if missing (default: True) + web_unlocker_zone: Custom Web Unlocker zone name (default: 'sdk_unlocker') + serp_zone: Custom SERP zone name (default: 'sdk_serp') + browser_zone: Custom Browser zone name (default: 'sdk_browser') + browser_username: Browser API username ("username-zone-{zone_name}") + browser_password: Browser API password + browser_type: "playwright", "puppeteer", or "selenium" (default: "playwright") + log_level: Logging level + structured_logging: Enable structured JSON logging + verbose: When True, show all logs per log_level. Can also use BRIGHTDATA_VERBOSE env var. + """ + + try: + from dotenv import load_dotenv + load_dotenv() + except ImportError: + pass + + if verbose is None: + env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower() + verbose = env_verbose in ('true', '1', 'yes', 'on') + + setup_logging(log_level, structured_logging, verbose) + logger.info("Initializing Bright Data SDK client") + + # API Token Validation + self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN') + if not self.api_token: + logger.error("API token not provided") + raise ValidationError( + "API token is required. Pass api_token or set BRIGHTDATA_API_TOKEN env var." + ) + + if not isinstance(self.api_token, str): + logger.error("API token must be a string") + raise ValidationError("API token must be a string") + + if len(self.api_token.strip()) < 10: + logger.error("API token appears to be invalid (too short)") + raise ValidationError("API token appears to be invalid") + + token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}" + logger.info(f"API token validated successfully: {token_preview}") + + self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker') + self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp') + self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser') + self.auto_create_zones = auto_create_zones + + self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME') + self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD') + + valid_browser_types = ["playwright", "puppeteer", "selenium"] + if browser_type not in valid_browser_types: + raise ValidationError( + f"Invalid browser_type '{browser_type}'. 
Must be one of: {valid_browser_types}" + ) + self.browser_type = browser_type + + if self.browser_username and self.browser_password: + browser_preview = f"{self.browser_username[:3]}***" + logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})") + elif self.browser_username or self.browser_password: + logger.warning("Incomplete browser credentials: both username and password are required.") + else: + logger.debug("No browser credentials provided - browser API will not be available.") + + self.session = requests.Session() + self.session.headers.update({ + 'Authorization': f'Bearer {self.api_token}', + 'Content-Type': 'application/json', + 'User-Agent': f'brightdata-sdk/{__version__}' + }) + logger.info("HTTP session configured with secure headers") + + adapter = requests.adapters.HTTPAdapter( + pool_connections=self.CONNECTION_POOL_SIZE, + pool_maxsize=self.CONNECTION_POOL_SIZE, + max_retries=0 + ) + self.session.mount('https://', adapter) + self.session.mount('http://', adapter) + + self.zone_manager = ZoneManager(self.session) + + self.web_scraper = WebScraper( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.search_api = SearchAPI( + self.session, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.chatgpt_api = ChatGPTAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.linkedin_api = LinkedInAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.download_api = DownloadAPI(self.session, self.api_token, self.DEFAULT_TIMEOUT) + + self.crawl_api = CrawlAPI( + self.session, + self.api_token, + self.DEFAULT_TIMEOUT, + self.MAX_RETRIES, + self.RETRY_BACKOFF_FACTOR + ) + + self.extract_api = ExtractAPI(self) + + self.search = Search(self) + + if self.auto_create_zones: + self.zone_manager.ensure_required_zones( + self.web_unlocker_zone, + self.serp_zone + ) + + + def scrape( + self, + url: Union[str, List[str]], + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + ## Unlock and scrape websites using Bright Data Web Unlocker API + + Scrapes one or multiple websites using Bright Data's Web Unlocker and proxy network. + Automatically handles bot-detection, CAPTCHAs, and retries. 
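+
+        A minimal usage sketch (URLs are placeholders; when `zone` is omitted the client's configured
+        `web_unlocker_zone` is used):
+
+        ```python
+        # Single page, returned as an HTML string by default
+        html = client.scrape("https://example.com")
+
+        # Several pages scraped in parallel, returned as structured JSON
+        pages = client.scrape(
+            ["https://example.com/a", "https://example.com/b"],
+            response_format="json",
+            max_workers=5
+        )
+        ```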
+ + ### Parameters: + - `url` (str | List[str]): Single URL string or list of URLs to scrape + - `zone` (str, optional): Zone identifier (default: auto-configured web_unlocker_zone) + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (defaults to fastest connection) + - `data_format` (str, optional): Additional format transformation (default: `"html"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + + ### Returns: + - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL + + ### Raises: + - `ValidationError`: Invalid URL or parameters + - `APIError`: Scraping failed (non-2xx response or server error) + """ + + # URL validation + + if not url: + raise ValidationError("The 'url' parameter cannot be None or empty.") + + if isinstance(url, str): + if not url.strip(): + raise ValidationError("The 'url' string cannot be empty or whitespace.") + elif isinstance(url, list): + if len(url) == 0: + raise ValidationError("URL list cannot be empty") + if any((not isinstance(u, str) or not u.strip()) for u in url): + raise ValidationError("All URLs in the list must be non-empty strings") + + result = self.web_scraper.scrape( + url, zone or self.web_unlocker_zone, response_format, method, country, + data_format, async_request, max_workers or self.DEFAULT_MAX_WORKERS, timeout or self.DEFAULT_TIMEOUT + ) + return result + + def search( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None, + parse: bool = False + ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]: + """ + ## Perform web search using Bright Data's SERP + + ### Parameters: + - `query` (str | List[str]): Search query string or list of search queries + - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`) + - `zone` (str, optional): Zone identifier (default: auto-configured serp_zone) + - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`) + - `method` (str, optional): HTTP method for the request (default: `"GET"`) + - `country` (str, optional): Two-letter ISO country code for proxy location (default: `"us"`) + - `data_format` (str, optional): Additional format transformation (default: `"html"`) + - `async_request` (bool, optional): Enable asynchronous processing (default: `False`) + - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`) + - `timeout` (int, optional): Request timeout in seconds (default: `30`) + - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`) + + ### Returns: + - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"` + - Multiple queries: `List[Union[Dict[str, Any], str]]` 
corresponding to each input query + + ### Raises: + - `ValidationError`: Query is missing or invalid + - `APIError`: Search request failed or returned an error + """ + + # Query validation + + if not query: + raise ValidationError("query cannot be empty") + if isinstance(query, str): + if not query.strip(): + raise ValidationError("Search query cannot be empty or whitespace") + elif isinstance(query, list): + if len(query) == 0: + raise ValidationError("Query list cannot be empty") + for q in query: + if not isinstance(q, str) or not q.strip(): + raise ValidationError("All queries in the list must be non-empty strings") + + # Validate search engine + + search_engine = (search_engine or "google").strip().lower() + valid_engines = ["google", "bing", "yandex"] + if search_engine not in valid_engines: + raise ValidationError(f"Invalid search engine '{search_engine}'. Valid options: {', '.join(valid_engines)}") + + zone = zone or self.serp_zone + max_workers = max_workers or self.DEFAULT_MAX_WORKERS + + result = self.search_api.search( + query=query, + search_engine=search_engine, + zone=zone or self.serp_zone, + response_format=response_format, + method=method, + country=country, + data_format=data_format, + async_request=async_request, + max_workers=max_workers, + timeout=timeout or self.DEFAULT_TIMEOUT, + parse=parse, + ) + + return result + + def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str: + """ + ## Download content to a file based on its format + + ### Args: + content: The content to download (dict for JSON, string for other formats) + filename: Optional filename. If not provided, generates one with timestamp + format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt") + parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False) + + ### Returns: + The file path of the saved file. + """ + if not content: + raise ValidationError("Content is empty or None") + return self.download_api.download_content(content, filename, response_format, parse) + + + def search_gpt( + self, + prompt: Union[str, List[str]], + country: Union[str, List[str]] = None, + additional_prompt: Union[str, List[str]] = None, + web_search: Union[bool, List[bool]] = False, + sync: bool = True, + timeout: int = None, + **kwargs + ) -> Dict[str, Any]: + + """ + ## Search ChatGPT responses using Bright Data's ChatGPT dataset API + + Sends one or multiple prompts to ChatGPT through Bright Data's proxy network + with support for both synchronous and asynchronous processing. + + ### Parameters: + - `prompt` (str | List[str]): Single prompt string or list of prompts to send to ChatGPT + - `country` (str | List[str], optional): Two-letter ISO country code(s) for proxy location (default: "") + - `additional_prompt` (str | List[str], optional): Follow-up prompt(s) after receiving the first answer (default: "") + - `web_search` (bool | List[bool], optional): Whether to click the web search button in ChatGPT (default: False) + - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing + + ### Returns: + - `Dict[str, Any]`: If sync=True, returns ChatGPT response data directly. 
If sync=False, returns response with snapshot_id for async processing + + ### Raises: + - `ValidationError`: Invalid prompt or parameters + - `AuthenticationError`: Invalid API token or insufficient permissions + - `APIError`: Request failed or server error + """ + + # Handle alternate parameter names from kwargs + + if 'secondaryPrompt' in kwargs: + additional_prompt = kwargs.pop('secondaryPrompt') + if 'additionalPrompt' in kwargs: + additional_prompt = kwargs.pop('additionalPrompt') + if 'webSearch' in kwargs: + web_search = kwargs.pop('webSearch') + + # Validate prompt input + + if (isinstance(prompt, list) and len(prompt) == 0) or prompt is None: + raise ValidationError("prompt is required") + + # Ensure prompts list + + prompts = prompt if isinstance(prompt, list) else [prompt] + + # Validate each prompt is a non-empty string + + for p in prompts: + if not isinstance(p, str) or not p.strip(): + raise ValidationError("All prompts must be non-empty strings") + + def normalize_param(param, name): + if param is None: + return [None] * len(prompts) + if isinstance(param, list): + if len(param) != len(prompts): + raise ValidationError(f"Length of {name} list must match number of prompts") + return param + return [param] * len(prompts) + + countries = normalize_param(country, "country") + followups = normalize_param(additional_prompt, "additional_prompt") + web_searches = normalize_param(web_search, "web_search") + + # Validate country codes + for i, c in enumerate(countries): + if c is None or str(c).strip() == "": + countries[i] = "" + else: + if not isinstance(c, str) or len(c.strip()) != 2 or not c.strip().isalpha(): + raise ValidationError("must be 2-letter code") + countries[i] = c.strip().lower() + # Validate follow-up prompts + for i, f in enumerate(followups): + if f is None: + followups[i] = "" + elif not isinstance(f, str): + raise ValidationError("All follow-up prompts must be strings") + else: + followups[i] = f.strip() + # Validate web_search flags + for i, w in enumerate(web_searches): + if w is None: + web_searches[i] = False + elif not isinstance(w, bool): + raise ValidationError("must be a boolean or list of booleans") + + timeout_value = timeout if timeout is not None else (65 if sync else 30) + + if timeout is not None: + if not isinstance(timeout, int): + raise ValidationError("Timeout must be an integer") + if timeout <= 0: + raise ValidationError("Timeout must be greater than 0 seconds") + if timeout > 300: + raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)") + # Prepare request payload + tasks = [] + for i in range(len(prompts)): + task = { + "url": "https://chatgpt.com", + "prompt": prompts[i].strip(), + "country": countries[i] or "", + "additional_prompt": followups[i] or "", + "web_search": bool(web_searches[i]) + } + tasks.append(task) + payload_data = tasks[0] if len(tasks) == 1 else tasks + # Make API request with retries + endpoint = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger" + params = { + "dataset_id": "gd_m7aof0k82r803d5bjm", + "include_errors": "true" + } + last_exception = None + for attempt in range(self.MAX_RETRIES + 1): + try: + response = self.session.post(endpoint, json=payload_data, timeout=timeout_value) + except requests.exceptions.RequestException as e: + last_exception = e + if attempt >= self.MAX_RETRIES: + raise NetworkError(f"Network error: {e}") + # Retry on network errors + time.sleep(self.RETRY_BACKOFF_FACTOR ** attempt) + continue + if 
response.status_code == 401: + raise AuthenticationError("Invalid API token or unauthorized") + if response.status_code in self.RETRY_STATUSES: + if attempt >= self.MAX_RETRIES: + raise RuntimeError("Failed after retries") + time.sleep(self.RETRY_BACKOFF_FACTOR ** attempt) + continue + if response.status_code != 200: + raise APIError(f"ChatGPT search failed with status {response.status_code}: {response.text}", status_code=response.status_code, response_text=getattr(response, 'text', '')) + # Success + result_data = response.json() + if sync: + return result_data + snapshot_id = result_data.get("snapshot_id") or result_data.get("id") + if snapshot_id: + print(f"Snapshot ID: {snapshot_id}") + return {"snapshot_id": snapshot_id} + else: + raise APIError("Failed to retrieve snapshot ID from response", status_code=response.status_code, response_text=response.text) + + + @property + def scrape_linkedin(self): + """ + ## LinkedIn Data Scraping Interface + + Provides specialized methods for scraping different types of LinkedIn data + using Bright Data's collect API with pre-configured dataset IDs. + + ### Available Methods: + - `profiles(url)` - Scrape LinkedIn profile data + - `companies(url)` - Scrape LinkedIn company data + - `jobs(url)` - Scrape LinkedIn job listing data + - `posts(url)` - Scrape LinkedIn post content + + ### Example Usage: + ```python + # Scrape LinkedIn profiles + result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/username/") + + # Scrape multiple companies + companies = [ + "https://www.linkedin.com/company/ibm", + "https://www.linkedin.com/company/bright-data" + ] + result = client.scrape_linkedin.companies(companies) + + # Scrape job listings + result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/123456/") + + # Scrape posts + result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/") + ``` + + ### Returns: + Each method returns a `Dict[str, Any]` containing snapshot_id and metadata for tracking the request. + Use the snapshot_id with `download_snapshot()` to retrieve the collected data. + """ + if not hasattr(self, '_linkedin_scraper'): + self._linkedin_scraper = LinkedInScraper(self.linkedin_api) + return self._linkedin_scraper + + @property + def search_linkedin(self): + """ + ## LinkedIn Data Search Interface + + Provides specialized methods for discovering new LinkedIn data by various search criteria + using Bright Data's collect API with pre-configured dataset IDs. + + ### Available Methods: + - `profiles(first_name, last_name)` - Search LinkedIn profiles by name + - `jobs(url=..., location=...)` - Search LinkedIn jobs by URL or keyword criteria + - `posts(profile_url=..., company_url=..., url=...)` - Search LinkedIn posts by various methods + + ### Example Usage: + ```python + # Search profiles by name + result = client.search_linkedin.profiles("James", "Smith") + + # Search jobs by location and keywords + result = client.search_linkedin.jobs( + location="Paris", + keyword="product manager", + country="FR" + ) + + # Search posts by profile URL with date range + result = client.search_linkedin.posts( + profile_url="https://www.linkedin.com/in/username", + start_date="2018-04-25T00:00:00.000Z", + end_date="2021-05-25T00:00:00.000Z" + ) + ``` + + ### Returns: + Each method returns a `Dict[str, Any]` containing snapshot_id (async) or direct data (sync) for tracking the request. + Use the snapshot_id with `download_snapshot()` to retrieve the collected data. 
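+
+        A polling sketch for the asynchronous flow (wait times are illustrative; the trigger response is
+        assumed to carry the `snapshot_id` that the SDK also prints, and `download_snapshot()` reports
+        `status="not_ready"` until collection finishes):
+
+        ```python
+        import time
+
+        job = client.search_linkedin.profiles("James", "Smith")
+        snapshot_id = job["snapshot_id"]
+
+        data = client.download_snapshot(snapshot_id)
+        while isinstance(data, dict) and data.get("status") == "not_ready":
+            time.sleep(30)
+            data = client.download_snapshot(snapshot_id)
+        ```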
+ """ + if not hasattr(self, '_linkedin_searcher'): + self._linkedin_searcher = LinkedInSearcher(self.linkedin_api) + return self._linkedin_searcher + + def download_snapshot( + self, + snapshot_id: str, + response_format: str = "json", + compress: bool = False, + batch_size: int = None, + part: int = None + ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]: + """ + ## Download snapshot content from Bright Data dataset API + + Downloads the snapshot content using the snapshot ID returned from scrape_chatGPT() + or other dataset collection triggers. + + ### Parameters: + - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required) + - `response_format` (str, optional): Format of the output data: "json", "csv", "ndjson", "jsonl" (default: "json") + - `compress` (bool, optional): Whether the result should be compressed (default: False) + - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000) + - `part` (int, optional): If batch_size provided, specify which part to download + + ### Returns: + - `Union[Dict, List, str]`: Snapshot data in the requested format, OR + - `Dict`: Status response if snapshot is not ready yet (status="not_ready") + + + ### Raises: + - `ValidationError`: Invalid parameters or snapshot_id format + - `APIError`: Request failed, snapshot not found, or server error + """ + + # snapshot_id validation + + if not snapshot_id or not isinstance(snapshot_id, str): + raise ValidationError("The 'snapshot_id' parameter must be a non-empty string.") + if not snapshot_id.startswith("s_"): + raise ValidationError("Invalid 'snapshot_id' format. Expected an ID starting with 's_' (e.g., 's_m4x7enmven8djfqak').") + + # format validation + + allowed_formats = {"json", "ndjson", "jsonl", "csv"} + if format not in allowed_formats: + raise ValueError( + f"Invalid 'format' value: '{format}'. Must be one of {sorted(allowed_formats)}." + ) + + return self.download_api.download_snapshot(snapshot_id, response_format, compress, batch_size, part) + + + def list_zones(self) -> List[Dict[str, Any]]: + """ + ## List all active zones in your Bright Data account + + ### Returns: + List of zone dictionaries with their configurations + """ + return self.zone_manager.list_zones() + + def connect_browser(self) -> str: + """ + ## Get WebSocket endpoint URL for connecting to Bright Data's scraping browser + + Returns the WebSocket endpoint URL that can be used with Playwright or Selenium + to connect to Bright Data's scraping browser service. + + **Security Warning:** The returned URL contains authentication credentials. Do not share this URL or expose it publicly. + + ### Returns: + WebSocket URL (str) for connecting to the browser (contains one-time token) + + ### Raises: + - `AuthenticationError`: If the API token or browser zone credentials are invalid + - `APIError`: If retrieving the browser endpoint fails + """ + + if not self.browser_username or not self.browser_password: + logger.error("Browser credentials not configured") + raise ValidationError( + "Browser credentials are required. Provide browser_username and browser_password " + "parameters or set BRIGHTDATA_BROWSER_USERNAME and BRIGHTDATA_BROWSER_PASSWORD " + "environment variables." 
+ ) + + if not isinstance(self.browser_username, str) or not isinstance(self.browser_password, str): + logger.error("Browser credentials must be strings") + raise ValidationError("Browser username and password must be strings") + + if len(self.browser_username.strip()) == 0 or len(self.browser_password.strip()) == 0: + logger.error("Browser credentials cannot be empty") + raise ValidationError("Browser username and password cannot be empty") + + auth_string = f"{self.browser_username}:{self.browser_password}" + + if self.browser_type == "selenium": + endpoint_url = f"https://{auth_string}@brd.superproxy.io:9515" + logger.debug(f"Browser endpoint URL: https://***:***@brd.superproxy.io:9515") + else: + endpoint_url = f"wss://{auth_string}@brd.superproxy.io:9222" + logger.debug(f"Browser endpoint URL: wss://***:***@brd.superproxy.io:9222") + + logger.info(f"Generated {self.browser_type} connection endpoint for user: {self.browser_username[:3]}***") + + return endpoint_url + + def crawl( + self, + url: Union[str, List[str]], + ignore_sitemap: bool = None, + depth: int = None, + include_filter: str = None, + exclude_filter: str = None, + custom_output_fields: List[str] = None, + include_errors: bool = True + ) -> Dict[str, Any]: + """ + ## Crawl websites using Bright Data's Web Crawl API + + Performs web crawling to discover and scrape multiple pages from a website + starting from the specified URL(s). Returns a snapshot_id for tracking the crawl progress. + + ### Parameters: + - `url` (str | List[str]): Starting URL or URLs to crawl + - `ignore_sitemap` (bool, optional): If True, ignore site's sitemap (default: False) + - `depth` (int, optional): Maximum crawl depth (number of hops from start URL) + - `include_filter` (str, optional): Only crawl URLs that include this substring (default: None) + - `exclude_filter` (str, optional): Do not crawl URLs that include this substring + - `custom_output_fields` (List[str], optional): Additional data fields to return (e.g., ["markdown","text","title"]) + - `include_errors` (bool, optional): If True, include pages that errored in results (default: True) + + ### Returns: + - A dict containing the crawl job details, including a `snapshot_id` to retrieve results via download_snapshot() + + ### Raises: + - `ValidationError`: Missing URL or invalid parameters + - `APIError`: Crawl request failed + """ + + # URL validation + + if not url: + raise ValidationError("The 'url' parameter cannot be None or empty.") + if isinstance(url, str): + if not url.strip(): + raise ValidationError("The 'url' string cannot be empty or whitespace.") + elif isinstance(url, list): + if len(url) == 0: + raise ValidationError("URL list cannot be empty") + for u in url: + if not isinstance(u, str) or not u.strip(): + raise ValidationError("All URLs in the list must be non-empty strings") + if depth is not None: + if not isinstance(depth, int): + raise ValidationError("Depth must be an integer") + if depth <= 0: + raise ValidationError("The 'depth' parameter must be a positive integer.") + + result = self.crawl_api.crawl( + url, ignore_sitemap, depth, include_filter, exclude_filter, custom_output_fields, include_errors + ) + return result + + def parse_content( + self, + data: Union[str, Dict, List], + extract_text: bool = True, + extract_links: bool = False, + extract_images: bool = False + ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """ + ## Parse content from API responses + + Extract and parse useful information from scraping, search, or crawling results. 
+ Automatically detects and handles both single and multiple results from batch operations. + + ### Parameters: + - `data` (str | Dict | List): Response data from scrape(), search(), or crawl() methods + - `extract_text` (bool, optional): Extract clean text content (default: True) + - `extract_links` (bool, optional): Extract all links from content (default: False) + - `extract_images` (bool, optional): Extract image URLs from content (default: False) + + ### Returns: + - `Dict[str, Any]`: Parsed content for single results + - `List[Dict[str, Any]]`: List of parsed content for multiple results (auto-detected) + """ + + return parse_content( + data=data, + extract_text=extract_text, + extract_links=extract_links, + extract_images=extract_images + ) + + def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> str: + """ + ## Extract specific information from websites using AI + + Combines web scraping with OpenAI's language models to extract targeted information + from web pages based on natural language queries. Automatically parses URLs and + optimizes content for efficient LLM processing. + + **LLM Key Notice:** If `llm_key` is not provided, the method will attempt to read + the OpenAI API key from the `OPENAI_API_KEY` environment variable. Ensure it is set. + + ### Parameters: + - `query` (str): Natural language query describing what to extract. If `url` parameter is provided, + the extraction will run on that URL. Otherwise, a prior scrape result should be provided. + - `url` (str | List[str], optional): Target page URL(s) to extract information from. Can be omitted if using a prior result. + - `output_scheme` (Dict, optional): JSON schema defining the structure of desired output (keys and value types) + - `llm_key` (str, optional): OpenAI API key for LLM usage (if not provided, will use environment variable) + + ### Returns: + - `str`: The extracted information as a text string (may contain JSON or markdown depending on query and output_scheme) + + ### Raises: + - `ValidationError`: Missing query, missing URL, or invalid LLM key + """ + + # Validate LLM key + if not llm_key: + raise ValidationError( + "Missing API key. Provide it via the `llm_key` parameter or set the " + "`BRIGHTDATA_API_TOKEN` environment variable. 
Example:\n\n" + "export BRIGHTDATA_API_TOKEN='your-openai-api-key'" + ) + + return self.extract_api.extract(query, url, output_scheme, llm_key) diff --git a/src/exceptions/__init__.py b/src/exceptions/__init__.py new file mode 100644 index 0000000..6554555 --- /dev/null +++ b/src/exceptions/__init__.py @@ -0,0 +1,17 @@ +from .errors import ( + BrightDataError, + ValidationError, + AuthenticationError, + ZoneError, + NetworkError, + APIError +) + +__all__ = [ + 'BrightDataError', + 'ValidationError', + 'AuthenticationError', + 'ZoneError', + 'NetworkError', + 'APIError' +] \ No newline at end of file diff --git a/src/exceptions/errors.py b/src/exceptions/errors.py new file mode 100644 index 0000000..1cf4425 --- /dev/null +++ b/src/exceptions/errors.py @@ -0,0 +1,31 @@ +class BrightDataError(Exception): + """Base exception for all Bright Data SDK errors""" + pass + + +class ValidationError(BrightDataError): + """Raised when input validation fails""" + pass + + +class AuthenticationError(BrightDataError): + """Raised when API authentication fails""" + pass + + +class ZoneError(BrightDataError): + """Raised when zone operations fail""" + pass + + +class NetworkError(BrightDataError): + """Raised when network operations fail""" + pass + + +class APIError(BrightDataError): + """Raised when API requests fail""" + def __init__(self, message, status_code=None, response_text=None): + super().__init__(message) + self.status_code = status_code + self.response_text = response_text \ No newline at end of file diff --git a/src/schemas/tst b/src/schemas/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/schemas/tst @@ -0,0 +1 @@ + diff --git a/src/search.py b/src/search.py new file mode 100644 index 0000000..215e65d --- /dev/null +++ b/src/search.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +import re +import time +from .exceptions import ValidationError, APIError +from typing import Any, Dict, List, Optional, Union + + +class SearchGPTResult: + """ + Wrapper for GPT search results. + + Args + + raw : Any - The raw API response object (dict / list / text). + text : str (property) - Best-effort extraction of the final answer text (T1). + prompt : Optional[str] - The original prompt (for single result). + country : Optional[str] - Country code used for the request (single result). + usage : Optional[Dict[str, Any]] - Token/usage metadata if available. + snapshot_id : Optional[str] - Present when sync=False (async job queued). + """ + + def __init__( + self, + raw: Any, + prompt: Optional[str] = None, + country: Optional[str] = None, + usage: Optional[Dict[str, Any]] = None, + snapshot_id: Optional[str] = None, + ) -> None: + self.raw = raw + self.prompt = prompt + self.country = country + self.usage = usage + self.snapshot_id = snapshot_id + + # helpers + @staticmethod + def _coalesce(*vals) -> Optional[str]: + for v in vals: + if isinstance(v, str) and v.strip(): + return v + return None + + @staticmethod + def _dig(d: Any, *keys) -> Any: + cur = d + for k in keys: + if isinstance(cur, dict) and k in cur: + cur = cur[k] + else: + return None + return cur + + @property + def text(self) -> Optional[str]: + """ + Best-effort extraction of ONLY the final answer text. + Tries common fields/paths seen in ChatGPT-like payloads. + Returns None if not found. 
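+
+        Illustrative usage (the exact payload shape depends on the dataset response, hence the None fallback):
+
+            result = client.search.gpt("What is new in Python 3.12?")
+            answer = result.text      # parsed final answer, or None
+            payload = result.raw      # full raw response for manual inspection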
+ """ + raw = self.raw + + # If API returned a plain string + if isinstance(raw, str): + return raw.strip() or None + + t = self._dig(raw, "answer") + if isinstance(t, str): + return t.strip() or None + + t = self._dig(raw, "data", "answer") + if isinstance(t, str): + return t.strip() or None + + t = self._dig(raw, "message", "content") + if isinstance(t, str): + return t.strip() or None + + choices = self._dig(raw, "choices") + if isinstance(choices, list) and choices: + content = self._dig(choices[0], "message", "content") + if isinstance(content, str): + return content.strip() or None + + content = choices[0].get("text") if isinstance(choices[0], dict) else None + if isinstance(content, str): + return content.strip() or None + + t = self._dig(raw, "result") + if isinstance(t, str): + return t.strip() or None + t = self._dig(raw, "output") + if isinstance(t, str): + return t.strip() or None + + for key in ("content", "text", "final", "final_text"): + v = self._dig(raw, key) + if isinstance(v, str): + return v.strip() or None + + return None + + def to_dict(self) -> Dict[str, Any]: + return { + "prompt": self.prompt, + "country": self.country, + "usage": self.usage, + "snapshot_id": self.snapshot_id, + "raw": self.raw, + "text": self.text, + } + + +class Search: + """ + Namespaced search interface. + """ + + def __init__(self, client) -> None: + self._c = client # root client (reuses session, APIs, zones) + + def __call__( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None, + parse: bool = False, + ): + return self.web( + query=query, + search_engine=search_engine, + zone=zone, + response_format=response_format, + method=method, + country=country, + data_format=data_format, + async_request=async_request, + max_workers=max_workers, + timeout=timeout, + parse=parse, + ) + + # GPT + def gpt( + self, + prompt: Union[str, List[str]], + country: Union[str, List[str]] = None, + secondary_prompt: Union[str, List[str]] = None, + web_search: Union[bool, List[bool]] = False, + sync: bool = True, + timeout: int = None, + ) -> Union[SearchGPTResult, List[SearchGPTResult]]: + """ + Query ChatGPT via Bright Data's dataset API. + + Returns - Single object for single prompt, list for multiple prompts (M2). + """ + prompts: List[str] + if isinstance(prompt, str): + prompts = [prompt] + elif isinstance(prompt, list) and all(isinstance(p, str) for p in prompt): + prompts = prompt + else: + raise ValidationError("Invalid prompt input: must be a non-empty string or list of strings.") + if not prompts: + raise ValidationError("At least one prompt is required.") + + # normalization helper + def _norm(param, name): + if param is None: + return [None] * len(prompts) + if isinstance(param, list): + if len(param) != len(prompts): + raise ValidationError(f"{name} list must have the same length as prompts.") + return param + return [param] * len(prompts) + + countries = _norm(country, "country") + secondary_prompts = _norm(secondary_prompt, "secondary_prompt") + web_searches = _norm(web_search, "web_search") + + # validation + for c in countries: + if c and not re.match(r"^[A-Z]{2}$", c): + raise ValidationError(f"Invalid country code '{c}'. 
Must be 2 uppercase letters.") + for s in secondary_prompts: + if s is not None and not isinstance(s, str): + raise ValidationError("Secondary prompts must be strings.") + for w in web_searches: + if not isinstance(w, bool): + raise ValidationError("Web search flags must be boolean.") + if timeout is not None and (not isinstance(timeout, int) or timeout <= 0): + raise ValidationError("Timeout must be a positive integer.") + + timeout = timeout or (65 if sync else 30) + + # retries around API call + max_retries = 3 + last_err = None + for attempt in range(max_retries): + try: + result = self._c.chatgpt_api.scrape_chatgpt( + prompts=prompts, + countries=countries, + additional_prompts=secondary_prompts, + web_searches=web_searches, + sync=sync, + timeout=timeout, + ) + # Wrap result(s) + if not sync: + # Async: expect {"snapshot_id": "...", ...} + snapshot_id = result.get("snapshot_id") if isinstance(result, dict) else None + return SearchGPTResult(raw=result, snapshot_id=snapshot_id) + + if isinstance(result, list): + out: List[SearchGPTResult] = [] + if len(result) == len(prompts): + for i, item in enumerate(result): + out.append( + SearchGPTResult( + raw=item, + prompt=prompts[i], + country=countries[i], + usage=None, + ) + ) + else: + for item in result: + out.append(SearchGPTResult(raw=item)) + return out[0] if len(prompts) == 1 and len(out) == 1 else out + + return SearchGPTResult(raw=result, prompt=prompts[0] if len(prompts) == 1 else None) + + except APIError as e: + last_err = e + if attempt < max_retries - 1: + time.sleep(2) + continue + raise e + except Exception as e: + if isinstance(e, (ValidationError, APIError)): + raise + last_err = e + if attempt < max_retries - 1: + time.sleep(2) + continue + raise APIError(f"Unexpected error in search.gpt: {e}") from e + + if last_err: + raise last_err + raise APIError("Unknown error in search.gpt") + + # Web (SERP) + def web( + self, + query: Union[str, List[str]], + search_engine: str = "google", + zone: str = None, + response_format: str = "raw", + method: str = "GET", + country: str = "", + data_format: str = "html", + async_request: bool = False, + max_workers: int = None, + timeout: int = None, + parse: bool = False, + ): + """ + Web/SERP search wrapper. Thin pass-through to SearchAPI with validation. + """ + if not query: + raise ValueError("The 'query' parameter cannot be None or empty.") + if isinstance(query, str): + if not query.strip(): + raise ValueError("The 'query' string cannot be empty or whitespace.") + elif isinstance(query, list): + if not all(isinstance(q, str) and q.strip() for q in query): + raise ValueError("All queries in the list must be non-empty strings.") + else: + raise TypeError("The 'query' parameter must be a string or a list of strings.") + + zone = zone or self._c.serp_zone + max_workers = max_workers or self._c.DEFAULT_MAX_WORKERS + + return self._c.search_api.search( + query, search_engine, zone, response_format, method, country, + data_format, async_request, max_workers, timeout, parse + ) + + # LinkedIn + @property + def linkedin(self): + """ + Namespaced LinkedIn search helpers. 
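+
+        Reached through the search namespace (a sketch, assuming the client
+        exposes this interface as ``client.search``); the concrete helper
+        methods live on the client's ``search_linkedin`` attribute:
+
+            linkedin = client.search.linkedin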
+ """ + return self._c.search_linkedin diff --git a/src/types/tst b/src/types/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/types/tst @@ -0,0 +1 @@ + diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..75a2f6c --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,35 @@ +from .validation import ( + validate_url, validate_zone_name, validate_country_code, + validate_timeout, validate_max_workers, validate_url_list, + validate_search_engine, validate_query, validate_response_format, + validate_http_method +) +from .retry import retry_request +from .zone_manager import ZoneManager +from .logging_config import setup_logging, get_logger, log_request +from .response_validator import safe_json_parse, validate_response_size, check_response_not_empty +from .parser import parse_content, parse_multiple, extract_structured_data + +__all__ = [ + 'validate_url', + 'validate_zone_name', + 'validate_country_code', + 'validate_timeout', + 'validate_max_workers', + 'validate_url_list', + 'validate_search_engine', + 'validate_query', + 'validate_response_format', + 'validate_http_method', + 'retry_request', + 'ZoneManager', + 'setup_logging', + 'get_logger', + 'log_request', + 'safe_json_parse', + 'validate_response_size', + 'check_response_not_empty', + 'parse_content', + 'parse_multiple', + 'extract_structured_data' +] \ No newline at end of file diff --git a/src/utils/logging_config.py b/src/utils/logging_config.py new file mode 100644 index 0000000..89289da --- /dev/null +++ b/src/utils/logging_config.py @@ -0,0 +1,177 @@ +""" +Structured logging configuration for Bright Data SDK +""" +import logging +import json +import time +from typing import Dict, Any +import uuid + + +class StructuredFormatter(logging.Formatter): + """Custom formatter that outputs structured JSON logs""" + + def __init__(self): + super().__init__() + self.start_time = time.time() + + def format(self, record): + log_data = { + 'timestamp': self.formatTime(record), + 'level': record.levelname, + 'logger': record.name, + 'message': record.getMessage(), + 'module': record.module, + 'function': record.funcName, + 'line': record.lineno + } + + correlation_id = getattr(record, 'correlation_id', None) + if correlation_id: + log_data['correlation_id'] = correlation_id + + if hasattr(record, 'url'): + log_data['url'] = record.url + if hasattr(record, 'method'): + log_data['method'] = record.method + if hasattr(record, 'status_code'): + log_data['status_code'] = record.status_code + if hasattr(record, 'response_time'): + log_data['response_time_ms'] = record.response_time + + if record.exc_info: + log_data['exception'] = { + 'type': record.exc_info[0].__name__ if record.exc_info[0] else None, + 'message': str(record.exc_info[1]) if record.exc_info[1] else None, + 'traceback': self.formatException(record.exc_info) + } + + log_data = self._sanitize_log_data(log_data) + + return json.dumps(log_data, default=str) + + def _sanitize_log_data(self, log_data: Dict[str, Any]) -> Dict[str, Any]: + """Remove or mask sensitive information from log data""" + sensitive_keys = ['authorization', 'token', 'api_token', 'password', 'secret'] + + def sanitize_value(key: str, value: Any) -> Any: + if isinstance(key, str) and any(sensitive in key.lower() for sensitive in sensitive_keys): + return "***REDACTED***" + elif isinstance(value, str) and len(value) > 20: + if value.isalnum() and len(value) > 32: + return f"{value[:8]}***REDACTED***{value[-4:]}" + return value + + def 
recursive_sanitize(obj): + if isinstance(obj, dict): + return {k: recursive_sanitize(sanitize_value(k, v)) for k, v in obj.items()} + elif isinstance(obj, list): + return [recursive_sanitize(item) for item in obj] + else: + return obj + + return recursive_sanitize(log_data) + + +def setup_logging(level: str = "INFO", structured: bool = True, verbose: bool = True) -> None: + """ + Setup logging configuration for the SDK + + Args: + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + structured: Whether to use structured JSON logging + verbose: Whether to show verbose logging (default: True) + When False, only WARNING and above are shown + When True, uses the specified level + """ + if not verbose: + log_level = logging.WARNING + else: + log_level = getattr(logging, level.upper(), logging.INFO) + + root_logger = logging.getLogger('brightdata') + root_logger.handlers.clear() + + handler = logging.StreamHandler() + handler.setLevel(log_level) + + if structured: + formatter = StructuredFormatter() + else: + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + handler.setFormatter(formatter) + root_logger.addHandler(handler) + root_logger.setLevel(log_level) + + root_logger.propagate = False + + +def get_logger(name: str) -> logging.Logger: + """ + Get a logger instance with the specified name + + Args: + name: Logger name + + Returns: + Configured logger instance + """ + return logging.getLogger(f'brightdata.{name}') + + +def log_request(logger: logging.Logger, method: str, url: str, + status_code: int = None, response_time: float = None, + correlation_id: str = None) -> None: + """ + Log HTTP request details + + Args: + logger: Logger instance + method: HTTP method + url: Request URL (will be sanitized) + status_code: HTTP response status code + response_time: Response time in milliseconds + correlation_id: Request correlation ID + """ + extra = { + 'method': method, + 'url': _sanitize_url(url), + 'correlation_id': correlation_id or str(uuid.uuid4()) + } + + if status_code is not None: + extra['status_code'] = status_code + if response_time is not None: + extra['response_time'] = response_time + + if status_code and status_code >= 400: + logger.error(f"HTTP request failed: {method} {_sanitize_url(url)}", extra=extra) + else: + logger.info(f"HTTP request: {method} {_sanitize_url(url)}", extra=extra) + + +def _sanitize_url(url: str) -> str: + """Sanitize URL to remove sensitive query parameters""" + try: + from urllib.parse import urlparse, parse_qs, urlencode, urlunparse + + parsed = urlparse(url) + query_params = parse_qs(parsed.query) + + sensitive_params = ['token', 'api_key', 'secret', 'password'] + for param in sensitive_params: + if param in query_params: + query_params[param] = ['***REDACTED***'] + + sanitized_query = urlencode(query_params, doseq=True) + sanitized = urlunparse(( + parsed.scheme, parsed.netloc, parsed.path, + parsed.params, sanitized_query, parsed.fragment + )) + + return sanitized + except Exception: + return url.split('?')[0] + ('?***PARAMS_REDACTED***' if '?' in url else '') \ No newline at end of file diff --git a/src/utils/parser.py b/src/utils/parser.py new file mode 100644 index 0000000..686ad39 --- /dev/null +++ b/src/utils/parser.py @@ -0,0 +1,264 @@ +""" +Content parsing utilities for Bright Data SDK responses + +Provides functions to extract and parse content from scraping and search results. 
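+
+Typical standalone usage (a minimal sketch; the inline HTML stands in for a real
+scrape() or search() response, and the import assumes the ``brightdata.utils``
+package path used by the tests):
+
+    from brightdata.utils import parse_content
+
+    parsed = parse_content(
+        "<html><title>Hi</title><body><p>Hello</p></body></html>",
+        extract_links=True,
+    )
+    print(parsed["title"], parsed["text"], parsed["links"])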
+""" +import json +import re +from typing import Any, Dict, List, Union, Optional + +from bs4 import BeautifulSoup + + +def parse_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """ + Parse content from Bright Data API responses + + Automatically detects and handles both single and multiple results from scrape/search operations. + Can be used as a standalone function or called from the client. + + Args: + data: Response data from scrape() or search() - can be JSON dict/list or HTML string + extract_text: Extract clean text content (default: True) + extract_links: Extract all links from content (default: False) + extract_images: Extract image URLs from content (default: False) + + Returns: + Dict containing parsed content for single results, or List[Dict] for multiple results with keys: + - 'type': 'json' or 'html' + - 'text': Cleaned text content (if extract_text=True) + - 'links': List of extracted links (if extract_links=True) + - 'images': List of image URLs (if extract_images=True) + - 'title': Page title (if available) + - 'raw_length': Length of original content + - 'structured_data': Original JSON data (if type='json') + """ + if _is_multiple_results(data): + return parse_multiple(data, extract_text=extract_text, extract_links=extract_links, extract_images=extract_images) + + return _parse_single_content(data, extract_text, extract_links, extract_images) + + +def parse_multiple(data_list: List[Union[str, Dict]], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> List[Dict[str, Any]]: + """ + Parse multiple content items (useful for batch scraping results) + + Args: + data_list: List of response data items + extract_text: Extract clean text content (default: True) + extract_links: Extract all links from content (default: False) + extract_images: Extract image URLs from content (default: False) + + Returns: + List of parsed content dictionaries + """ + if not isinstance(data_list, list): + return [] + + return [_parse_single_content(item, extract_text, extract_links, extract_images) for item in data_list] + + +def _is_multiple_results(data: Union[str, Dict, List]) -> bool: + """ + Detect if data contains multiple scraping/search results + + Args: + data: Response data to analyze + + Returns: + True if data appears to be multiple results, False otherwise + """ + if not isinstance(data, list): + return False + + if len(data) <= 1: + return False + + multiple_result_indicators = 0 + + for item in data[:3]: + if isinstance(item, dict): + common_keys = {'html', 'body', 'content', 'page_html', 'raw_html', 'url', 'status_code'} + if any(key in item for key in common_keys): + multiple_result_indicators += 1 + elif isinstance(item, str) and len(item) > 100: + if '= 2 + + +def _parse_single_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Dict[str, Any]: + """ + Parse single content item from Bright Data API responses + + Args: + data: Single response data item - can be JSON dict or HTML string + extract_text: Extract clean text content (default: True) + extract_links: Extract all links from content (default: False) + extract_images: Extract image URLs from content (default: False) + + Returns: + Dict containing parsed content + """ + result = { + 'type': None, + 'raw_length': 0, + 'title': None + } + + if data is None: + return result + + if 
isinstance(data, (dict, list)): + result['type'] = 'json' + result['structured_data'] = data + result['raw_length'] = len(str(data)) + + html_content = _extract_html_from_json(data) + if html_content and (extract_text or extract_links or extract_images): + _parse_html_content(html_content, result, extract_text, extract_links, extract_images) + + result['title'] = _extract_title_from_json(data) + + elif isinstance(data, str): + result['type'] = 'html' + result['raw_length'] = len(data) + + if extract_text or extract_links or extract_images: + _parse_html_content(data, result, extract_text, extract_links, extract_images) + + return result + + +def extract_structured_data(data: Union[str, Dict, List]) -> Optional[Dict]: + """ + Extract structured data (JSON-LD, microdata) from content + + Args: + data: Response data + + Returns: + Structured data if found, None otherwise + """ + html_content = None + + if isinstance(data, str): + html_content = data + elif isinstance(data, (dict, list)): + html_content = _extract_html_from_json(data) + + if not html_content: + return None + + try: + soup = BeautifulSoup(html_content, 'html.parser') + + scripts = soup.find_all('script', type='application/ld+json') + if scripts: + structured_data = [] + for script in scripts: + try: + data = json.loads(script.string) + structured_data.append(data) + except json.JSONDecodeError: + continue + if structured_data: + return {'json_ld': structured_data} + + except Exception: + pass + + return None + + +def _extract_html_from_json(data: Union[Dict, List]) -> Optional[str]: + """Extract HTML content from JSON response structure""" + if isinstance(data, dict): + html_keys = ['html', 'body', 'content', 'page_html', 'raw_html'] + for key in html_keys: + if key in data and isinstance(data[key], str): + return data[key] + + for value in data.values(): + if isinstance(value, (dict, list)): + html = _extract_html_from_json(value) + if html: + return html + + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + html = _extract_html_from_json(item) + if html: + return html + + return None + + +def _extract_title_from_json(data: Union[Dict, List]) -> Optional[str]: + """Extract title from JSON response structure""" + if isinstance(data, dict): + title_keys = ['title', 'page_title', 'name'] + for key in title_keys: + if key in data and isinstance(data[key], str): + return data[key].strip() + + for value in data.values(): + if isinstance(value, (dict, list)): + title = _extract_title_from_json(value) + if title: + return title + + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + title = _extract_title_from_json(item) + if title: + return title + + return None + + +def _parse_html_content(html: str, result: Dict, extract_text: bool, extract_links: bool, extract_images: bool): + """Parse HTML content and update result dictionary""" + try: + soup = BeautifulSoup(html, 'html.parser') + + if not result.get('title'): + title_tag = soup.find('title') + if title_tag: + result['title'] = title_tag.get_text().strip() + + if extract_text: + for script in soup(["script", "style"]): + script.decompose() + + text = soup.get_text() + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + result['text'] = '\n'.join(chunk for chunk in chunks if chunk) + + if extract_links: + links = [] + for a_tag in soup.find_all('a', href=True): + href = a_tag['href'] + text = a_tag.get_text().strip() + 
links.append({'url': href, 'text': text}) + result['links'] = links + + if extract_images: + images = [] + for img_tag in soup.find_all('img', src=True): + src = img_tag['src'] + alt = img_tag.get('alt', '').strip() + images.append({'url': src, 'alt': alt}) + result['images'] = images + + except Exception as e: + if extract_text: + result['text'] = f"HTML parsing failed: {str(e)}" + if extract_links: + result['links'] = [] + if extract_images: + result['images'] = [] \ No newline at end of file diff --git a/src/utils/response_validator.py b/src/utils/response_validator.py new file mode 100644 index 0000000..83a9aa7 --- /dev/null +++ b/src/utils/response_validator.py @@ -0,0 +1,49 @@ +""" +Minimal response validation utilities for Bright Data SDK +""" +import json +from typing import Any, Dict, Union +from ..exceptions import ValidationError + + +def safe_json_parse(response_text: str) -> Dict[str, Any]: + """ + Safely parse JSON response with minimal validation + + Args: + response_text: Raw response text from API + + Returns: + Parsed JSON data or original text if parsing fails + """ + if not response_text: + return {} + + try: + return json.loads(response_text) + except (json.JSONDecodeError, TypeError): + # Return original text if JSON parsing fails + return response_text + + +def validate_response_size(response_text: str, max_size_mb: float = 100.0) -> None: + """ + Quick size check to prevent memory issues + + Args: + response_text: Response text to validate + max_size_mb: Maximum allowed size in megabytes + """ + if response_text and len(response_text) > (max_size_mb * 1024 * 1024): + raise ValidationError(f"Response too large (>{max_size_mb}MB)") + + +def check_response_not_empty(data: Any) -> None: + """ + Minimal check that response contains data + + Args: + data: Response data to check + """ + if data is None or (isinstance(data, str) and len(data.strip()) == 0): + raise ValidationError("Empty response received") \ No newline at end of file diff --git a/src/utils/retry.py b/src/utils/retry.py new file mode 100644 index 0000000..361645a --- /dev/null +++ b/src/utils/retry.py @@ -0,0 +1,90 @@ +import time +import random +import requests +from functools import wraps +from ..exceptions import NetworkError, APIError + + +def retry_request(max_retries=3, backoff_factor=1.5, retry_statuses=None, max_backoff=60): + """ + Decorator for retrying requests with exponential backoff and jitter + + Args: + max_retries: Maximum number of retry attempts + backoff_factor: Exponential backoff multiplier + retry_statuses: HTTP status codes that should trigger retries + max_backoff: Maximum backoff time in seconds + """ + if retry_statuses is None: + retry_statuses = {429, 500, 502, 503, 504} + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + last_exception = None + + for attempt in range(max_retries + 1): # +1 to include initial attempt + try: + response = func(*args, **kwargs) + + # Check if we should retry based on status code + if hasattr(response, 'status_code') and response.status_code in retry_statuses: + if attempt >= max_retries: + raise APIError( + f"Server error after {max_retries} retries: HTTP {response.status_code}", + status_code=response.status_code, + response_text=getattr(response, 'text', '') + ) + + # Calculate backoff with jitter + backoff_time = min(backoff_factor ** attempt, max_backoff) + jitter = backoff_time * 0.1 * random.random() # Add up to 10% jitter + total_delay = backoff_time + jitter + + time.sleep(total_delay) + continue + + return response + 
+ except requests.exceptions.ConnectTimeout as e: + last_exception = NetworkError(f"Connection timeout: {str(e)}") + except requests.exceptions.ReadTimeout as e: + last_exception = NetworkError(f"Read timeout: {str(e)}") + except requests.exceptions.Timeout as e: + last_exception = NetworkError(f"Request timeout: {str(e)}") + except requests.exceptions.ConnectionError as e: + # Handle DNS resolution, connection refused, etc. + if "Name or service not known" in str(e): + last_exception = NetworkError(f"DNS resolution failed: {str(e)}") + elif "Connection refused" in str(e): + last_exception = NetworkError(f"Connection refused: {str(e)}") + else: + last_exception = NetworkError(f"Connection error: {str(e)}") + except requests.exceptions.SSLError as e: + last_exception = NetworkError(f"SSL/TLS error: {str(e)}") + except requests.exceptions.ProxyError as e: + last_exception = NetworkError(f"Proxy error: {str(e)}") + except requests.exceptions.RequestException as e: + last_exception = NetworkError(f"Network error: {str(e)}") + except Exception as e: + # Catch any other unexpected exceptions + last_exception = NetworkError(f"Unexpected error: {str(e)}") + + # If this was the last attempt, raise the exception + if attempt >= max_retries: + raise last_exception + + # Calculate backoff with jitter for network errors + backoff_time = min(backoff_factor ** attempt, max_backoff) + jitter = backoff_time * 0.1 * random.random() + total_delay = backoff_time + jitter + + time.sleep(total_delay) + + # This should never be reached, but just in case + if last_exception: + raise last_exception + return None + + return wrapper + return decorator \ No newline at end of file diff --git a/src/utils/tst b/src/utils/tst new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/utils/tst @@ -0,0 +1 @@ + diff --git a/src/utils/validation.py b/src/utils/validation.py new file mode 100644 index 0000000..938cb43 --- /dev/null +++ b/src/utils/validation.py @@ -0,0 +1,183 @@ +from urllib.parse import urlparse +from typing import Union, List +from ..exceptions import ValidationError + + +def validate_url(url: str) -> None: + """Validate URL format with comprehensive checks""" + if not isinstance(url, str): + raise ValidationError(f"URL must be a string, got {type(url).__name__}") + + if not url.strip(): + raise ValidationError("URL cannot be empty or whitespace") + + # Check URL length + if len(url) > 8192: # Common URL length limit + raise ValidationError("URL exceeds maximum length of 8192 characters") + + try: + parsed = urlparse(url.strip()) + if not parsed.scheme: + raise ValidationError(f"URL must include a scheme (http/https): {url}") + if parsed.scheme.lower() not in ['http', 'https']: + raise ValidationError(f"URL scheme must be http or https, got: {parsed.scheme}") + if not parsed.netloc: + raise ValidationError(f"URL must include a valid domain: {url}") + # Check for suspicious characters + if any(char in url for char in ['<', '>', '"', "'"]): + raise ValidationError("URL contains invalid characters") + except Exception as e: + if isinstance(e, ValidationError): + raise + raise ValidationError(f"Invalid URL format '{url}': {str(e)}") + + +def validate_zone_name(zone: str = None) -> None: + """Validate zone name format with enhanced checks""" + if zone is None: + return # Zone can be None (optional parameter) + + if not isinstance(zone, str): + raise ValidationError(f"Zone name must be a string, got {type(zone).__name__}") + + zone = zone.strip() + if not zone: + raise ValidationError("Zone name cannot 
be empty or whitespace") + + if len(zone) < 3: + raise ValidationError("Zone name must be at least 3 characters long") + + if len(zone) > 63: + raise ValidationError("Zone name must not exceed 63 characters") + + if not zone.replace('_', '').replace('-', '').isalnum(): + raise ValidationError("Zone name can only contain letters, numbers, hyphens, and underscores") + + if zone.startswith('-') or zone.endswith('-'): + raise ValidationError("Zone name cannot start or end with a hyphen") + + if zone.startswith('_') or zone.endswith('_'): + raise ValidationError("Zone name cannot start or end with an underscore") + + +def validate_country_code(country: str) -> None: + """Validate ISO country code format""" + if not isinstance(country, str): + raise ValidationError(f"Country code must be a string, got {type(country).__name__}") + + country = country.strip().lower() + if len(country) == 0: + return + + if len(country) != 2: + raise ValidationError("Country code must be exactly 2 characters (ISO 3166-1 alpha-2) or empty") + + if not country.isalpha(): + raise ValidationError("Country code must contain only letters") + + +def validate_timeout(timeout: int) -> None: + """Validate timeout value""" + if timeout is None: + return # Timeout can be None (use default) + + if not isinstance(timeout, int): + raise ValidationError(f"Timeout must be an integer, got {type(timeout).__name__}") + + if timeout <= 0: + raise ValidationError("Timeout must be greater than 0 seconds") + + if timeout > 300: # 5 minutes max + raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)") + + +def validate_max_workers(max_workers: int) -> None: + """Validate max_workers parameter""" + if max_workers is None: + return # Can be None (use default) + + if not isinstance(max_workers, int): + raise ValidationError(f"max_workers must be an integer, got {type(max_workers).__name__}") + + if max_workers <= 0: + raise ValidationError("max_workers must be greater than 0") + + if max_workers > 50: # Reasonable upper limit + raise ValidationError("max_workers cannot exceed 50 (to prevent resource exhaustion)") + + +def validate_url_list(urls: List[str], max_urls: int = 100) -> None: + """Validate list of URLs with size limits""" + if not isinstance(urls, list): + raise ValidationError(f"URL list must be a list, got {type(urls).__name__}") + + if len(urls) == 0: + raise ValidationError("URL list cannot be empty") + + if len(urls) > max_urls: + raise ValidationError(f"URL list cannot contain more than {max_urls} URLs") + + for i, url in enumerate(urls): + try: + validate_url(url) + except ValidationError as e: + raise ValidationError(f"Invalid URL at index {i}: {str(e)}") + + +def validate_search_engine(search_engine: str) -> None: + """Validate search engine parameter""" + if not isinstance(search_engine, str): + raise ValidationError(f"Search engine must be a string, got {type(search_engine).__name__}") + + valid_engines = ['google', 'bing', 'yandex'] + search_engine = search_engine.strip().lower() + + if search_engine not in valid_engines: + raise ValidationError(f"Invalid search engine '{search_engine}'. 
Valid options: {', '.join(valid_engines)}") + + +def validate_query(query: Union[str, List[str]]) -> None: + """Validate search query parameter""" + if isinstance(query, str): + if not query.strip(): + raise ValidationError("Search query cannot be empty or whitespace") + if len(query) > 2048: + raise ValidationError("Search query cannot exceed 2048 characters") + elif isinstance(query, list): + if len(query) == 0: + raise ValidationError("Query list cannot be empty") + if len(query) > 50: # Reasonable limit + raise ValidationError("Query list cannot contain more than 50 queries") + for i, q in enumerate(query): + if not isinstance(q, str): + raise ValidationError(f"Query at index {i} must be a string, got {type(q).__name__}") + if not q.strip(): + raise ValidationError(f"Query at index {i} cannot be empty or whitespace") + if len(q) > 2048: + raise ValidationError(f"Query at index {i} cannot exceed 2048 characters") + else: + raise ValidationError(f"Query must be a string or list of strings, got {type(query).__name__}") + + +def validate_response_format(response_format: str) -> None: + """Validate response format parameter""" + if not isinstance(response_format, str): + raise ValidationError(f"Response format must be a string, got {type(response_format).__name__}") + + valid_formats = ['json', 'raw'] + response_format = response_format.strip().lower() + + if response_format not in valid_formats: + raise ValidationError(f"Invalid response format '{response_format}'. Valid options: {', '.join(valid_formats)}") + + +def validate_http_method(method: str) -> None: + """Validate HTTP method parameter""" + if not isinstance(method, str): + raise ValidationError(f"HTTP method must be a string, got {type(method).__name__}") + + valid_methods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH'] + method = method.strip().upper() + + if method not in valid_methods: + raise ValidationError(f"Invalid HTTP method '{method}'. Valid options: {', '.join(valid_methods)}") \ No newline at end of file diff --git a/src/utils/zone_manager.py b/src/utils/zone_manager.py new file mode 100644 index 0000000..82a1205 --- /dev/null +++ b/src/utils/zone_manager.py @@ -0,0 +1,174 @@ +import requests +import json +import logging +import time +from ..exceptions import ZoneError, NetworkError, APIError +from .retry import retry_request + +logger = logging.getLogger(__name__) + + +class ZoneManager: + """Manages Bright Data zones - creation and validation""" + + def __init__(self, session: requests.Session): + self.session = session + + def ensure_required_zones(self, web_unlocker_zone: str, serp_zone: str): + """ + Check if required zones exist and create them if they don't. + Raises exceptions on failure instead of silently continuing. 
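+
+        Direct usage sketch (normally invoked by the client during setup; the
+        session must already carry the account's API token, and the zone names
+        shown are the SDK defaults assumed here):
+
+            manager = ZoneManager(session)
+            manager.ensure_required_zones("sdk_unlocker", "sdk_serp")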
+ """ + try: + logger.info("Checking existing zones...") + zones = self._get_zones_with_retry() + zone_names = {zone.get('name') for zone in zones} + logger.info(f"Found {len(zones)} existing zones") + + zones_to_create = [] + if web_unlocker_zone not in zone_names: + zones_to_create.append((web_unlocker_zone, 'unblocker')) + logger.info(f"Need to create web unlocker zone: {web_unlocker_zone}") + + if serp_zone not in zone_names: + zones_to_create.append((serp_zone, 'serp')) + logger.info(f"Need to create SERP zone: {serp_zone}") + + if not zones_to_create: + logger.info("All required zones already exist") + return + + for zone_name, zone_type in zones_to_create: + logger.info(f"Creating zone: {zone_name} (type: {zone_type})") + self._create_zone_with_retry(zone_name, zone_type) + logger.info(f"Successfully created zone: {zone_name}") + + self._verify_zones_created([zone[0] for zone in zones_to_create]) + + except (ZoneError, NetworkError, APIError): + raise + except requests.exceptions.RequestException as e: + logger.error(f"Network error while ensuring zones exist: {e}") + raise NetworkError(f"Failed to ensure zones due to network error: {str(e)}") + except json.JSONDecodeError as e: + logger.error(f"Invalid JSON response while checking zones: {e}") + raise ZoneError(f"Invalid response format from zones API: {str(e)}") + except Exception as e: + logger.error(f"Unexpected error while ensuring zones exist: {e}") + raise ZoneError(f"Unexpected error during zone creation: {str(e)}") + + @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504}) + def _get_zones_with_retry(self): + """Get zones list with retry logic for network issues""" + response = self.session.get('https://api.brightdata.com/zone/get_active_zones') + + if response.status_code == 200: + try: + return response.json() or [] + except json.JSONDecodeError as e: + raise ZoneError(f"Invalid JSON response from zones API: {str(e)}") + elif response.status_code == 401: + raise ZoneError("Unauthorized (401): Check your API token and ensure it has proper permissions") + elif response.status_code == 403: + raise ZoneError("Forbidden (403): API token lacks sufficient permissions for zone operations") + else: + raise ZoneError(f"Failed to list zones ({response.status_code}): {response.text}") + + @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504}) + def _create_zone_with_retry(self, zone_name: str, zone_type: str): + """ + Create a new zone in Bright Data with retry logic + + Args: + zone_name: Name for the new zone + zone_type: Type of zone ('unblocker' or 'serp') + """ + if zone_type == "serp": + plan_config = { + "type": "unblocker", + "serp": True + } + else: + plan_config = { + "type": zone_type + } + + payload = { + "plan": plan_config, + "zone": { + "name": zone_name, + "type": zone_type + } + } + + response = self.session.post( + 'https://api.brightdata.com/zone', + json=payload + ) + + if response.status_code in [200, 201]: + logger.info(f"Zone creation successful: {zone_name}") + return response + elif response.status_code == 409 or "Duplicate zone name" in response.text or "already exists" in response.text.lower(): + logger.info(f"Zone {zone_name} already exists - this is expected") + return response + elif response.status_code == 401: + raise ZoneError(f"Unauthorized (401): API token invalid or lacks permissions to create zone '{zone_name}'") + elif response.status_code == 403: + raise ZoneError(f"Forbidden (403): API token lacks permissions to create 
zone '{zone_name}'. Note: sdk_unlocker and sdk_serp zones should be allowed for all permissions.") + elif response.status_code == 400: + raise ZoneError(f"Bad request (400) creating zone '{zone_name}': {response.text}") + else: + raise ZoneError(f"Failed to create zone '{zone_name}' ({response.status_code}): {response.text}") + + def _verify_zones_created(self, zone_names: list): + """ + Verify that zones were successfully created by checking the zones list + """ + max_attempts = 3 + for attempt in range(max_attempts): + try: + logger.info(f"Verifying zone creation (attempt {attempt + 1}/{max_attempts})") + time.sleep(1) + + zones = self._get_zones_with_retry() + existing_zone_names = {zone.get('name') for zone in zones} + + missing_zones = [name for name in zone_names if name not in existing_zone_names] + + if not missing_zones: + logger.info("All zones verified successfully") + return + + if attempt == max_attempts - 1: + raise ZoneError(f"Zone verification failed: zones {missing_zones} not found after creation") + + logger.warning(f"Zones not yet visible: {missing_zones}. Retrying verification...") + + except (ZoneError, NetworkError): + if attempt == max_attempts - 1: + raise + logger.warning(f"Zone verification attempt {attempt + 1} failed, retrying...") + time.sleep(2 ** attempt) + + def _create_zone(self, zone_name: str, zone_type: str): + """ + Legacy method - kept for backward compatibility + Use _create_zone_with_retry instead for new code + """ + return self._create_zone_with_retry(zone_name, zone_type) + + def list_zones(self): + """ + List all active zones in your Bright Data account + + Returns: + List of zone dictionaries with their configurations + """ + try: + return self._get_zones_with_retry() + except (ZoneError, NetworkError): + raise + except Exception as e: + logger.error(f"Unexpected error listing zones: {e}") + raise ZoneError(f"Unexpected error while listing zones: {str(e)}") \ No newline at end of file diff --git a/tests/test_client.py b/tests/test_client.py index 51b1315..6dbc285 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -15,11 +15,18 @@ - Testing validation logic and error messages """ +<<<<<<< HEAD import pytest import os from unittest.mock import patch from brightdata import bdclient +======= +import os +import pytest +from brightdata import bdclient +from unittest.mock import patch +>>>>>>> mywork/main from brightdata.exceptions import ValidationError @@ -116,6 +123,149 @@ def mock_post(*args, **kwargs): request_data = captured_request.get('json', {}) assert "&brd_json=1" in request_data["url"] +<<<<<<< HEAD if __name__ == "__main__": - pytest.main([__file__]) \ No newline at end of file + pytest.main([__file__]) +======= +class TestClientSearchGPT: + """Tests for the client.search_gpt() function""" + + @pytest.fixture + @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones') + def client(self, mock_zones): + """Create a test client with mocked validation""" + with patch.dict(os.environ, {}, clear=True): + client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False) + return client + + # VALIDATION TESTS + + def test_prompt_required(self, client): + """Ensure ValidationError is raised when prompt is missing""" + with pytest.raises(ValidationError, match="prompt is required"): + client.search_gpt(prompt=None) + + def test_invalid_country_format(self, client): + """Reject invalid country codes""" + with pytest.raises(ValidationError, match="must be 2-letter code"): + client.search_gpt(prompt="hi", 
country="USA") + + def test_websearch_bool_validation(self, client): + """Reject invalid webSearch parameter type""" + with pytest.raises(ValidationError, match="must be a boolean or list of booleans"): + client.search_gpt(prompt="hi", webSearch="yes") + + # PARAMETER NORMALIZATION + + def test_normalizes_single_values_to_list(self, client, monkeypatch): + """Convert single parameters to list form""" + # Mock the session.post to intercept and provide dummy response + def dummy_post(url, json=None, timeout=None): + from unittest.mock import Mock + r = Mock() + r.status_code = 200 + # Build dummy response with normalized lists + if isinstance(json, list): + prompts = [item.get('prompt', '') for item in json] + countries = [item.get('country', '') for item in json] + sec_prompts = [item.get('additional_prompt', '') for item in json] + web_searches = [item.get('web_search', False) for item in json] + else: + prompts = [json.get('prompt', '')] + countries = [json.get('country', '')] + sec_prompts = [json.get('additional_prompt', '')] + web_searches = [json.get('web_search', False)] + r.json.return_value = { + 'prompt': prompts, + 'country': countries, + 'secondaryPrompt': sec_prompts, + 'webSearch': web_searches + } + return r + monkeypatch.setattr(client.search_api.session, 'post', dummy_post) + result = client.search_gpt( + prompt="hello", + country="US", + secondaryPrompt="follow up", + webSearch=False, + sync=True + ) + # The internal normalized payload should contain lists + assert isinstance(result["prompt"], list) + assert isinstance(result["country"], list) + assert isinstance(result["secondaryPrompt"], list) + assert isinstance(result["webSearch"], list) + + # MOCKED API CALL TESTS + + def test_sync_request_success(self, client, monkeypatch): + """Ensure sync request returns expected payload""" + mock_response = {"status": "ok", "data": "result"} + captured_payload = {} + + def mock_post(url, json=None, timeout=None): + captured_payload.update(json or {}) + from unittest.mock import Mock + r = Mock() + r.status_code = 200 + r.json.return_value = mock_response + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + + response = client.search_gpt(prompt="Hello", country="US", sync=True) + assert response == mock_response + assert captured_payload["url"] == "https://chatgpt.com" + assert "prompt" in captured_payload + assert "country" in captured_payload + + def test_async_request_timeout(self, client, monkeypatch): + """Ensure async mode uses correct timeout""" + captured_args = {} + + def mock_post(url, json=None, timeout=None): + captured_args["timeout"] = timeout + from unittest.mock import Mock + r = Mock() + r.status_code = 200 + r.json.return_value = {"id": "s_testid"} + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + + client.search_gpt(prompt="Async test", sync=False) + assert captured_args["timeout"] == 30 # default async timeout + + # ERROR AND RETRY HANDLING + + def test_retry_on_failure(self, client, monkeypatch): + """Test that request is retried on temporary failure""" + call_count = {"n": 0} + + def mock_post(url, json=None, timeout=None): + call_count["n"] += 1 + from unittest.mock import Mock + r = Mock() + r.status_code = 500 if call_count["n"] == 1 else 200 + r.json.return_value = {"ok": True} + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + result = client.search_gpt(prompt="retry", sync=True) + assert result["ok"] is True + assert call_count["n"] == 2 # retried once + + def 
test_raises_error_after_max_retries(self, client, monkeypatch): + """Ensure error is raised after exceeding retries""" + def mock_post(url, json=None, timeout=None): + from unittest.mock import Mock + r = Mock() + r.status_code = 500 + r.json.return_value = {"error": "server error"} + return r + + monkeypatch.setattr(client.search_api.session, "post", mock_post) + with pytest.raises(RuntimeError, match="Failed after retries"): + client.search_gpt(prompt="fail test", sync=True) +>>>>>>> mywork/main