diff --git a/README.md b/README.md
index 04fa2cc..4252525 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,11 @@
Python SDK by Bright Data, Easy-to-use scalable methods for web search & scraping
## Installation
@@ -10,11 +14,19 @@ To install the package, open your terminal:
```python
pip install brightdata-sdk
```
-> If using macOS, first open a virtual environment for your project
-## Quick Start
-Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key
+> If using macOS, first open a virtual environment for your project.
+
+## Quick Start
+
+Create a [Bright Data](https://brightdata.com/cp/setting/) account and copy your API key.
### Initialize the Client
@@ -25,7 +37,11 @@ client = bdclient(api_token="your_api_token_here") # can also be defined as BRIG
```
### Launch first request
-Add to your code a serp function
+Add a SERP function to your code.
```python
results = client.search("best selling shoes")
@@ -38,10 +54,17 @@ print(client.parse_content(results))
| Feature | Functions | Description
|--------------------------|-----------------------------|-------------------------------------
-| **Scrape every website** | `scrape` | Scrape every website using Bright's scraping and unti bot-detection capabilities
-| **Web search** | `search` | Search google and other search engines by query (supports batch searches)
+| **Scrape any website** | `scrape` | Scrape any website using Bright Data's scraping and anti-bot detection capabilities
+| **Web search (SERP)** | `search` | Search Google and other search engines by query (supports batch searches)
| **Web crawling** | `crawl` | Discover and scrape multiple pages from websites with advanced filtering and depth control
| **AI-powered extraction** | `extract` | Extract specific information from websites using natural language queries and OpenAI
| **Content parsing** | `parse_content` | Extract text, links, images and structured data from API responses (JSON or HTML)
| **Browser automation** | `connect_browser` | Get WebSocket endpoint for Playwright/Selenium integration with Bright Data's scraping browser
-| **Search chatGPT** | `search_chatGPT` | Prompt chatGPT and scrape its answers, support multiple inputs and follow-up prompts
+| **Search chatGPT** | `search_chatGPT` | Prompt chatGPT and scrape its answers; supports multiple inputs and follow-up prompts
@@ -51,11 +74,19 @@ print(client.parse_content(results))
-| **Client class** | `bdclient` | Handles authentication, automatic zone creation and managment, and options for robust error handling
-| **Parallel processing** | **all functions** | All functions use Concurrent processing for multiple URLs or queries, and support multiple Output Formats
+| **Client class** | `bdclient` | Handles authentication, automatic zone creation and management, and options for robust error handling
+| **Parallel processing** | **all functions** | All functions use concurrent processing for multiple URLs or queries, and support multiple output formats
-### Try usig one of the functions
-#### `Search()`
+### Try using one of the functions
+
+#### `search()`
```python
# Simple single query search
result = client.search("pizza restaurants")
# Try using multiple queries (parallel processing), with custom configuration
@@ -69,7 +100,11 @@ results = client.search(
```
#### `scrape()`
```python
# Simple single URL scrape
result = client.scrape("https://example.com")
# Multiple URLs (parallel processing) with custom options
@@ -83,6 +118,7 @@ results = client.scrape(
```
#### `search_chatGPT()`
```python
result = client.search_chatGPT(
prompt="what day is it today?"
# prompt=["What are the top 3 programming languages in 2024?", "Best hotels in New York", "Explain quantum computing"],
@@ -90,6 +126,22 @@ result = client.search_chatGPT(
)
-client.download_content(result) # In case of timeout error, your snapshot_id is presented and you will downloaded it using download_snapshot()
+client.download_content(result) # On a timeout error, your snapshot_id is printed so you can download it later with download_snapshot()
```
#### `search_linkedin.`
@@ -125,7 +177,11 @@ print(results) # will print the snapshot_id, which can be downloaded using the d
result = client.crawl(
url="https://example.com/",
depth=2,
filter="/product/", # Only crawl URLs containing "/product/"
exclude_filter="/ads/", # Exclude URLs containing "/ads/"
custom_output_fields=["markdown", "url", "page_title"]
)
@@ -202,12 +258,23 @@ client.download_content(data)
```
**`download_snapshot`** (for async requests)
```python
-# Save this function to seperate file
-client.download_snapshot("") # Insert your snapshot_id
+# Save this call to a separate file
+snapshot_id = "your_snapshot_id_here"  # <-- Replace with your actual snapshot ID
+client.download_snapshot(snapshot_id)
```
> [!TIP]
-> Hover over the "search" or each function in the package, to see all its available parameters.
+> Hover over `search` or any other function in the package to see all its available parameters.
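+
+**`connect_browser`** (browser automation)
+
+`connect_browser()` returns the WebSocket endpoint of Bright Data's scraping browser. A minimal sketch of one way to use it with Playwright (assumes Playwright is installed and the endpoint is CDP-compatible):
+
+```python
+from playwright.sync_api import sync_playwright
+
+endpoint = client.connect_browser()  # WebSocket endpoint from your bdclient
+
+with sync_playwright() as p:
+    browser = p.chromium.connect_over_cdp(endpoint)  # attach to the remote scraping browser
+    page = browser.new_page()
+    page.goto("https://example.com")
+    print(page.title())
+    browser.close()
+```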

@@ -251,7 +318,11 @@ Discover and scrape multiple pages from websites with advanced filtering.
- `url`: Single URL string or list of URLs to crawl (required)
- `ignore_sitemap`: Ignore sitemap when crawling (optional)
- `depth`: Maximum crawl depth relative to entered URL (optional)
- `filter`: Regex to include only certain URLs (e.g. "/product/")
- `exclude_filter`: Regex to exclude certain URLs (e.g. "/ads/")
- `custom_output_fields`: List of output fields to include (optional)
- `include_errors`: Include errors in response (default: True)
diff --git a/pyproject.toml b/pyproject.toml
index 0991d9f..d1c2c48 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,9 @@ Repository = "https://github.com/brightdata/bright-data-sdk-python"
"Bug Reports" = "https://github.com/brightdata/bright-data-sdk-python/issues"
Changelog = "https://github.com/brightdata/bright-data-sdk-python/blob/main/CHANGELOG.md"
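+
+# Package code lives under src/ (src layout)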
+[tool.setuptools]
+package-dir = {"" = "src"}
+
[tool.setuptools.packages.find]
include = ["brightdata*"]
exclude = ["tests*"]
@@ -134,4 +137,4 @@ filterwarnings = [
"error",
"ignore::UserWarning",
"ignore::DeprecationWarning",
-]
\ No newline at end of file
+]
diff --git a/setup.py b/setup.py
index a662168..502f386 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,4 @@
"""
Setup script for Bright Data SDK
@@ -51,6 +52,29 @@ def read_version():
install_requires=[
"requests>=2.25.0",
"python-dotenv>=0.19.0",
+        "aiohttp>=3.8.0",
+        "beautifulsoup4>=4.9.0",
+        "openai>=1.0.0",
],
extras_require={
"dev": [
@@ -59,6 +83,7 @@ def read_version():
"black>=21.0.0",
"isort>=5.0.0",
"flake8>=3.8.0",
],
},
keywords="brightdata, web scraping, proxy, serp, api, data extraction",
@@ -67,4 +92,36 @@ def read_version():
"Documentation": "https://github.com/brightdata/brightdata-sdk-python#readme",
"Source": "https://github.com/brightdata/brightdata-sdk-python",
},
-)
\ No newline at end of file
+)
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..c815a6c
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,82 @@
+"""
+## Bright Data SDK for Python
+
+A comprehensive SDK for Bright Data's Web Scraping and SERP APIs, providing
+easy-to-use methods for web scraping, search engine result parsing, and data management.
+## Functions:
+First import the package and create a client:
+```python
+from brightdata import bdclient
+client = bdclient(api_token="your-api-key")
+```
+Then use the client to call the desired functions:
+#### scrape()
+- Scrapes a website using the Bright Data Web Unlocker API with proxy support (or multiple websites sequentially)
+- syntax: `results = client.scrape(url, country, max_workers, ...)`
+#### .scrape_linkedin. class
+- Scrapes LinkedIn data including posts, jobs, companies, and profiles, returning structured data as a result
+- syntax: `results = client.scrape_linkedin.posts()/jobs()/companies()/profiles() # insert parameters per function`
+#### search()
+- Performs web searches using Bright Data SERP API with customizable search engines (or multiple search queries sequentially)
+- syntax: `results = client.search(query, search_engine, country, ...)`
+#### .search_linkedin. class
+- Searches LinkedIn for specific posts, jobs, and profiles, returning the relevant data as a result
+- syntax: `results = client.search_linkedin.posts()/jobs()/profiles() # insert parameters per function`
+#### search_chatGPT()
+- Interact with ChatGPT using Bright Data's ChatGPT API, sending prompts and receiving responses
+- syntax: `results = client.search_chatGPT(prompt, additional_prompt, max_workers, ...)`
+#### download_content() / download_snapshot()
+- Saves the scraped content to local files in various formats (JSON, CSV, etc.)
+- syntax: `client.download_content(results)`
+- syntax: `client.download_snapshot(results)`
+#### connect_browser()
+- Get WebSocket endpoint for connecting to Bright Data's scraping browser with Playwright/Selenium
+- syntax: `endpoint_url = client.connect_browser()` then use with browser automation tools
+#### crawl()
+- Crawl websites to discover and scrape multiple pages using Bright Data's Web Crawl API
+- syntax: `result = client.crawl(url, filter, exclude_filter, depth, ...)`
+#### parse_content()
+- Parse and extract useful information from API responses (JSON or HTML)
+- syntax: `parsed = client.parse_content(data, extract_text=True, extract_links=True)`
+
+### Features:
+- Web Scraping: Scrape websites using Bright Data Web Unlocker API with proxy support
+- Search Engine Results: Perform web searches using Bright Data SERP API
+- Web Crawling: Discover and scrape multiple pages from websites with advanced filtering
+- Content Parsing: Extract text, links, images, and structured data from API responses
+- Browser Automation: Simple authentication for Bright Data's scraping browser with Playwright/Selenium
+- Multiple Search Engines: Support for Google, Bing, and Yandex
+- Parallel Processing: Concurrent processing for multiple URLs or queries
+- Robust Error Handling: Comprehensive error handling with retry logic
+- Input Validation: Automatic validation of URLs, zone names, and parameters
+- Zone Management: Automatic zone creation and management
+- Multiple Output Formats: JSON, raw HTML, markdown, and more
+"""
+
+from .client import bdclient
+from .exceptions import (
+ BrightDataError,
+ ValidationError,
+ AuthenticationError,
+ ZoneError,
+ NetworkError,
+ APIError
+)
+from .utils import parse_content, parse_multiple, extract_structured_data
+
+__version__ = "1.1.3"
+__author__ = "Bright Data"
+__email__ = "support@brightdata.com"
+
+__all__ = [
+ 'bdclient',
+ 'BrightDataError',
+ 'ValidationError',
+ 'AuthenticationError',
+ 'ZoneError',
+ 'NetworkError',
+ 'APIError',
+ 'parse_content',
+ 'parse_multiple',
+ 'extract_structured_data'
+]
\ No newline at end of file
diff --git a/src/api/__init__.py b/src/api/__init__.py
new file mode 100644
index 0000000..a79c0fd
--- /dev/null
+++ b/src/api/__init__.py
@@ -0,0 +1,13 @@
+from .scraper import WebScraper
+from .search import SearchAPI
+from .chatgpt import ChatGPTAPI
+from .linkedin import LinkedInAPI
+from .crawl import CrawlAPI
+
+__all__ = [
+ 'WebScraper',
+ 'SearchAPI',
+ 'ChatGPTAPI',
+ 'LinkedInAPI',
+ 'CrawlAPI'
+]
\ No newline at end of file
diff --git a/src/api/chatgpt.py b/src/api/chatgpt.py
new file mode 100644
index 0000000..e9edb90
--- /dev/null
+++ b/src/api/chatgpt.py
@@ -0,0 +1,126 @@
+import json
+import requests
+from typing import Union, Dict, Any, List
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.chatgpt')
+
+
+class ChatGPTAPI:
+ """Handles ChatGPT scraping operations using Bright Data's ChatGPT dataset API"""
+
+ def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def scrape_chatgpt(
+ self,
+ prompts: List[str],
+ countries: List[str],
+ additional_prompts: List[str],
+ web_searches: List[bool],
+ sync: bool = True,
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ Internal method to handle ChatGPT scraping API requests
+
+ Parameters:
+ - prompts: List of prompts to send to ChatGPT
+ - countries: List of country codes matching prompts
+ - additional_prompts: List of follow-up prompts matching prompts
+ - web_searches: List of web_search flags matching prompts
+ - sync: If True, uses synchronous API for immediate results
+ - timeout: Request timeout in seconds
+
+ Returns:
+ - Dict containing response with snapshot_id or direct data (if sync=True)
+ """
+ url = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger"
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": "gd_m7aof0k82r803d5bjm",
+ "include_errors": "true"
+ }
+
+ data = [
+ {
+ "url": "https://chatgpt.com/",
+ "prompt": prompts[i],
+ "country": countries[i],
+ "additional_prompt": additional_prompts[i],
+ "web_search": web_searches[i]
+ }
+ for i in range(len(prompts))
+ ]
+
+ try:
+ response = self.session.post(
+ url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or (65 if sync else self.default_timeout)
+ )
+
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code != 200:
+ raise APIError(f"ChatGPT scraping request failed with status {response.status_code}: {response.text}")
+
+ if sync:
+ response_text = response.text
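+                # The sync endpoint may return newline-delimited JSON (one object per line); if so, parse each line separately.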
+ if '\n{' in response_text and response_text.strip().startswith('{'):
+ json_objects = []
+ for line in response_text.strip().split('\n'):
+ if line.strip():
+ try:
+ json_objects.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ result = json_objects
+ else:
+ try:
+ result = response.json()
+ except json.JSONDecodeError:
+ result = response_text
+
+ logger.info(f"ChatGPT data retrieved synchronously for {len(prompts)} prompt(s)")
+ print(f"Retrieved {len(result) if isinstance(result, list) else 1} ChatGPT response(s)")
+ else:
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ if snapshot_id:
+ logger.info(f"ChatGPT scraping job initiated successfully for {len(prompts)} prompt(s)")
+ print("")
+ print("Snapshot ID:")
+ print(snapshot_id)
+ print("")
+
+ return result
+
+ except requests.exceptions.Timeout:
+ raise APIError("Timeout while initiating ChatGPT scraping")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during ChatGPT scraping: {str(e)}")
+ except json.JSONDecodeError as e:
+ raise APIError(f"Failed to parse ChatGPT scraping response: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during ChatGPT scraping: {str(e)}")
\ No newline at end of file
diff --git a/src/api/crawl.py b/src/api/crawl.py
new file mode 100644
index 0000000..4fe047a
--- /dev/null
+++ b/src/api/crawl.py
@@ -0,0 +1,175 @@
+import json
+from typing import Union, Dict, Any, List, Optional
+from ..utils import get_logger, validate_url
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.crawl')
+
+
+class CrawlAPI:
+ """Handles crawl operations using Bright Data's Web Crawl API"""
+
+ CRAWL_DATASET_ID = "gd_m6gjtfmeh43we6cqc"
+
+ AVAILABLE_OUTPUT_FIELDS = [
+ "markdown", "url", "html2text", "page_html", "ld_json",
+ "page_title", "timestamp", "input", "discovery_input",
+ "error", "error_code", "warning", "warning_code"
+ ]
+
+ def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def crawl(
+ self,
+ url: Union[str, List[str]],
+ ignore_sitemap: Optional[bool] = None,
+ depth: Optional[int] = None,
+ filter: Optional[str] = None,
+ exclude_filter: Optional[str] = None,
+ custom_output_fields: Optional[List[str]] = None,
+ include_errors: bool = True
+ ) -> Dict[str, Any]:
+ """
+ ## Crawl websites using Bright Data's Web Crawl API
+
+ Performs web crawling to discover and scrape multiple pages from a website
+ starting from the specified URL(s).
+
+ ### Parameters:
+ - `url` (str | List[str]): Domain URL(s) to crawl (required)
+ - `ignore_sitemap` (bool, optional): Ignore sitemap when crawling
+ - `depth` (int, optional): Maximum depth to crawl relative to the entered URL
+ - `filter` (str, optional): Regular expression to include only certain URLs (e.g. "/product/")
+ - `exclude_filter` (str, optional): Regular expression to exclude certain URLs (e.g. "/ads/")
+ - `custom_output_fields` (List[str], optional): Custom output schema fields to include
+ - `include_errors` (bool, optional): Include errors in response (default: True)
+
+ ### Returns:
+ - `Dict[str, Any]`: Crawl response with snapshot_id for tracking
+
+ ### Example Usage:
+ ```python
+ # Single URL crawl
+ result = client.crawl("https://example.com/")
+
+ # Multiple URLs with filters
+ urls = ["https://example.com/", "https://example2.com/"]
+ result = client.crawl(
+ url=urls,
+ filter="/product/",
+ exclude_filter="/ads/",
+ depth=2,
+ ignore_sitemap=True
+ )
+
+ # Custom output schema
+ result = client.crawl(
+ url="https://example.com/",
+ custom_output_fields=["markdown", "url", "page_title"]
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid URL or parameters
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+ if isinstance(url, str):
+ urls = [url]
+ elif isinstance(url, list):
+ urls = url
+ else:
+ raise ValidationError("URL must be a string or list of strings")
+
+ if not urls:
+ raise ValidationError("At least one URL is required")
+
+ for u in urls:
+ if not isinstance(u, str) or not u.strip():
+ raise ValidationError("All URLs must be non-empty strings")
+ validate_url(u)
+
+ if custom_output_fields is not None:
+ if not isinstance(custom_output_fields, list):
+ raise ValidationError("custom_output_fields must be a list")
+
+ invalid_fields = [field for field in custom_output_fields if field not in self.AVAILABLE_OUTPUT_FIELDS]
+ if invalid_fields:
+ raise ValidationError(f"Invalid output fields: {invalid_fields}. Available fields: {self.AVAILABLE_OUTPUT_FIELDS}")
+
+ crawl_inputs = []
+ for u in urls:
+ crawl_input = {"url": u}
+
+ if ignore_sitemap is not None:
+ crawl_input["ignore_sitemap"] = ignore_sitemap
+ if depth is not None:
+ crawl_input["depth"] = depth
+ if filter is not None:
+ crawl_input["filter"] = filter
+ if exclude_filter is not None:
+ crawl_input["exclude_filter"] = exclude_filter
+
+ crawl_inputs.append(crawl_input)
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ params = {
+ "dataset_id": self.CRAWL_DATASET_ID,
+ "include_errors": str(include_errors).lower(),
+ "type": "discover_new",
+ "discover_by": "domain_url"
+ }
+
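+        # When a custom output schema is requested, wrap the inputs together with it; otherwise send the bare list of inputs.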
+ if custom_output_fields:
+ payload = {
+ "input": crawl_inputs,
+ "custom_output_fields": custom_output_fields
+ }
+ else:
+ payload = crawl_inputs
+
+ logger.info(f"Starting crawl for {len(urls)} URL(s)")
+ logger.debug(f"Crawl parameters: depth={depth}, filter={filter}, exclude_filter={exclude_filter}")
+
+ try:
+ response = self.session.post(
+ api_url,
+ params=params,
+ json=payload,
+ timeout=self.default_timeout
+ )
+
+ if response.status_code == 200:
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ logger.info(f"Crawl initiated successfully. Snapshot ID: {snapshot_id}")
+ return result
+
+ elif response.status_code == 401:
+ logger.error("Unauthorized (401): Check API token")
+ raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+ elif response.status_code == 403:
+ logger.error("Forbidden (403): Insufficient permissions")
+ raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+ elif response.status_code == 400:
+ logger.error(f"Bad request (400): {response.text}")
+ raise APIError(f"Bad request (400): {response.text}")
+ else:
+ logger.error(f"Crawl request failed ({response.status_code}): {response.text}")
+ raise APIError(
+ f"Crawl request failed ({response.status_code}): {response.text}",
+ status_code=response.status_code,
+ response_text=response.text
+ )
+
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ logger.error(f"Unexpected error during crawl: {e}")
+ raise APIError(f"Unexpected error during crawl: {str(e)}")
\ No newline at end of file
diff --git a/src/api/download.py b/src/api/download.py
new file mode 100644
index 0000000..4bccdc0
--- /dev/null
+++ b/src/api/download.py
@@ -0,0 +1,265 @@
+import json
+import requests
+from datetime import datetime
+from typing import Union, Dict, Any, List
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.download')
+
+
+class DownloadAPI:
+ """Handles snapshot and content download operations using Bright Data's download API"""
+
+ def __init__(self, session, api_token, default_timeout=30):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+
+ def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str:
+ """
+ ## Download content to a file based on its format
+
+ ### Args:
+ content: The content to download (dict for JSON, string for other formats)
+ filename: Optional filename. If not provided, generates one with timestamp
+ format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt")
+ parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False)
+
+ ### Returns:
+ Path to the downloaded file
+ """
+
+ if not filename:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ filename = f"brightdata_results_{timestamp}.{format}"
+
+ if not filename.endswith(f".{format}"):
+ filename = f"{filename}.{format}"
+
+ if parse and isinstance(content, (list, dict)):
+ content = self._parse_body_json(content)
+
+ try:
+ if format == "json":
+ with open(filename, 'w', encoding='utf-8') as f:
+ if isinstance(content, dict) or isinstance(content, list):
+ json.dump(content, f, indent=2, ensure_ascii=False)
+ else:
+ f.write(str(content))
+ else:
+ with open(filename, 'w', encoding='utf-8') as f:
+ f.write(str(content))
+
+ logger.info(f"Content downloaded to: {filename}")
+ return filename
+
+ except IOError as e:
+ raise APIError(f"Failed to write file {filename}: {str(e)}")
+ except Exception as e:
+ raise APIError(f"Failed to download content: {str(e)}")
+
+ def download_snapshot(
+ self,
+ snapshot_id: str,
+ format: str = "json",
+ compress: bool = False,
+ batch_size: int = None,
+ part: int = None
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]:
+ """
+ ## Download snapshot content from Bright Data dataset API
+
+    Downloads the snapshot content using the snapshot ID returned from search_chatGPT()
+ or other dataset collection triggers.
+
+ ### Parameters:
+ - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required)
+ - `format` (str, optional): Format of the data - "json", "ndjson", "jsonl", or "csv" (default: "json")
+ - `compress` (bool, optional): Whether the result should be compressed (default: False)
+ - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000)
+ - `part` (int, optional): If batch_size provided, specify which part to download
+
+ ### Returns:
+ - `Union[Dict, List, str]`: Snapshot data in the requested format
+
+ ### Example Usage:
+ ```python
+ # Download complete snapshot
+ data = client.download_snapshot("s_m4x7enmven8djfqak")
+
+ # Download as CSV format
+ csv_data = client.download_snapshot("s_m4x7enmven8djfqak", format="csv")
+
+ # Download in batches
+ batch_data = client.download_snapshot(
+ "s_m4x7enmven8djfqak",
+ batch_size=1000,
+ part=1
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid parameters or snapshot_id format
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed, snapshot not found, or server error
+ """
+ if not snapshot_id or not isinstance(snapshot_id, str):
+ raise ValidationError("Snapshot ID is required and must be a non-empty string")
+
+ if format not in ["json", "ndjson", "jsonl", "csv"]:
+ raise ValidationError("Format must be one of: json, ndjson, jsonl, csv")
+
+ if not isinstance(compress, bool):
+ raise ValidationError("Compress must be a boolean")
+
+ if batch_size is not None:
+ if not isinstance(batch_size, int) or batch_size < 1000:
+ raise ValidationError("Batch size must be an integer >= 1000")
+
+ if part is not None:
+ if not isinstance(part, int) or part < 1:
+ raise ValidationError("Part must be a positive integer")
+ if batch_size is None:
+ raise ValidationError("Part parameter requires batch_size to be specified")
+
+ url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.api_token}",
+ "Accept": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "format": format
+ }
+
+ if compress:
+ params["compress"] = "true"
+
+ if batch_size is not None:
+ params["batch_size"] = batch_size
+
+ if part is not None:
+ params["part"] = part
+
+ try:
+ logger.info(f"Downloading snapshot {snapshot_id} in {format} format")
+
+ response = self.session.get(
+ url,
+ headers=headers,
+ params=params,
+ timeout=self.default_timeout
+ )
+
+ if response.status_code == 200:
+ pass
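+            # HTTP 202: the snapshot is still being prepared, so report not_ready instead of raising an error.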
+ elif response.status_code == 202:
+ try:
+ response_data = response.json()
+ message = response_data.get('message', 'Snapshot is not ready yet')
+ print("Snapshot is not ready yet, try again soon")
+ return {"status": "not_ready", "message": message, "snapshot_id": snapshot_id}
+ except json.JSONDecodeError:
+ print("Snapshot is not ready yet, try again soon")
+ return {"status": "not_ready", "message": "Snapshot is not ready yet, check again soon", "snapshot_id": snapshot_id}
+ elif response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code == 404:
+ raise APIError(f"Snapshot '{snapshot_id}' not found")
+ else:
+ raise APIError(f"Download request failed with status {response.status_code}: {response.text}")
+
+ if format == "csv":
+ data = response.text
+ save_data = data
+ else:
+ response_text = response.text
+ if '\n{' in response_text and response_text.strip().startswith('{'):
+ json_objects = []
+ for line in response_text.strip().split('\n'):
+ if line.strip():
+ try:
+ json_objects.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ data = json_objects
+ save_data = json_objects
+ else:
+ try:
+ data = response.json()
+ save_data = data
+ except json.JSONDecodeError:
+ data = response_text
+ save_data = response_text
+
+ try:
+ output_file = f"snapshot_{snapshot_id}.{format}"
+ if format == "csv" or isinstance(save_data, str):
+ with open(output_file, 'w', encoding='utf-8') as f:
+ f.write(str(save_data))
+ else:
+ with open(output_file, 'w', encoding='utf-8') as f:
+ json.dump(save_data, f, indent=2, ensure_ascii=False)
+ logger.info(f"Data saved to: {output_file}")
+ except Exception:
+ pass
+
+ logger.info(f"Successfully downloaded snapshot {snapshot_id}")
+ return data
+
+ except requests.exceptions.Timeout:
+ raise APIError("Timeout while downloading snapshot")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during snapshot download: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during snapshot download: {str(e)}")
+
+ def _parse_body_json(self, content: Union[Dict, List]) -> Union[Dict, List]:
+ """
+ Parse JSON strings in 'body' fields to objects
+
+ Args:
+ content: The content to process
+
+ Returns:
+ Content with parsed body fields
+ """
+ if content is None:
+ return content
+
+ if isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict) and 'body' in item:
+ body = item['body']
+ if isinstance(body, str):
+ try:
+ item['body'] = json.loads(body)
+ except (json.JSONDecodeError, TypeError):
+ pass
+ elif isinstance(item, (dict, list)):
+ self._parse_body_json(item)
+
+ elif isinstance(content, dict):
+ if 'body' in content:
+ body = content['body']
+ if isinstance(body, str):
+ try:
+ content['body'] = json.loads(body)
+ except (json.JSONDecodeError, TypeError):
+ pass
+
+ for key, value in content.items():
+ if isinstance(value, (dict, list)):
+ content[key] = self._parse_body_json(value)
+
+ return content
\ No newline at end of file
diff --git a/src/api/extract.py b/src/api/extract.py
new file mode 100644
index 0000000..1b04b84
--- /dev/null
+++ b/src/api/extract.py
@@ -0,0 +1,419 @@
+import os
+import re
+import json
+import openai
+from typing import Dict, Any, Tuple, Union, List
+from urllib.parse import urlparse
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError
+
+logger = get_logger('api.extract')
+
+
+class ExtractResult(str):
+ """
+ Custom result class that behaves like a string (extracted content)
+ but also provides access to metadata attributes
+ """
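+    # Usage sketch: the value returned by client.extract(...) behaves as the extracted text (a str),
+    # while attributes such as .token_usage, ["url"] and .metadata expose the extraction metadata.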
+ def __new__(cls, extracted_content, metadata):
+ obj = str.__new__(cls, extracted_content)
+ obj._metadata = metadata
+ return obj
+
+ def __getattr__(self, name):
+ if name in self._metadata:
+ return self._metadata[name]
+ raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+
+ def __getitem__(self, key):
+ return self._metadata[key]
+
+ def get(self, key, default=None):
+ return self._metadata.get(key, default)
+
+ def keys(self):
+ return self._metadata.keys()
+
+ def values(self):
+ return self._metadata.values()
+
+ def items(self):
+ return self._metadata.items()
+
+ @property
+ def metadata(self):
+ """Access full metadata dictionary"""
+ return self._metadata
+
+
+class ExtractAPI:
+ """Handles content extraction using web scraping + LLM processing"""
+
+ def __init__(self, client):
+ self.client = client
+
+    def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> ExtractResult:
+ """
+ ## Extract specific information from websites using AI
+
+ Combines web scraping with OpenAI's language models to extract targeted information
+ from web pages based on natural language queries.
+
+ ### Parameters:
+ - `query` (str): Natural language query describing what to extract. If `url` parameter is provided,
+ this becomes the pure extraction query. If `url` is not provided, this should include
+ the URL (e.g. "extract the most recent news from cnn.com")
+ - `url` (str | List[str], optional): Direct URL(s) to scrape. If provided, bypasses URL extraction
+ from query and sends these URLs to the web unlocker API
+ - `output_scheme` (dict, optional): JSON Schema defining the expected structure for the LLM response.
+ Uses OpenAI's Structured Outputs for reliable type-safe responses.
+ Example: {"type": "object", "properties": {"title": {"type": "string"}, "date": {"type": "string"}}, "required": ["title", "date"]}
+ - `llm_key` (str, optional): OpenAI API key. If not provided, uses OPENAI_API_KEY env variable
+
+ ### Returns:
+ - `ExtractResult`: String containing extracted content with metadata attributes access
+
+ ### Example Usage:
+ ```python
+ # Using URL parameter with structured output
+ result = client.extract(
+ query="extract the most recent news headlines",
+ url="https://cnn.com",
+ output_scheme={
+ "type": "object",
+ "properties": {
+ "headlines": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "title": {"type": "string"},
+ "date": {"type": "string"}
+ },
+ "required": ["title", "date"]
+ }
+ }
+ },
+ "required": ["headlines"]
+ }
+ )
+
+ # Using URL in query (original behavior)
+ result = client.extract(
+ query="extract the most recent news from cnn.com",
+ llm_key="your-openai-api-key"
+ )
+
+ # Multiple URLs with structured schema
+ result = client.extract(
+ query="extract main headlines",
+ url=["https://cnn.com", "https://bbc.com"],
+ output_scheme={
+ "type": "object",
+ "properties": {
+ "sources": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "source_name": {"type": "string"},
+ "headlines": {"type": "array", "items": {"type": "string"}}
+ },
+ "required": ["source_name", "headlines"]
+ }
+ }
+ },
+ "required": ["sources"]
+ }
+ )
+ ```
+
+ ### Raises:
+ - `ValidationError`: Invalid query format or missing LLM key
+ - `APIError`: Scraping failed or LLM processing error
+ """
+ if not query or not isinstance(query, str):
+ raise ValidationError("Query must be a non-empty string")
+
+ query = query.strip()
+ if len(query) > 10000:
+ raise ValidationError("Query is too long (maximum 10,000 characters)")
+ if len(query) < 5:
+ raise ValidationError("Query is too short (minimum 5 characters)")
+
+ if not llm_key:
+ llm_key = os.getenv('OPENAI_API_KEY')
+
+ if not llm_key or not isinstance(llm_key, str):
+ raise ValidationError("OpenAI API key is required. Provide it as parameter or set OPENAI_API_KEY environment variable")
+
+ if output_scheme is not None:
+ if not isinstance(output_scheme, dict):
+ raise ValidationError("output_scheme must be a dict containing a valid JSON Schema")
+ if "type" not in output_scheme:
+ raise ValidationError("output_scheme must have a 'type' property")
+
+ self._validate_structured_outputs_schema(output_scheme)
+
+ logger.info(f"Processing extract query: {query[:50]}...")
+
+ try:
+ if url is not None:
+ parsed_query = query.strip()
+ target_urls = url if isinstance(url, list) else [url]
+ logger.info(f"Using provided URL(s): {target_urls}")
+ else:
+ parsed_query, extracted_url = self._parse_query_and_url(query)
+ target_urls = [extracted_url]
+ logger.info(f"Parsed - Query: '{parsed_query}', URL: '{extracted_url}'")
+
+ if len(target_urls) == 1:
+ scraped_content = self.client.scrape(target_urls[0], response_format="raw")
+ source_url = target_urls[0]
+ else:
+ scraped_content = self.client.scrape(target_urls, response_format="raw")
+ source_url = ', '.join(target_urls)
+
+ logger.info(f"Scraped content from {len(target_urls)} URL(s)")
+
+ if isinstance(scraped_content, list):
+ all_text = []
+ all_titles = []
+ for i, content in enumerate(scraped_content):
+ parsed = self.client.parse_content(
+ content,
+ extract_text=True,
+ extract_links=False,
+ extract_images=False
+ )
+ all_text.append(f"--- Content from {target_urls[i]} ---\n{parsed.get('text', '')}")
+ all_titles.append(parsed.get('title', 'Unknown'))
+
+ combined_text = "\n\n".join(all_text)
+ combined_title = " | ".join(all_titles)
+ parsed_content = {'text': combined_text, 'title': combined_title}
+ else:
+ parsed_content = self.client.parse_content(
+ scraped_content,
+ extract_text=True,
+ extract_links=False,
+ extract_images=False
+ )
+
+ logger.info(f"Parsed content - text length: {len(parsed_content.get('text', ''))}")
+
+ extracted_info, token_usage = self._process_with_llm(
+ parsed_query,
+ parsed_content.get('text', ''),
+ llm_key,
+ source_url,
+ output_scheme
+ )
+
+ metadata = {
+ 'query': parsed_query,
+ 'url': source_url,
+ 'extracted_content': extracted_info,
+ 'source_title': parsed_content.get('title', 'Unknown'),
+ 'content_length': len(parsed_content.get('text', '')),
+ 'token_usage': token_usage,
+ 'success': True
+ }
+
+ return ExtractResult(extracted_info, metadata)
+
+ except Exception as e:
+ if isinstance(e, (ValidationError, APIError)):
+ raise
+ logger.error(f"Unexpected error during extraction: {e}")
+ raise APIError(f"Extraction failed: {str(e)}")
+
+ def _parse_query_and_url(self, query: str) -> Tuple[str, str]:
+ """
+ Parse natural language query to extract the task and URL
+
+ Args:
+ query: Natural language query like "extract news from cnn.com"
+
+ Returns:
+ Tuple of (parsed_query, full_url)
+ """
+ query = query.strip()
+
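+        # Look for a domain after "from"/"on"/"at" first, then fall back to any bare domain-like token in the query.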
+ url_patterns = [
+ r'from\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)',
+ r'on\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)',
+ r'at\s+((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)',
+ r'((?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*)'
+ ]
+
+ url = None
+ for pattern in url_patterns:
+ match = re.search(pattern, query, re.IGNORECASE)
+ if match:
+ url = match.group(1)
+ break
+
+ if not url:
+ raise ValidationError("Could not extract URL from query. Please include a website URL.")
+
+ full_url = self._build_full_url(url)
+
+ extract_query = re.sub(r'\b(?:from|on|at)\s+(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', query, flags=re.IGNORECASE)
+ extract_query = re.sub(r'\b(?:https?://)?(?:www\.)?[\w\.-]+(?:\.[\w]{2,})+(?:/[\w\.-]*)*', '', extract_query, flags=re.IGNORECASE)
+ extract_query = re.sub(r'\s+', ' ', extract_query).strip()
+
+ if not extract_query:
+ extract_query = "extract the main content"
+
+ return extract_query, full_url
+
+ def _build_full_url(self, url: str) -> str:
+ """
+ Build a complete URL from potentially partial URL
+
+ Args:
+ url: Potentially partial URL like "cnn.com" or "https://example.com"
+
+ Returns:
+ Complete URL with https:// and www if needed
+ """
+ url = url.strip()
+
+ if not url.startswith(('http://', 'https://')):
+ if not url.startswith('www.'):
+ url = f'www.{url}'
+ url = f'https://{url}'
+
+ parsed = urlparse(url)
+ if not parsed.netloc:
+ raise ValidationError(f"Invalid URL format: {url}")
+
+ return url
+
+ def _validate_structured_outputs_schema(self, schema: Dict[str, Any], path: str = "") -> None:
+ """
+ Validate JSON Schema for OpenAI Structured Outputs compatibility
+
+ Args:
+ schema: JSON Schema to validate
+ path: Current path in schema (for error reporting)
+ """
+ if not isinstance(schema, dict):
+ return
+
+ schema_type = schema.get("type")
+
+ if schema_type == "object":
+ if "properties" not in schema:
+ raise ValidationError(f"Object schema at '{path}' must have 'properties' defined")
+ if "required" not in schema:
+ raise ValidationError(f"Object schema at '{path}' must have 'required' array (OpenAI Structured Outputs requirement)")
+ if "additionalProperties" not in schema or schema["additionalProperties"] is not False:
+ raise ValidationError(f"Object schema at '{path}' must have 'additionalProperties': false (OpenAI Structured Outputs requirement)")
+
+ properties = set(schema["properties"].keys())
+ required = set(schema["required"])
+ if properties != required:
+ missing = properties - required
+ extra = required - properties
+ error_msg = f"OpenAI Structured Outputs requires ALL properties to be in 'required' array at '{path}'."
+ if missing:
+ error_msg += f" Missing from required: {list(missing)}"
+ if extra:
+ error_msg += f" Extra in required: {list(extra)}"
+ raise ValidationError(error_msg)
+
+ for prop_name, prop_schema in schema["properties"].items():
+ self._validate_structured_outputs_schema(prop_schema, f"{path}.{prop_name}")
+
+ elif schema_type == "array":
+ if "items" in schema:
+ self._validate_structured_outputs_schema(schema["items"], f"{path}[]")
+
+ def _process_with_llm(self, query: str, content: str, llm_key: str, source_url: str, output_scheme: Dict[str, Any] = None) -> Tuple[str, Dict[str, int]]:
+ """
+ Process scraped content with OpenAI to extract requested information
+
+ Args:
+ query: What to extract from the content
+ content: Scraped and parsed text content
+ llm_key: OpenAI API key
+ source_url: Source URL for context
+ output_scheme: JSON Schema dict for structured outputs (optional)
+
+ Returns:
+ Tuple of (extracted information, token usage dict)
+ """
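+        # Trim very long pages (keep the head and, for the longest ones, the tail) so the prompt stays within the model's token budget.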
+ if len(content) > 15000:
+ beginning = content[:8000]
+ end = content[-4000:]
+ content = f"{beginning}\n\n... [middle content truncated for token efficiency] ...\n\n{end}"
+ elif len(content) > 12000:
+ content = content[:12000] + "\n\n... [content truncated to optimize tokens]"
+
+ client = openai.OpenAI(api_key=llm_key)
+
+ system_prompt = f"""You are a precise web content extraction specialist. Your task: {query}
+
+SOURCE: {source_url}
+
+INSTRUCTIONS:
+1. Extract ONLY the specific information requested
+2. Include relevant details (dates, numbers, names) when available
+3. If requested info isn't found, briefly state what content IS available
+4. Keep response concise but complete
+5. Be accurate and factual"""
+
+ user_prompt = f"CONTENT TO ANALYZE:\n\n{content}\n\nEXTRACT: {query}"
+
+ try:
+ call_params = {
+ "model": "gpt-4o-2024-08-06",
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt}
+ ],
+ "max_tokens": 1000,
+ "temperature": 0.1
+ }
+
+ if output_scheme:
+ call_params["response_format"] = {
+ "type": "json_schema",
+ "json_schema": {
+ "name": "extracted_content",
+ "strict": True,
+ "schema": output_scheme
+ }
+ }
+ logger.info("Using OpenAI Structured Outputs with provided schema")
+ else:
+ logger.info("Using regular OpenAI completion (no structured schema provided)")
+
+ response = client.chat.completions.create(**call_params)
+
+ if not response.choices or not response.choices[0].message.content:
+ raise APIError("OpenAI returned empty response")
+
+ extracted_content = response.choices[0].message.content.strip()
+
+ if output_scheme:
+ logger.info("Received structured JSON response from OpenAI")
+ else:
+ logger.info("Received text response from OpenAI")
+
+ token_usage = {
+ 'prompt_tokens': response.usage.prompt_tokens,
+ 'completion_tokens': response.usage.completion_tokens,
+ 'total_tokens': response.usage.total_tokens
+ }
+
+ logger.info(f"OpenAI token usage: {token_usage['total_tokens']} total ({token_usage['prompt_tokens']} prompt + {token_usage['completion_tokens']} completion)")
+
+ return extracted_content, token_usage
+
+ except Exception as e:
+ logger.error(f"OpenAI API error: {e}")
+ raise APIError(f"Failed to process content with LLM: {str(e)}")
\ No newline at end of file
diff --git a/src/api/linkedin.py b/src/api/linkedin.py
new file mode 100644
index 0000000..19ede6b
--- /dev/null
+++ b/src/api/linkedin.py
@@ -0,0 +1,803 @@
+import json
+import re
+import requests
+from typing import Union, Dict, Any, List
+
+from ..utils import get_logger
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.linkedin')
+
+
+class LinkedInAPI:
+ """Handles LinkedIn data collection using Bright Data's collect API"""
+
+ DATASET_IDS = {
+ 'profile': 'gd_l1viktl72bvl7bjuj0',
+ 'company': 'gd_l1vikfnt1wgvvqz95w',
+ 'job': 'gd_lpfll7v5hcqtkxl6l',
+ 'post': 'gd_lyy3tktm25m4avu764'
+ }
+
+ URL_PATTERNS = {
+ 'profile': re.compile(r'linkedin\.com/in/[^/?]+/?(\?.*)?$'),
+ 'company': re.compile(r'linkedin\.com/(company|organization-guest/company)/[^/?]+/?(\?.*)?$'),
+ 'job': re.compile(r'linkedin\.com/jobs/view/[^/?]+/?(\?.*)?$'),
+ 'post': re.compile(r'linkedin\.com/(posts|pulse)/[^/?]+/?(\?.*)?$')
+ }
+
+ def __init__(self, session, api_token, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.api_token = api_token
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def _identify_dataset_type(self, url: str) -> str:
+ """
+ Identify LinkedIn dataset type based on URL pattern
+
+ Args:
+ url: LinkedIn URL to analyze
+
+ Returns:
+ Dataset type ('profile', 'company', 'job', 'post')
+
+ Raises:
+ ValidationError: If URL doesn't match any known LinkedIn pattern
+ """
+ if not url or not isinstance(url, str):
+ raise ValidationError("URL must be a non-empty string")
+
+ url = url.strip().lower()
+ for dataset_type, pattern in self.URL_PATTERNS.items():
+ if pattern.search(url):
+ logger.debug(f"URL '{url}' identified as LinkedIn {dataset_type}")
+ return dataset_type
+
+ raise ValidationError(f"URL '{url}' does not match any supported LinkedIn data type")
+
+ def _scrape_linkedin_dataset(
+ self,
+ urls: Union[str, List[str]],
+ dataset_id: str,
+ dataset_type: str,
+ sync: bool = True,
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ Internal method to scrape LinkedIn data using Bright Data's collect API
+
+ Args:
+ urls: Single LinkedIn URL or list of LinkedIn URLs
+ dataset_id: Bright Data dataset ID for the specific LinkedIn data type
+ dataset_type: Type of LinkedIn data (for logging purposes)
+ sync: If True (default), uses synchronous API for immediate results
+ timeout: Request timeout in seconds
+
+ Returns:
+ Dict containing response with snapshot_id or direct data (if sync=True)
+
+ Raises:
+ ValidationError: Invalid URL format
+ AuthenticationError: Invalid API token or insufficient permissions
+ APIError: Request failed or server error
+ """
+ if isinstance(urls, str):
+ url_list = [urls]
+ else:
+ url_list = urls
+
+ if not url_list or len(url_list) == 0:
+ raise ValidationError("At least one URL is required")
+ for url in url_list:
+ if not url or not isinstance(url, str):
+ raise ValidationError("All URLs must be non-empty strings")
+
+ logger.info(f"Processing {len(url_list)} LinkedIn {dataset_type} URL(s) {'synchronously' if sync else 'asynchronously'}")
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+
+ if sync:
+ api_url = "https://api.brightdata.com/datasets/v3/scrape"
+ data = {
+ "input": [{"url": url} for url in url_list]
+ }
+ params = {
+ "dataset_id": dataset_id,
+ "notify": "false",
+ "include_errors": "true"
+ }
+ else:
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+ data = [{"url": url} for url in url_list]
+ params = {
+ "dataset_id": dataset_id,
+ "include_errors": "true"
+ }
+
+ try:
+ if sync:
+ response = self.session.post(
+ api_url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or 65
+ )
+ else:
+ response = self.session.post(
+ api_url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or self.default_timeout
+ )
+
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code not in [200, 202]:
+ raise APIError(f"LinkedIn data collection request failed with status {response.status_code}: {response.text}")
+
+ if sync:
+ response_text = response.text
+ if '\n{' in response_text and response_text.strip().startswith('{'):
+ json_objects = []
+ for line in response_text.strip().split('\n'):
+ if line.strip():
+ try:
+ json_objects.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ result = json_objects
+ else:
+ try:
+ result = response.json()
+ except json.JSONDecodeError:
+ result = response_text
+
+ logger.info(f"LinkedIn {dataset_type} data retrieved synchronously for {len(url_list)} URL(s)")
+ print(f"Retrieved {len(result) if isinstance(result, list) else 1} LinkedIn {dataset_type} record(s)")
+ else:
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ if snapshot_id:
+ logger.info(f"LinkedIn {dataset_type} data collection job initiated successfully for {len(url_list)} URL(s)")
+ print("")
+ print("Snapshot ID:")
+ print(snapshot_id)
+ print("")
+
+ return result
+
+ except requests.exceptions.Timeout:
+ raise APIError("Timeout while initiating LinkedIn data collection")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during LinkedIn data collection: {str(e)}")
+ except json.JSONDecodeError as e:
+ raise APIError(f"Failed to parse LinkedIn data collection response: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during LinkedIn data collection: {str(e)}")
+
+
+class LinkedInScraper:
+ """LinkedIn data scraping interface with specialized methods for different data types"""
+
+ def __init__(self, linkedin_api):
+ self.linkedin_api = linkedin_api
+
+ def profiles(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Profile Data
+
+ Scrapes structured data from LinkedIn profiles using the profiles dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn profile URL or list of profile URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped profile data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/in/username/`
+ - `https://linkedin.com/in/first-last-123456/`
+
+ ### Example Usage:
+ ```python
+ # Single profile (synchronous - returns data immediately)
+ result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/elad-moshe-05a90413/")
+
+ # Multiple profiles (synchronous - returns data immediately)
+ profiles = [
+ "https://www.linkedin.com/in/user1/",
+ "https://www.linkedin.com/in/user2/"
+ ]
+ result = client.scrape_linkedin.profiles(profiles)
+
+ # Asynchronous processing (returns snapshot_id)
+ result = client.scrape_linkedin.profiles(profiles, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['profile'],
+ 'profile',
+ sync,
+ timeout
+ )
+
+ def companies(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Company Data
+
+ Scrapes structured data from LinkedIn company pages using the companies dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn company URL or list of company URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped company data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/company/company-name/`
+ - `https://linkedin.com/company/bright-data/`
+
+ ### Example Usage:
+ ```python
+ # Single company (synchronous)
+ result = client.scrape_linkedin.companies("https://www.linkedin.com/company/bright-data/")
+
+ # Multiple companies (synchronous)
+ companies = [
+ "https://www.linkedin.com/company/ibm/",
+ "https://www.linkedin.com/company/microsoft/"
+ ]
+ result = client.scrape_linkedin.companies(companies)
+
+ # Asynchronous processing
+ result = client.scrape_linkedin.companies(companies, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['company'],
+ 'company',
+ sync,
+ timeout
+ )
+
+ def jobs(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Job Data
+
+ Scrapes structured data from LinkedIn job listings using the jobs dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn job URL or list of job URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped job data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/jobs/view/1234567890/`
+ - `https://linkedin.com/jobs/view/job-id/`
+
+ ### Example Usage:
+ ```python
+ # Single job listing (synchronous)
+ result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/1234567890/")
+
+ # Multiple job listings (synchronous)
+ jobs = [
+ "https://www.linkedin.com/jobs/view/1111111/",
+ "https://www.linkedin.com/jobs/view/2222222/"
+ ]
+ result = client.scrape_linkedin.jobs(jobs)
+
+ # Asynchronous processing
+ result = client.scrape_linkedin.jobs(jobs, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['job'],
+ 'job',
+ sync,
+ timeout
+ )
+
+ def posts(self, url: Union[str, List[str]], sync: bool = True, timeout: int = None) -> Dict[str, Any]:
+ """
+ ## Scrape LinkedIn Post Data
+
+ Scrapes structured data from LinkedIn posts and articles using the posts dataset.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single LinkedIn post URL or list of post URLs
+ - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+ - `timeout` (int, optional): Request timeout in seconds (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns scraped post data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Example URLs:
+ - `https://www.linkedin.com/posts/username-activity-123456/`
+ - `https://www.linkedin.com/pulse/article-title-author/`
+
+ ### Example Usage:
+ ```python
+ # Single post (synchronous)
+ result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/")
+
+ # Multiple posts (synchronous)
+ posts = [
+ "https://www.linkedin.com/posts/user1-activity-111/",
+ "https://www.linkedin.com/pulse/article-author/"
+ ]
+ result = client.scrape_linkedin.posts(posts)
+
+ # Asynchronous processing
+ result = client.scrape_linkedin.posts(posts, sync=False)
+ ```
+ """
+ return self.linkedin_api._scrape_linkedin_dataset(
+ url,
+ self.linkedin_api.DATASET_IDS['post'],
+ 'post',
+ sync,
+ timeout
+ )
+
+
+class LinkedInSearcher:
+ """LinkedIn search interface for discovering new LinkedIn data by various criteria"""
+
+ def __init__(self, linkedin_api):
+ self.linkedin_api = linkedin_api
+
+ def profiles(
+ self,
+ first_name: Union[str, List[str]],
+ last_name: Union[str, List[str]],
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ ## Search LinkedIn Profiles by Name
+
+ Discovers LinkedIn profiles by searching for first and last names.
+
+ ### Parameters:
+ - `first_name` (str | List[str]): Single first name or list of first names to search for
+ - `last_name` (str | List[str]): Single last name or list of last names to search for
+ - `timeout` (int, optional): Request timeout in seconds (default: 30)
+
+ ### Returns:
+ - `Dict[str, Any]`: Response containing snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Single name search (returns snapshot_id)
+ result = client.search_linkedin.profiles("James", "Smith")
+
+ # Multiple names search (returns snapshot_id)
+ first_names = ["James", "Idan"]
+ last_names = ["Smith", "Vilenski"]
+ result = client.search_linkedin.profiles(first_names, last_names)
+ ```
+ """
+ if isinstance(first_name, str):
+ first_names = [first_name]
+ else:
+ first_names = first_name
+
+ if isinstance(last_name, str):
+ last_names = [last_name]
+ else:
+ last_names = last_name
+
+ if len(first_names) != len(last_names):
+ raise ValidationError("first_name and last_name must have the same length")
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['profile'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "name"
+ }
+
+ data = [
+ {
+ "first_name": first_names[i],
+ "last_name": last_names[i]
+ }
+ for i in range(len(first_names))
+ ]
+
+ return self._make_request(api_url, headers, params, data, 'profile search', len(data), timeout)
+
+ def jobs(
+ self,
+ url: Union[str, List[str]] = None,
+ location: Union[str, List[str]] = None,
+ keyword: Union[str, List[str]] = "",
+ country: Union[str, List[str]] = "",
+ time_range: Union[str, List[str]] = "",
+ job_type: Union[str, List[str]] = "",
+ experience_level: Union[str, List[str]] = "",
+ remote: Union[str, List[str]] = "",
+ company: Union[str, List[str]] = "",
+ location_radius: Union[str, List[str]] = "",
+ selective_search: Union[bool, List[bool]] = False,
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ ## Search LinkedIn Jobs by URL or Keywords
+
+ Discovers LinkedIn jobs either by searching specific job search URLs or by keyword criteria.
+
+ ### Parameters:
+ - `url` (str | List[str], optional): LinkedIn job search URLs to scrape
+ - `location` (str | List[str], optional): Job location(s) - required when searching by keyword
+ - `keyword` (str | List[str], optional): Job keyword(s) to search for (default: "")
+ - `country` (str | List[str], optional): Country code(s) (default: "")
+ - `time_range` (str | List[str], optional): Time range filter (default: "")
+ - `job_type` (str | List[str], optional): Job type filter (default: "")
+ - `experience_level` (str | List[str], optional): Experience level filter (default: "")
+ - `remote` (str | List[str], optional): Remote work filter (default: "")
+ - `company` (str | List[str], optional): Company name filter (default: "")
+ - `location_radius` (str | List[str], optional): Location radius filter (default: "")
+ - `selective_search` (bool | List[bool], optional): Enable selective search (default: False)
+ - `timeout` (int, optional): Request timeout in seconds (default: 30)
+
+ ### Returns:
+ - `Dict[str, Any]`: Response containing snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Search by job URLs (returns snapshot_id)
+ job_urls = [
+ "https://www.linkedin.com/jobs/search?keywords=Software&location=Tel%20Aviv-Yafo",
+ "https://www.linkedin.com/jobs/reddit-inc.-jobs-worldwide?f_C=150573"
+ ]
+ result = client.search_linkedin.jobs(url=job_urls)
+
+ # Search by keyword (returns snapshot_id)
+ result = client.search_linkedin.jobs(
+ location="Paris",
+ keyword="product manager",
+ country="FR",
+ time_range="Past month",
+ job_type="Full-time"
+ )
+ ```
+ """
+ if url is not None:
+ return self._search_jobs_by_url(url, timeout)
+ elif location is not None:
+ return self._search_jobs_by_keyword(
+ location, keyword, country, time_range, job_type,
+ experience_level, remote, company, location_radius,
+ selective_search, timeout
+ )
+ else:
+ raise ValidationError("Either 'url' or 'location' parameter must be provided")
+
+ def posts(
+ self,
+ profile_url: Union[str, List[str]] = None,
+ company_url: Union[str, List[str]] = None,
+ url: Union[str, List[str]] = None,
+ start_date: Union[str, List[str]] = "",
+ end_date: Union[str, List[str]] = "",
+ timeout: int = None
+ ) -> Dict[str, Any]:
+ """
+ ## Search LinkedIn Posts by Profile, Company, or General URL
+
+ Discovers LinkedIn posts using various search methods.
+
+ ### Parameters:
+ - `profile_url` (str | List[str], optional): LinkedIn profile URL(s) to get posts from
+ - `company_url` (str | List[str], optional): LinkedIn company URL(s) to get posts from
+ - `url` (str | List[str], optional): General LinkedIn URL(s) for posts
+ - `start_date` (str | List[str], optional): Start date filter (ISO format, default: "")
+ - `end_date` (str | List[str], optional): End date filter (ISO format, default: "")
+ - `timeout` (int, optional): Request timeout in seconds (default: 30)
+
+ ### Returns:
+ - `Dict[str, Any]`: Response containing snapshot_id for async processing
+
+ ### Example Usage:
+ ```python
+ # Search posts by profile URL with date range (returns snapshot_id)
+ result = client.search_linkedin.posts(
+ profile_url="https://www.linkedin.com/in/bettywliu",
+ start_date="2018-04-25T00:00:00.000Z",
+ end_date="2021-05-25T00:00:00.000Z"
+ )
+
+ # Search posts by company URL (returns snapshot_id)
+ result = client.search_linkedin.posts(
+ company_url="https://www.linkedin.com/company/bright-data"
+ )
+
+ # Search posts by general URL (returns snapshot_id)
+ result = client.search_linkedin.posts(
+ url="https://www.linkedin.com/posts/activity-123456"
+ )
+ ```
+ """
+ if profile_url is not None:
+ return self._search_posts_by_profile(profile_url, start_date, end_date, timeout)
+ elif company_url is not None:
+ return self._search_posts_by_company(company_url, timeout)
+ elif url is not None:
+ return self._search_posts_by_url(url, timeout)
+ else:
+ raise ValidationError("One of 'profile_url', 'company_url', or 'url' parameter must be provided")
+
+ def _search_jobs_by_url(self, urls, timeout):
+ """Search jobs by LinkedIn job search URLs"""
+ if isinstance(urls, str):
+ url_list = [urls]
+ else:
+ url_list = urls
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['job'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "url"
+ }
+
+ data = [{"url": url} for url in url_list]
+ return self._make_request(api_url, headers, params, data, 'job search by URL', len(data), timeout)
+
+ def _search_jobs_by_keyword(self, location, keyword, country, time_range, job_type, experience_level, remote, company, location_radius, selective_search, timeout):
+ """Search jobs by keyword criteria"""
+ params_dict = {
+ 'location': location, 'keyword': keyword, 'country': country,
+ 'time_range': time_range, 'job_type': job_type, 'experience_level': experience_level,
+ 'remote': remote, 'company': company, 'location_radius': location_radius,
+ 'selective_search': selective_search
+ }
+
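+        # Broadcast parameters to a common length: find the longest list among the
+        # inputs, then repeat scalars and single-item lists so every generated
+        # search record has a value for each field.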
+ max_length = 1
+ for key, value in params_dict.items():
+ if isinstance(value, list):
+ max_length = max(max_length, len(value))
+ normalized_params = {}
+ for key, value in params_dict.items():
+ if isinstance(value, list):
+ if len(value) != max_length and len(value) != 1:
+ raise ValidationError(f"Parameter '{key}' list length must be 1 or {max_length}")
+ normalized_params[key] = value * max_length if len(value) == 1 else value
+ else:
+ normalized_params[key] = [value] * max_length
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['job'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "keyword"
+ }
+
+ data = []
+ for i in range(max_length):
+ data.append({
+ "location": normalized_params['location'][i],
+ "keyword": normalized_params['keyword'][i],
+ "country": normalized_params['country'][i],
+ "time_range": normalized_params['time_range'][i],
+ "job_type": normalized_params['job_type'][i],
+ "experience_level": normalized_params['experience_level'][i],
+ "remote": normalized_params['remote'][i],
+ "company": normalized_params['company'][i],
+ "location_radius": normalized_params['location_radius'][i],
+ "selective_search": normalized_params['selective_search'][i]
+ })
+
+ return self._make_request(api_url, headers, params, data, 'job search by keyword', len(data), timeout)
+
+ def _search_posts_by_profile(self, profile_urls, start_dates, end_dates, timeout):
+ """Search posts by profile URL with optional date filtering"""
+ if isinstance(profile_urls, str):
+ url_list = [profile_urls]
+ else:
+ url_list = profile_urls
+
+ if isinstance(start_dates, str):
+ start_list = [start_dates] * len(url_list)
+ else:
+ start_list = start_dates if len(start_dates) == len(url_list) else [start_dates[0]] * len(url_list)
+
+ if isinstance(end_dates, str):
+ end_list = [end_dates] * len(url_list)
+ else:
+ end_list = end_dates if len(end_dates) == len(url_list) else [end_dates[0]] * len(url_list)
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['post'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "profile_url"
+ }
+
+ data = []
+ for i in range(len(url_list)):
+ item = {"url": url_list[i]}
+ if start_list[i]:
+ item["start_date"] = start_list[i]
+ if end_list[i]:
+ item["end_date"] = end_list[i]
+ data.append(item)
+
+ return self._make_request(api_url, headers, params, data, 'post search by profile', len(data), timeout)
+
+ def _search_posts_by_company(self, company_urls, timeout):
+ """Search posts by company URL"""
+ if isinstance(company_urls, str):
+ url_list = [company_urls]
+ else:
+ url_list = company_urls
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['post'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "company_url"
+ }
+
+ data = [{"url": url} for url in url_list]
+ return self._make_request(api_url, headers, params, data, 'post search by company', len(data), timeout)
+
+ def _search_posts_by_url(self, urls, timeout):
+ """Search posts by general URL"""
+ if isinstance(urls, str):
+ url_list = [urls]
+ else:
+ url_list = urls
+
+ api_url = "https://api.brightdata.com/datasets/v3/trigger"
+
+ try:
+ from .. import __version__
+ user_agent = f"brightdata-sdk/{__version__}"
+ except ImportError:
+ user_agent = "brightdata-sdk/unknown"
+
+ headers = {
+ "Authorization": f"Bearer {self.linkedin_api.api_token}",
+ "Content-Type": "application/json",
+ "User-Agent": user_agent
+ }
+ params = {
+ "dataset_id": self.linkedin_api.DATASET_IDS['post'],
+ "include_errors": "true",
+ "type": "discover_new",
+ "discover_by": "url"
+ }
+
+ data = [{"url": url} for url in url_list]
+ return self._make_request(api_url, headers, params, data, 'post search by URL', len(data), timeout)
+
+ def _make_request(self, api_url, headers, params, data, operation_type, count, timeout):
+ """Common method to make API requests (async only for search operations)"""
+ try:
+ response = self.linkedin_api.session.post(
+ api_url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=timeout or self.linkedin_api.default_timeout
+ )
+
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or insufficient permissions")
+ elif response.status_code != 200:
+ raise APIError(f"LinkedIn {operation_type} request failed with status {response.status_code}: {response.text}")
+
+ result = response.json()
+ snapshot_id = result.get('snapshot_id')
+ if snapshot_id:
+ logger.info(f"LinkedIn {operation_type} job initiated successfully for {count} item(s)")
+ print("")
+ print("Snapshot ID:")
+ print(snapshot_id)
+ print("")
+
+ return result
+
+ except requests.exceptions.Timeout:
+ raise APIError(f"Timeout while initiating LinkedIn {operation_type}")
+ except requests.exceptions.RequestException as e:
+ raise APIError(f"Network error during LinkedIn {operation_type}: {str(e)}")
+ except json.JSONDecodeError as e:
+ raise APIError(f"Failed to parse LinkedIn {operation_type} response: {str(e)}")
+ except Exception as e:
+ if isinstance(e, (ValidationError, AuthenticationError, APIError)):
+ raise
+ raise APIError(f"Unexpected error during LinkedIn {operation_type}: {str(e)}")
\ No newline at end of file
diff --git a/src/api/scraper.py b/src/api/scraper.py
new file mode 100644
index 0000000..0d4fc31
--- /dev/null
+++ b/src/api/scraper.py
@@ -0,0 +1,205 @@
+import time
+from typing import Union, Dict, Any, List
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from ..utils import (
+ validate_url, validate_zone_name, validate_country_code,
+ validate_timeout, validate_max_workers, validate_url_list,
+ validate_response_format, validate_http_method, retry_request,
+ get_logger, log_request, safe_json_parse, validate_response_size
+)
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.scraper')
+
+
+class WebScraper:
+ """Handles web scraping operations using Bright Data Web Unlocker API"""
+
+ def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def scrape(
+ self,
+ url: Union[str, List[str]],
+ zone: str,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "markdown",
+ async_request: bool = False,
+ max_workers: int = 10,
+ timeout: int = None
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ **Unlock and scrape websites using Bright Data Web Unlocker API**
+
+ Scrapes one or multiple URLs through Bright Data's proxy network with anti-bot detection bypass.
+
+ **Parameters:**
+ - `url` (str | List[str]): Single URL string or list of URLs to scrape
+ - `zone` (str): Your Bright Data zone identifier
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+        - `country` (str, optional): Two-letter ISO country code for proxy location (default: `""`, which uses the fastest available location)
+        - `data_format` (str, optional): Additional format transformation (default: `"markdown"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `30`)
+
+ **Returns:**
+ - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL
+
+ **Example Usage:**
+ ```python
+ # Single URL scraping
+ result = client.scrape(
+ url="https://example.com",
+ zone="your_zone_name",
+ response_format="json"
+ )
+
+ # Multiple URLs scraping
+ urls = ["https://site1.com", "https://site2.com"]
+ results = client.scrape(
+ url=urls,
+ zone="your_zone_name",
+ response_format="raw",
+ max_workers=5
+ )
+ ```
+
+ **Raises:**
+ - `ValidationError`: Invalid URL format or empty URL list
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+
+ timeout = timeout or self.default_timeout
+ validate_zone_name(zone)
+ validate_response_format(response_format)
+ validate_http_method(method)
+ validate_country_code(country)
+ validate_timeout(timeout)
+ validate_max_workers(max_workers)
+
+ if isinstance(url, list):
+ validate_url_list(url)
+ effective_max_workers = min(len(url), max_workers or 10)
+
+ results = [None] * len(url)
+
+ with ThreadPoolExecutor(max_workers=effective_max_workers) as executor:
+ future_to_index = {
+ executor.submit(
+ self._perform_single_scrape,
+ single_url, zone, response_format, method, country,
+ data_format, async_request, timeout
+ ): i
+ for i, single_url in enumerate(url)
+ }
+ for future in as_completed(future_to_index):
+ index = future_to_index[future]
+ try:
+ result = future.result()
+ results[index] = result
+ except Exception as e:
+ raise APIError(f"Failed to scrape {url[index]}: {str(e)}")
+
+ return results
+ else:
+ validate_url(url)
+ return self._perform_single_scrape(
+ url, zone, response_format, method, country,
+ data_format, async_request, timeout
+ )
+
+ def _perform_single_scrape(
+ self,
+ url: str,
+ zone: str,
+ response_format: str,
+ method: str,
+ country: str,
+ data_format: str,
+ async_request: bool,
+ timeout: int
+ ) -> Union[Dict[str, Any], str]:
+ """
+ Perform a single scrape operation with comprehensive logging
+ """
+ endpoint = "https://api.brightdata.com/request"
+ start_time = time.time()
+
+ logger.info(f"Starting scrape request for URL: {url[:100]}{'...' if len(url) > 100 else ''}")
+
+ payload = {
+ "zone": zone,
+ "url": url,
+ "format": response_format,
+ "method": method,
+ "data_format": data_format
+ }
+
+ params = {}
+ if async_request:
+ params['async'] = 'true'
+
+ @retry_request(
+ max_retries=self.max_retries,
+ backoff_factor=self.retry_backoff,
+ retry_statuses={429, 500, 502, 503, 504}
+ )
+ def make_request():
+ return self.session.post(
+ endpoint,
+ json=payload,
+ params=params,
+ timeout=timeout
+ )
+
+ try:
+ response = make_request()
+ response_time = (time.time() - start_time) * 1000
+
+ # Log request details
+ log_request(logger, 'POST', endpoint, response.status_code, response_time)
+
+ if response.status_code == 200:
+ logger.info(f"Scrape completed successfully in {response_time:.2f}ms")
+
+ validate_response_size(response.text)
+
+ if response_format == "json":
+ result = safe_json_parse(response.text)
+ logger.debug(f"Processed response with {len(str(result))} characters")
+ return result
+ else:
+ logger.debug(f"Returning raw response with {len(response.text)} characters")
+ return response.text
+
+ elif response.status_code == 400:
+ logger.error(f"Bad Request (400) for URL {url}: {response.text}")
+ raise ValidationError(f"Bad Request (400): {response.text}")
+ elif response.status_code == 401:
+ logger.error(f"Unauthorized (401) for URL {url}: Check API token")
+ raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+ elif response.status_code == 403:
+ logger.error(f"Forbidden (403) for URL {url}: Insufficient permissions")
+ raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+ elif response.status_code == 404:
+ logger.error(f"Not Found (404) for URL {url}: {response.text}")
+ raise APIError(f"Not Found (404): {response.text}")
+ else:
+ logger.error(f"API Error ({response.status_code}) for URL {url}: {response.text}")
+ raise APIError(f"API Error ({response.status_code}): {response.text}",
+ status_code=response.status_code, response_text=response.text)
+
+ except Exception as e:
+ response_time = (time.time() - start_time) * 1000
+ logger.error(f"Request failed after {response_time:.2f}ms for URL {url}: {str(e)}", exc_info=True)
+ raise
\ No newline at end of file
diff --git a/src/api/search.py b/src/api/search.py
new file mode 100644
index 0000000..24e6365
--- /dev/null
+++ b/src/api/search.py
@@ -0,0 +1,212 @@
+import json
+import time
+from typing import Union, Dict, Any, List
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import quote_plus
+
+from ..utils import (
+ validate_zone_name, validate_country_code, validate_timeout,
+ validate_max_workers, validate_search_engine, validate_query,
+ validate_response_format, validate_http_method, retry_request,
+ get_logger, log_request, safe_json_parse, validate_response_size
+)
+from ..exceptions import ValidationError, APIError, AuthenticationError
+
+logger = get_logger('api.search')
+
+
+class SearchAPI:
+ """Handles search operations using Bright Data SERP API"""
+
+ def __init__(self, session, default_timeout=30, max_retries=3, retry_backoff=1.5):
+ self.session = session
+ self.default_timeout = default_timeout
+ self.max_retries = max_retries
+ self.retry_backoff = retry_backoff
+
+ def search(
+ self,
+ query: Union[str, List[str]],
+ search_engine: str = "google",
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "markdown",
+ async_request: bool = False,
+ max_workers: int = 10,
+ timeout: int = None,
+ parse: bool = False
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ ## Search the web using Bright Data SERP API
+
+ Performs web searches through major search engines using Bright Data's proxy network
+ for reliable, bot-detection-free results.
+
+ ### Parameters:
+ - `query` (str | List[str]): Search query string or list of search queries
+ - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`)
+ - `zone` (str, optional): Your Bright Data zone identifier (default: `None`)
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+        - `country` (str, optional): Two-letter ISO country code for proxy location (default: `""`, which uses the fastest available location)
+ - `data_format` (str, optional): Additional format transformation (default: `"markdown"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `30`)
+ - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`)
+
+ ### Returns:
+ - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query
+
+ ### Example Usage:
+ ```python
+ # Single search query
+ result = client.search(
+ query="best laptops 2024",
+ search_engine="google",
+ response_format="json"
+ )
+
+ # Multiple search queries
+ queries = ["python tutorials", "machine learning courses", "web development"]
+ results = client.search(
+ query=queries,
+ search_engine="bing",
+ zone="your_zone_name",
+ max_workers=3
+ )
+ ```
+
+ ### Supported Search Engines:
+ - `"google"` - Google Search
+ - `"bing"` - Microsoft Bing
+ - `"yandex"` - Yandex Search
+
+ ### Raises:
+ - `ValidationError`: Invalid search engine, empty query, or validation errors
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
+ """
+
+ timeout = timeout or self.default_timeout
+ validate_zone_name(zone)
+ validate_search_engine(search_engine)
+ validate_query(query)
+ validate_response_format(response_format)
+ validate_http_method(method)
+ validate_country_code(country)
+ validate_timeout(timeout)
+ validate_max_workers(max_workers)
+
+ base_url_map = {
+ "google": "https://www.google.com/search?q=",
+ "bing": "https://www.bing.com/search?q=",
+ "yandex": "https://yandex.com/search/?text="
+ }
+
+ base_url = base_url_map[search_engine.lower()]
+
+ if isinstance(query, list):
+ effective_max_workers = min(len(query), max_workers or 10)
+ results = [None] * len(query)
+
+ with ThreadPoolExecutor(max_workers=effective_max_workers) as executor:
+ future_to_index = {
+ executor.submit(
+ self._perform_single_search,
+ single_query, zone, response_format, method, country,
+ data_format, async_request, base_url, timeout, parse
+ ): i
+ for i, single_query in enumerate(query)
+ }
+
+ for future in as_completed(future_to_index):
+ index = future_to_index[future]
+ try:
+ result = future.result()
+ results[index] = result
+ except Exception as e:
+ raise APIError(f"Failed to search '{query[index]}': {str(e)}")
+
+ return results
+ else:
+ return self._perform_single_search(
+ query, zone, response_format, method, country,
+ data_format, async_request, base_url, timeout, parse
+ )
+
+ def _perform_single_search(
+ self,
+ query: str,
+ zone: str,
+ response_format: str,
+ method: str,
+ country: str,
+ data_format: str,
+ async_request: bool,
+ base_url: str,
+ timeout: int,
+ parse: bool
+ ) -> Union[Dict[str, Any], str]:
+ """
+ Perform a single search operation
+ """
+ encoded_query = quote_plus(query)
+ url = f"{base_url}{encoded_query}"
+
+ if parse:
+ url += "&brd_json=1"
+
+ endpoint = "https://api.brightdata.com/request"
+
+ payload = {
+ "zone": zone,
+ "url": url,
+ "format": response_format,
+ "method": method,
+ "data_format": data_format
+ }
+
+ params = {}
+ if async_request:
+ params['async'] = 'true'
+
+ @retry_request(
+ max_retries=self.max_retries,
+ backoff_factor=self.retry_backoff,
+ retry_statuses={429, 500, 502, 503, 504}
+ )
+ def make_request():
+ return self.session.post(
+ endpoint,
+ json=payload,
+ params=params,
+ timeout=timeout
+ )
+
+ response = make_request()
+
+ if response.status_code == 200:
+ if response_format == "json":
+ try:
+ return response.json()
+ except json.JSONDecodeError as e:
+ logger.warning(f"Failed to parse JSON response: {e}")
+ return response.text
+ else:
+ return response.text
+
+ elif response.status_code == 400:
+ raise ValidationError(f"Bad Request (400): {response.text}")
+ elif response.status_code == 401:
+ raise AuthenticationError(f"Unauthorized (401): Check your API token. {response.text}")
+ elif response.status_code == 403:
+ raise AuthenticationError(f"Forbidden (403): Insufficient permissions. {response.text}")
+ elif response.status_code == 404:
+ raise APIError(f"Not Found (404): {response.text}")
+ else:
+ raise APIError(f"API Error ({response.status_code}): {response.text}",
+ status_code=response.status_code, response_text=response.text)
\ No newline at end of file
diff --git a/src/client.py b/src/client.py
new file mode 100644
index 0000000..1d4f794
--- /dev/null
+++ b/src/client.py
@@ -0,0 +1,817 @@
+import os
+import re
+import time
+import json
+import requests
+from .search import Search
+from datetime import datetime
+from .api.crawl import CrawlAPI
+from .api.chatgpt import ChatGPTAPI
+from .api.extract import ExtractAPI
+from .api.download import DownloadAPI
+from .api import WebScraper, SearchAPI
+from typing import Union, Dict, Any, List
+from .exceptions import ValidationError, AuthenticationError, APIError, NetworkError
+from .api.linkedin import LinkedInAPI, LinkedInScraper, LinkedInSearcher
+from .utils import ZoneManager, setup_logging, get_logger, parse_content
+
+def _get_version():
+ """Get version from __init__.py, cached at module import time."""
+ try:
+ init_file = os.path.join(os.path.dirname(__file__), '__init__.py')
+ with open(init_file, 'r', encoding='utf-8') as f:
+ for line in f:
+ if line.startswith('__version__'):
+ return line.split('"')[1]
+ except (OSError, IndexError):
+ pass
+ return "unknown"
+
+__version__ = _get_version()
+
+logger = get_logger('client')
+
+
+class bdclient:
+ """Main client for the Bright Data SDK"""
+
+ DEFAULT_MAX_WORKERS = 10
+ DEFAULT_TIMEOUT = 30
+ CONNECTION_POOL_SIZE = 20
+ MAX_RETRIES = 3
+ RETRY_BACKOFF_FACTOR = 1.5
+ RETRY_STATUSES = {429, 500, 502, 503, 504}
+
+ def __init__(
+ self,
+ api_token: str = None,
+ auto_create_zones: bool = True,
+ web_unlocker_zone: str = None,
+ serp_zone: str = None,
+ browser_zone: str = None,
+ browser_username: str = None,
+ browser_password: str = None,
+ browser_type: str = "playwright",
+ log_level: str = "INFO",
+ structured_logging: bool = True,
+ verbose: bool = None
+ ):
+ """
+ Initialize the Bright Data client with your API token.
+
+ Create an account at https://brightdata.com/ to get your API token.
+ Go to Settings > API Keys and verify that your key has "Admin" permissions.
+
+ Args:
+ api_token: Your Bright Data API token (or set BRIGHTDATA_API_TOKEN env var)
+ auto_create_zones: Auto-create required zones if missing (default: True)
+ web_unlocker_zone: Custom Web Unlocker zone name (default: 'sdk_unlocker')
+ serp_zone: Custom SERP zone name (default: 'sdk_serp')
+ browser_zone: Custom Browser zone name (default: 'sdk_browser')
+ browser_username: Browser API username ("username-zone-{zone_name}")
+ browser_password: Browser API password
+ browser_type: "playwright", "puppeteer", or "selenium" (default: "playwright")
+ log_level: Logging level
+ structured_logging: Enable structured JSON logging
+ verbose: When True, show all logs per log_level. Can also use BRIGHTDATA_VERBOSE env var.
+ """
+
+ try:
+ from dotenv import load_dotenv
+ load_dotenv()
+ except ImportError:
+ pass
+
+ if verbose is None:
+ env_verbose = os.getenv('BRIGHTDATA_VERBOSE', '').lower()
+ verbose = env_verbose in ('true', '1', 'yes', 'on')
+
+ setup_logging(log_level, structured_logging, verbose)
+ logger.info("Initializing Bright Data SDK client")
+
+ # API Token Validation
+ self.api_token = api_token or os.getenv('BRIGHTDATA_API_TOKEN')
+ if not self.api_token:
+ logger.error("API token not provided")
+ raise ValidationError(
+ "API token is required. Pass api_token or set BRIGHTDATA_API_TOKEN env var."
+ )
+
+ if not isinstance(self.api_token, str):
+ logger.error("API token must be a string")
+ raise ValidationError("API token must be a string")
+
+ if len(self.api_token.strip()) < 10:
+ logger.error("API token appears to be invalid (too short)")
+ raise ValidationError("API token appears to be invalid")
+
+ token_preview = f"{self.api_token[:4]}***{self.api_token[-4:]}"
+ logger.info(f"API token validated successfully: {token_preview}")
+
+ self.web_unlocker_zone = web_unlocker_zone or os.getenv('WEB_UNLOCKER_ZONE', 'sdk_unlocker')
+ self.serp_zone = serp_zone or os.getenv('SERP_ZONE', 'sdk_serp')
+ self.browser_zone = browser_zone or os.getenv('BROWSER_ZONE', 'sdk_browser')
+ self.auto_create_zones = auto_create_zones
+
+ self.browser_username = browser_username or os.getenv('BRIGHTDATA_BROWSER_USERNAME')
+ self.browser_password = browser_password or os.getenv('BRIGHTDATA_BROWSER_PASSWORD')
+
+ valid_browser_types = ["playwright", "puppeteer", "selenium"]
+ if browser_type not in valid_browser_types:
+ raise ValidationError(
+ f"Invalid browser_type '{browser_type}'. Must be one of: {valid_browser_types}"
+ )
+ self.browser_type = browser_type
+
+ if self.browser_username and self.browser_password:
+ browser_preview = f"{self.browser_username[:3]}***"
+ logger.info(f"Browser credentials configured: {browser_preview} (type: {self.browser_type})")
+ elif self.browser_username or self.browser_password:
+ logger.warning("Incomplete browser credentials: both username and password are required.")
+ else:
+ logger.debug("No browser credentials provided - browser API will not be available.")
+
+ self.session = requests.Session()
+ self.session.headers.update({
+ 'Authorization': f'Bearer {self.api_token}',
+ 'Content-Type': 'application/json',
+ 'User-Agent': f'brightdata-sdk/{__version__}'
+ })
+ logger.info("HTTP session configured with secure headers")
+
+ adapter = requests.adapters.HTTPAdapter(
+ pool_connections=self.CONNECTION_POOL_SIZE,
+ pool_maxsize=self.CONNECTION_POOL_SIZE,
+ max_retries=0
+ )
+ self.session.mount('https://', adapter)
+ self.session.mount('http://', adapter)
+
+ self.zone_manager = ZoneManager(self.session)
+
+ self.web_scraper = WebScraper(
+ self.session,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+
+ self.search_api = SearchAPI(
+ self.session,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+
+ self.chatgpt_api = ChatGPTAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+
+ self.linkedin_api = LinkedInAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+
+ self.download_api = DownloadAPI(self.session, self.api_token, self.DEFAULT_TIMEOUT)
+
+ self.crawl_api = CrawlAPI(
+ self.session,
+ self.api_token,
+ self.DEFAULT_TIMEOUT,
+ self.MAX_RETRIES,
+ self.RETRY_BACKOFF_FACTOR
+ )
+
+ self.extract_api = ExtractAPI(self)
+
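+        # Search helper object; note that assigning it to `self.search` shadows the
+        # bound `search` method defined below, so `client.search(...)` resolves to this object.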
+ self.search = Search(self)
+
+ if self.auto_create_zones:
+ self.zone_manager.ensure_required_zones(
+ self.web_unlocker_zone,
+ self.serp_zone
+ )
+
+
+ def scrape(
+ self,
+ url: Union[str, List[str]],
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "html",
+ async_request: bool = False,
+ max_workers: int = None,
+ timeout: int = None
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ ## Unlock and scrape websites using Bright Data Web Unlocker API
+
+ Scrapes one or multiple websites using Bright Data's Web Unlocker and proxy network.
+ Automatically handles bot-detection, CAPTCHAs, and retries.
+
+ ### Parameters:
+ - `url` (str | List[str]): Single URL string or list of URLs to scrape
+ - `zone` (str, optional): Zone identifier (default: auto-configured web_unlocker_zone)
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+ - `country` (str, optional): Two-letter ISO country code for proxy location (defaults to fastest connection)
+ - `data_format` (str, optional): Additional format transformation (default: `"html"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple URLs (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `30`)
+
+ ### Returns:
+ - Single URL: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple URLs: `List[Union[Dict[str, Any], str]]` corresponding to each input URL
+
+ ### Raises:
+ - `ValidationError`: Invalid URL or parameters
+ - `APIError`: Scraping failed (non-2xx response or server error)
+ """
+
+ # URL validation
+
+ if not url:
+ raise ValidationError("The 'url' parameter cannot be None or empty.")
+
+ if isinstance(url, str):
+ if not url.strip():
+ raise ValidationError("The 'url' string cannot be empty or whitespace.")
+ elif isinstance(url, list):
+ if len(url) == 0:
+ raise ValidationError("URL list cannot be empty")
+ if any((not isinstance(u, str) or not u.strip()) for u in url):
+ raise ValidationError("All URLs in the list must be non-empty strings")
+
+ result = self.web_scraper.scrape(
+ url, zone or self.web_unlocker_zone, response_format, method, country,
+ data_format, async_request, max_workers or self.DEFAULT_MAX_WORKERS, timeout or self.DEFAULT_TIMEOUT
+ )
+ return result
+
+ def search(
+ self,
+ query: Union[str, List[str]],
+ search_engine: str = "google",
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "html",
+ async_request: bool = False,
+ max_workers: int = None,
+ timeout: int = None,
+ parse: bool = False
+ ) -> Union[Dict[str, Any], str, List[Union[Dict[str, Any], str]]]:
+ """
+ ## Perform web search using Bright Data's SERP
+        ## Perform web search using Bright Data's SERP API
+ ### Parameters:
+ - `query` (str | List[str]): Search query string or list of search queries
+ - `search_engine` (str, optional): Search engine to use - `"google"`, `"bing"`, or `"yandex"` (default: `"google"`)
+ - `zone` (str, optional): Zone identifier (default: auto-configured serp_zone)
+ - `response_format` (str, optional): Response format - `"json"` for structured data, `"raw"` for HTML string (default: `"raw"`)
+ - `method` (str, optional): HTTP method for the request (default: `"GET"`)
+        - `country` (str, optional): Two-letter ISO country code for proxy location (default: `""`, which uses the fastest available location)
+ - `data_format` (str, optional): Additional format transformation (default: `"html"`)
+ - `async_request` (bool, optional): Enable asynchronous processing (default: `False`)
+ - `max_workers` (int, optional): Maximum parallel workers for multiple queries (default: `10`)
+ - `timeout` (int, optional): Request timeout in seconds (default: `30`)
+ - `parse` (bool, optional): Enable JSON parsing by adding brd_json=1 to URL (default: `False`)
+
+ ### Returns:
+ - Single query: `Dict[str, Any]` if `response_format="json"`, `str` if `response_format="raw"`
+ - Multiple queries: `List[Union[Dict[str, Any], str]]` corresponding to each input query
+
+ ### Raises:
+ - `ValidationError`: Query is missing or invalid
+ - `APIError`: Search request failed or returned an error
+ """
+
+ # Query validation
+
+ if not query:
+ raise ValidationError("query cannot be empty")
+ if isinstance(query, str):
+ if not query.strip():
+ raise ValidationError("Search query cannot be empty or whitespace")
+ elif isinstance(query, list):
+ if len(query) == 0:
+ raise ValidationError("Query list cannot be empty")
+ for q in query:
+ if not isinstance(q, str) or not q.strip():
+ raise ValidationError("All queries in the list must be non-empty strings")
+
+ # Validate search engine
+
+ search_engine = (search_engine or "google").strip().lower()
+ valid_engines = ["google", "bing", "yandex"]
+ if search_engine not in valid_engines:
+ raise ValidationError(f"Invalid search engine '{search_engine}'. Valid options: {', '.join(valid_engines)}")
+
+ zone = zone or self.serp_zone
+ max_workers = max_workers or self.DEFAULT_MAX_WORKERS
+
+ result = self.search_api.search(
+ query=query,
+ search_engine=search_engine,
+ zone=zone or self.serp_zone,
+ response_format=response_format,
+ method=method,
+ country=country,
+ data_format=data_format,
+ async_request=async_request,
+ max_workers=max_workers,
+ timeout=timeout or self.DEFAULT_TIMEOUT,
+ parse=parse,
+ )
+
+ return result
+
+ def download_content(self, content: Union[Dict, str], filename: str = None, format: str = "json", parse: bool = False) -> str:
+ """
+ ## Download content to a file based on its format
+
+ ### Args:
+ content: The content to download (dict for JSON, string for other formats)
+ filename: Optional filename. If not provided, generates one with timestamp
+ format: Format of the content ("json", "csv", "ndjson", "jsonl", "txt")
+ parse: If True, automatically parse JSON strings in 'body' fields to objects (default: False)
+
+ ### Returns:
+ The file path of the saved file.
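+
+        ### Example Usage:
+        A minimal sketch; the query and filename are placeholders:
+        ```python
+        results = client.search("pizza restaurants", response_format="json")
+        path = client.download_content(results, filename="serp_results.json", format="json")
+        print(f"Saved to {path}")
+        ```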
+ """
+ if not content:
+ raise ValidationError("Content is empty or None")
+        return self.download_api.download_content(content, filename, format, parse)
+
+
+ def search_gpt(
+ self,
+ prompt: Union[str, List[str]],
+ country: Union[str, List[str]] = None,
+ additional_prompt: Union[str, List[str]] = None,
+ web_search: Union[bool, List[bool]] = False,
+ sync: bool = True,
+ timeout: int = None,
+ **kwargs
+ ) -> Dict[str, Any]:
+
+ """
+ ## Search ChatGPT responses using Bright Data's ChatGPT dataset API
+
+ Sends one or multiple prompts to ChatGPT through Bright Data's proxy network
+ with support for both synchronous and asynchronous processing.
+
+ ### Parameters:
+ - `prompt` (str | List[str]): Single prompt string or list of prompts to send to ChatGPT
+ - `country` (str | List[str], optional): Two-letter ISO country code(s) for proxy location (default: "")
+ - `additional_prompt` (str | List[str], optional): Follow-up prompt(s) after receiving the first answer (default: "")
+ - `web_search` (bool | List[bool], optional): Whether to click the web search button in ChatGPT (default: False)
+        - `sync` (bool, optional): If True (default), returns data immediately. If False, returns snapshot_id for async processing
+        - `timeout` (int, optional): Request timeout in seconds, between 1 and 300 (default: 65 for sync, 30 for async)
+
+ ### Returns:
+ - `Dict[str, Any]`: If sync=True, returns ChatGPT response data directly. If sync=False, returns response with snapshot_id for async processing
+
+ ### Raises:
+ - `ValidationError`: Invalid prompt or parameters
+ - `AuthenticationError`: Invalid API token or insufficient permissions
+ - `APIError`: Request failed or server error
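+
+        ### Example Usage:
+        A minimal sketch; the prompts are placeholders:
+        ```python
+        # Synchronous single prompt
+        result = client.search_gpt("Top 5 programming languages in 2024")
+
+        # Asynchronous batch with web search enabled; poll the snapshot later
+        job = client.search_gpt(
+            ["Best hiking trails in Italy", "Best beaches in Greece"],
+            web_search=True,
+            sync=False
+        )
+        snapshot_id = job["snapshot_id"]
+        ```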
+ """
+
+ # Handle alternate parameter names from kwargs
+
+ if 'secondaryPrompt' in kwargs:
+ additional_prompt = kwargs.pop('secondaryPrompt')
+ if 'additionalPrompt' in kwargs:
+ additional_prompt = kwargs.pop('additionalPrompt')
+ if 'webSearch' in kwargs:
+ web_search = kwargs.pop('webSearch')
+
+ # Validate prompt input
+
+ if (isinstance(prompt, list) and len(prompt) == 0) or prompt is None:
+ raise ValidationError("prompt is required")
+
+ # Ensure prompts list
+
+ prompts = prompt if isinstance(prompt, list) else [prompt]
+
+ # Validate each prompt is a non-empty string
+
+ for p in prompts:
+ if not isinstance(p, str) or not p.strip():
+ raise ValidationError("All prompts must be non-empty strings")
+
+ def normalize_param(param, name):
+ if param is None:
+ return [None] * len(prompts)
+ if isinstance(param, list):
+ if len(param) != len(prompts):
+ raise ValidationError(f"Length of {name} list must match number of prompts")
+ return param
+ return [param] * len(prompts)
+
+ countries = normalize_param(country, "country")
+ followups = normalize_param(additional_prompt, "additional_prompt")
+ web_searches = normalize_param(web_search, "web_search")
+
+ # Validate country codes
+ for i, c in enumerate(countries):
+ if c is None or str(c).strip() == "":
+ countries[i] = ""
+ else:
+ if not isinstance(c, str) or len(c.strip()) != 2 or not c.strip().isalpha():
+                    raise ValidationError("Country code must be a 2-letter ISO code (e.g. 'us', 'gb')")
+ countries[i] = c.strip().lower()
+ # Validate follow-up prompts
+ for i, f in enumerate(followups):
+ if f is None:
+ followups[i] = ""
+ elif not isinstance(f, str):
+ raise ValidationError("All follow-up prompts must be strings")
+ else:
+ followups[i] = f.strip()
+ # Validate web_search flags
+ for i, w in enumerate(web_searches):
+ if w is None:
+ web_searches[i] = False
+ elif not isinstance(w, bool):
+                raise ValidationError("web_search must be a boolean or a list of booleans")
+
+ timeout_value = timeout if timeout is not None else (65 if sync else 30)
+
+ if timeout is not None:
+ if not isinstance(timeout, int):
+ raise ValidationError("Timeout must be an integer")
+ if timeout <= 0:
+ raise ValidationError("Timeout must be greater than 0 seconds")
+ if timeout > 300:
+ raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)")
+ # Prepare request payload
+ tasks = []
+ for i in range(len(prompts)):
+ task = {
+ "url": "https://chatgpt.com",
+ "prompt": prompts[i].strip(),
+ "country": countries[i] or "",
+ "additional_prompt": followups[i] or "",
+ "web_search": bool(web_searches[i])
+ }
+ tasks.append(task)
+ payload_data = tasks[0] if len(tasks) == 1 else tasks
+ # Make API request with retries
+ endpoint = "https://api.brightdata.com/datasets/v3/scrape" if sync else "https://api.brightdata.com/datasets/v3/trigger"
+ params = {
+ "dataset_id": "gd_m7aof0k82r803d5bjm",
+ "include_errors": "true"
+ }
+ last_exception = None
+ for attempt in range(self.MAX_RETRIES + 1):
+ try:
+                response = self.session.post(endpoint, params=params, json=payload_data, timeout=timeout_value)
+ except requests.exceptions.RequestException as e:
+ last_exception = e
+ if attempt >= self.MAX_RETRIES:
+ raise NetworkError(f"Network error: {e}")
+ # Retry on network errors
+ time.sleep(self.RETRY_BACKOFF_FACTOR ** attempt)
+ continue
+ if response.status_code == 401:
+ raise AuthenticationError("Invalid API token or unauthorized")
+ if response.status_code in self.RETRY_STATUSES:
+ if attempt >= self.MAX_RETRIES:
+                    raise APIError(f"ChatGPT search failed after {self.MAX_RETRIES + 1} attempts with status {response.status_code}: {response.text}", status_code=response.status_code, response_text=response.text)
+ time.sleep(self.RETRY_BACKOFF_FACTOR ** attempt)
+ continue
+ if response.status_code != 200:
+ raise APIError(f"ChatGPT search failed with status {response.status_code}: {response.text}", status_code=response.status_code, response_text=getattr(response, 'text', ''))
+ # Success
+ result_data = response.json()
+ if sync:
+ return result_data
+ snapshot_id = result_data.get("snapshot_id") or result_data.get("id")
+ if snapshot_id:
+ print(f"Snapshot ID: {snapshot_id}")
+ return {"snapshot_id": snapshot_id}
+ else:
+ raise APIError("Failed to retrieve snapshot ID from response", status_code=response.status_code, response_text=response.text)
+
+
+ @property
+ def scrape_linkedin(self):
+ """
+ ## LinkedIn Data Scraping Interface
+
+ Provides specialized methods for scraping different types of LinkedIn data
+ using Bright Data's collect API with pre-configured dataset IDs.
+
+ ### Available Methods:
+ - `profiles(url)` - Scrape LinkedIn profile data
+ - `companies(url)` - Scrape LinkedIn company data
+ - `jobs(url)` - Scrape LinkedIn job listing data
+ - `posts(url)` - Scrape LinkedIn post content
+
+ ### Example Usage:
+ ```python
+ # Scrape LinkedIn profiles
+ result = client.scrape_linkedin.profiles("https://www.linkedin.com/in/username/")
+
+ # Scrape multiple companies
+ companies = [
+ "https://www.linkedin.com/company/ibm",
+ "https://www.linkedin.com/company/bright-data"
+ ]
+ result = client.scrape_linkedin.companies(companies)
+
+ # Scrape job listings
+ result = client.scrape_linkedin.jobs("https://www.linkedin.com/jobs/view/123456/")
+
+ # Scrape posts
+ result = client.scrape_linkedin.posts("https://www.linkedin.com/posts/user-activity-123/")
+ ```
+
+ ### Returns:
+ Each method returns a `Dict[str, Any]` containing snapshot_id and metadata for tracking the request.
+ Use the snapshot_id with `download_snapshot()` to retrieve the collected data.
+ """
+ if not hasattr(self, '_linkedin_scraper'):
+ self._linkedin_scraper = LinkedInScraper(self.linkedin_api)
+ return self._linkedin_scraper
+
+ @property
+ def search_linkedin(self):
+ """
+ ## LinkedIn Data Search Interface
+
+ Provides specialized methods for discovering new LinkedIn data by various search criteria
+ using Bright Data's collect API with pre-configured dataset IDs.
+
+ ### Available Methods:
+ - `profiles(first_name, last_name)` - Search LinkedIn profiles by name
+ - `jobs(url=..., location=...)` - Search LinkedIn jobs by URL or keyword criteria
+ - `posts(profile_url=..., company_url=..., url=...)` - Search LinkedIn posts by various methods
+
+ ### Example Usage:
+ ```python
+ # Search profiles by name
+ result = client.search_linkedin.profiles("James", "Smith")
+
+ # Search jobs by location and keywords
+ result = client.search_linkedin.jobs(
+ location="Paris",
+ keyword="product manager",
+ country="FR"
+ )
+
+ # Search posts by profile URL with date range
+ result = client.search_linkedin.posts(
+ profile_url="https://www.linkedin.com/in/username",
+ start_date="2018-04-25T00:00:00.000Z",
+ end_date="2021-05-25T00:00:00.000Z"
+ )
+ ```
+
+ ### Returns:
+ Each method returns a `Dict[str, Any]` containing snapshot_id (async) or direct data (sync) for tracking the request.
+ Use the snapshot_id with `download_snapshot()` to retrieve the collected data.
+ """
+ if not hasattr(self, '_linkedin_searcher'):
+ self._linkedin_searcher = LinkedInSearcher(self.linkedin_api)
+ return self._linkedin_searcher
+
+ def download_snapshot(
+ self,
+ snapshot_id: str,
+ response_format: str = "json",
+ compress: bool = False,
+ batch_size: int = None,
+ part: int = None
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]], str]:
+ """
+ ## Download snapshot content from Bright Data dataset API
+
+        Downloads the snapshot content using the snapshot ID returned from search_gpt(sync=False), crawl(),
+        the LinkedIn scrape/search methods, or other dataset collection triggers.
+
+ ### Parameters:
+ - `snapshot_id` (str): The snapshot ID returned when collection was triggered (required)
+ - `response_format` (str, optional): Format of the output data: "json", "csv", "ndjson", "jsonl" (default: "json")
+ - `compress` (bool, optional): Whether the result should be compressed (default: False)
+ - `batch_size` (int, optional): Divide into batches of X records (minimum: 1000)
+ - `part` (int, optional): If batch_size provided, specify which part to download
+
+ ### Returns:
+ - `Union[Dict, List, str]`: Snapshot data in the requested format, OR
+ - `Dict`: Status response if snapshot is not ready yet (status="not_ready")
+
+
+ ### Raises:
+ - `ValidationError`: Invalid parameters or snapshot_id format
+ - `APIError`: Request failed, snapshot not found, or server error
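+
+        ### Example Usage:
+        A minimal sketch; the snapshot ID is a placeholder:
+        ```python
+        data = client.download_snapshot("s_m4x7enmven8djfqak")
+        if isinstance(data, dict) and data.get("status") == "not_ready":
+            print("Snapshot is still processing, try again later")
+        ```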
+ """
+
+ # snapshot_id validation
+
+ if not snapshot_id or not isinstance(snapshot_id, str):
+ raise ValidationError("The 'snapshot_id' parameter must be a non-empty string.")
+ if not snapshot_id.startswith("s_"):
+ raise ValidationError("Invalid 'snapshot_id' format. Expected an ID starting with 's_' (e.g., 's_m4x7enmven8djfqak').")
+
+ # format validation
+
+ allowed_formats = {"json", "ndjson", "jsonl", "csv"}
+        if response_format not in allowed_formats:
+            raise ValidationError(
+                f"Invalid 'response_format' value: '{response_format}'. Must be one of {sorted(allowed_formats)}."
+            )
+
+ return self.download_api.download_snapshot(snapshot_id, response_format, compress, batch_size, part)
+
+
+ def list_zones(self) -> List[Dict[str, Any]]:
+ """
+ ## List all active zones in your Bright Data account
+
+ ### Returns:
+ List of zone dictionaries with their configurations
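+
+        ### Example Usage:
+        A minimal sketch; the exact keys in each zone dictionary depend on your account configuration:
+        ```python
+        for zone in client.list_zones():
+            print(zone)
+        ```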
+ """
+ return self.zone_manager.list_zones()
+
+ def connect_browser(self) -> str:
+ """
+ ## Get WebSocket endpoint URL for connecting to Bright Data's scraping browser
+
+ Returns the WebSocket endpoint URL that can be used with Playwright or Selenium
+ to connect to Bright Data's scraping browser service.
+
+ **Security Warning:** The returned URL contains authentication credentials. Do not share this URL or expose it publicly.
+
+ ### Returns:
+ WebSocket URL (str) for connecting to the browser (contains one-time token)
+
+ ### Raises:
+ - `AuthenticationError`: If the API token or browser zone credentials are invalid
+ - `APIError`: If retrieving the browser endpoint fails
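+
+        ### Example Usage:
+        A minimal Playwright sketch, assuming the default `browser_type="playwright"`, an installed
+        `playwright` package, and configured browser credentials; the target URL is a placeholder:
+        ```python
+        from playwright.sync_api import sync_playwright
+
+        ws_endpoint = client.connect_browser()
+        with sync_playwright() as p:
+            browser = p.chromium.connect_over_cdp(ws_endpoint)
+            page = browser.new_page()
+            page.goto("https://example.com")
+            print(page.title())
+            browser.close()
+        ```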
+ """
+
+ if not self.browser_username or not self.browser_password:
+ logger.error("Browser credentials not configured")
+ raise ValidationError(
+ "Browser credentials are required. Provide browser_username and browser_password "
+ "parameters or set BRIGHTDATA_BROWSER_USERNAME and BRIGHTDATA_BROWSER_PASSWORD "
+ "environment variables."
+ )
+
+ if not isinstance(self.browser_username, str) or not isinstance(self.browser_password, str):
+ logger.error("Browser credentials must be strings")
+ raise ValidationError("Browser username and password must be strings")
+
+ if len(self.browser_username.strip()) == 0 or len(self.browser_password.strip()) == 0:
+ logger.error("Browser credentials cannot be empty")
+ raise ValidationError("Browser username and password cannot be empty")
+
+ auth_string = f"{self.browser_username}:{self.browser_password}"
+
+ if self.browser_type == "selenium":
+ endpoint_url = f"https://{auth_string}@brd.superproxy.io:9515"
+ logger.debug(f"Browser endpoint URL: https://***:***@brd.superproxy.io:9515")
+ else:
+ endpoint_url = f"wss://{auth_string}@brd.superproxy.io:9222"
+ logger.debug(f"Browser endpoint URL: wss://***:***@brd.superproxy.io:9222")
+
+ logger.info(f"Generated {self.browser_type} connection endpoint for user: {self.browser_username[:3]}***")
+
+ return endpoint_url
+
+ def crawl(
+ self,
+ url: Union[str, List[str]],
+ ignore_sitemap: bool = None,
+ depth: int = None,
+ include_filter: str = None,
+ exclude_filter: str = None,
+ custom_output_fields: List[str] = None,
+ include_errors: bool = True
+ ) -> Dict[str, Any]:
+ """
+ ## Crawl websites using Bright Data's Web Crawl API
+
+ Performs web crawling to discover and scrape multiple pages from a website
+ starting from the specified URL(s). Returns a snapshot_id for tracking the crawl progress.
+
+ ### Parameters:
+ - `url` (str | List[str]): Starting URL or URLs to crawl
+ - `ignore_sitemap` (bool, optional): If True, ignore site's sitemap (default: False)
+ - `depth` (int, optional): Maximum crawl depth (number of hops from start URL)
+ - `include_filter` (str, optional): Only crawl URLs that include this substring (default: None)
+ - `exclude_filter` (str, optional): Do not crawl URLs that include this substring
+ - `custom_output_fields` (List[str], optional): Additional data fields to return (e.g., ["markdown","text","title"])
+ - `include_errors` (bool, optional): If True, include pages that errored in results (default: True)
+
+ ### Returns:
+ - A dict containing the crawl job details, including a `snapshot_id` to retrieve results via download_snapshot()
+
+ ### Raises:
+ - `ValidationError`: Missing URL or invalid parameters
+ - `APIError`: Crawl request failed
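+
+        ### Example Usage:
+        A minimal sketch; the URL and filters are placeholders:
+        ```python
+        job = client.crawl(
+            "https://example.com/",
+            depth=2,
+            include_filter="/blog/",
+            custom_output_fields=["markdown", "title"]
+        )
+        snapshot_id = job["snapshot_id"]
+        # Once the crawl finishes, fetch the results
+        results = client.download_snapshot(snapshot_id)
+        ```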
+ """
+
+ # URL validation
+
+ if not url:
+ raise ValidationError("The 'url' parameter cannot be None or empty.")
+ if isinstance(url, str):
+ if not url.strip():
+ raise ValidationError("The 'url' string cannot be empty or whitespace.")
+ elif isinstance(url, list):
+ if len(url) == 0:
+ raise ValidationError("URL list cannot be empty")
+ for u in url:
+ if not isinstance(u, str) or not u.strip():
+ raise ValidationError("All URLs in the list must be non-empty strings")
+ if depth is not None:
+ if not isinstance(depth, int):
+ raise ValidationError("Depth must be an integer")
+ if depth <= 0:
+ raise ValidationError("The 'depth' parameter must be a positive integer.")
+
+ result = self.crawl_api.crawl(
+ url, ignore_sitemap, depth, include_filter, exclude_filter, custom_output_fields, include_errors
+ )
+ return result
+
+ def parse_content(
+ self,
+ data: Union[str, Dict, List],
+ extract_text: bool = True,
+ extract_links: bool = False,
+ extract_images: bool = False
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+ """
+ ## Parse content from API responses
+
+ Extract and parse useful information from scraping, search, or crawling results.
+ Automatically detects and handles both single and multiple results from batch operations.
+
+ ### Parameters:
+ - `data` (str | Dict | List): Response data from scrape(), search(), or crawl() methods
+ - `extract_text` (bool, optional): Extract clean text content (default: True)
+ - `extract_links` (bool, optional): Extract all links from content (default: False)
+ - `extract_images` (bool, optional): Extract image URLs from content (default: False)
+
+ ### Returns:
+ - `Dict[str, Any]`: Parsed content for single results
+ - `List[Dict[str, Any]]`: List of parsed content for multiple results (auto-detected)
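+
+        ### Example Usage:
+        A minimal sketch building on a prior scrape; the URL is a placeholder and the keys of the
+        parsed result depend on the extraction flags:
+        ```python
+        html = client.scrape("https://example.com")
+        parsed = client.parse_content(html, extract_links=True, extract_images=True)
+        print(parsed)
+        ```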
+ """
+
+ return parse_content(
+ data=data,
+ extract_text=extract_text,
+ extract_links=extract_links,
+ extract_images=extract_images
+ )
+
+ def extract(self, query: str, url: Union[str, List[str]] = None, output_scheme: Dict[str, Any] = None, llm_key: str = None) -> str:
+ """
+ ## Extract specific information from websites using AI
+
+ Combines web scraping with OpenAI's language models to extract targeted information
+ from web pages based on natural language queries. Automatically parses URLs and
+ optimizes content for efficient LLM processing.
+
+ **LLM Key Notice:** If `llm_key` is not provided, the method will attempt to read
+ the OpenAI API key from the `OPENAI_API_KEY` environment variable. Ensure it is set.
+
+ ### Parameters:
+ - `query` (str): Natural language query describing what to extract. If `url` parameter is provided,
+ the extraction will run on that URL. Otherwise, a prior scrape result should be provided.
+ - `url` (str | List[str], optional): Target page URL(s) to extract information from. Can be omitted if using a prior result.
+ - `output_scheme` (Dict, optional): JSON schema defining the structure of desired output (keys and value types)
+ - `llm_key` (str, optional): OpenAI API key for LLM usage (if not provided, will use environment variable)
+
+ ### Returns:
+ - `str`: The extracted information as a text string (may contain JSON or markdown depending on query and output_scheme)
+
+ ### Raises:
+ - `ValidationError`: Missing query, missing URL, or invalid LLM key
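+
+        ### Example:
+        Illustrative sketch (assumes the `OPENAI_API_KEY` environment variable is set):
+
+            answer = client.extract(
+                query="List the product names and prices on this page",
+                url="https://example.com/store",
+            )
+            print(answer)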
+ """
+
+        # Validate LLM key; fall back to the OPENAI_API_KEY environment variable, as documented above
+        llm_key = llm_key or os.environ.get("OPENAI_API_KEY")
+        if not llm_key:
+            raise ValidationError(
+                "Missing OpenAI API key. Provide it via the `llm_key` parameter or set the "
+                "`OPENAI_API_KEY` environment variable. Example:\n\n"
+                "export OPENAI_API_KEY='your-openai-api-key'"
+            )
+
+ return self.extract_api.extract(query, url, output_scheme, llm_key)
diff --git a/src/exceptions/__init__.py b/src/exceptions/__init__.py
new file mode 100644
index 0000000..6554555
--- /dev/null
+++ b/src/exceptions/__init__.py
@@ -0,0 +1,17 @@
+from .errors import (
+ BrightDataError,
+ ValidationError,
+ AuthenticationError,
+ ZoneError,
+ NetworkError,
+ APIError
+)
+
+__all__ = [
+ 'BrightDataError',
+ 'ValidationError',
+ 'AuthenticationError',
+ 'ZoneError',
+ 'NetworkError',
+ 'APIError'
+]
\ No newline at end of file
diff --git a/src/exceptions/errors.py b/src/exceptions/errors.py
new file mode 100644
index 0000000..1cf4425
--- /dev/null
+++ b/src/exceptions/errors.py
@@ -0,0 +1,31 @@
+class BrightDataError(Exception):
+ """Base exception for all Bright Data SDK errors"""
+ pass
+
+
+class ValidationError(BrightDataError):
+ """Raised when input validation fails"""
+ pass
+
+
+class AuthenticationError(BrightDataError):
+ """Raised when API authentication fails"""
+ pass
+
+
+class ZoneError(BrightDataError):
+ """Raised when zone operations fail"""
+ pass
+
+
+class NetworkError(BrightDataError):
+ """Raised when network operations fail"""
+ pass
+
+
+class APIError(BrightDataError):
+ """Raised when API requests fail"""
+ def __init__(self, message, status_code=None, response_text=None):
+ super().__init__(message)
+ self.status_code = status_code
+ self.response_text = response_text
\ No newline at end of file
diff --git a/src/schemas/tst b/src/schemas/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/schemas/tst
@@ -0,0 +1 @@
+
diff --git a/src/search.py b/src/search.py
new file mode 100644
index 0000000..215e65d
--- /dev/null
+++ b/src/search.py
@@ -0,0 +1,305 @@
+from __future__ import annotations
+
+import re
+import time
+from .exceptions import ValidationError, APIError
+from typing import Any, Dict, List, Optional, Union
+
+
+class SearchGPTResult:
+ """
+ Wrapper for GPT search results.
+
+    Attributes:
+        raw (Any): The raw API response object (dict / list / text).
+        text (Optional[str], read-only property): Best-effort extraction of the final answer text.
+        prompt (Optional[str]): The original prompt (for a single result).
+        country (Optional[str]): Country code used for the request (single result).
+        usage (Optional[Dict[str, Any]]): Token/usage metadata, if available.
+        snapshot_id (Optional[str]): Present when sync=False (an async job was queued).
+ """
+
+ def __init__(
+ self,
+ raw: Any,
+ prompt: Optional[str] = None,
+ country: Optional[str] = None,
+ usage: Optional[Dict[str, Any]] = None,
+ snapshot_id: Optional[str] = None,
+ ) -> None:
+ self.raw = raw
+ self.prompt = prompt
+ self.country = country
+ self.usage = usage
+ self.snapshot_id = snapshot_id
+
+ # helpers
+ @staticmethod
+ def _coalesce(*vals) -> Optional[str]:
+ for v in vals:
+ if isinstance(v, str) and v.strip():
+ return v
+ return None
+
+ @staticmethod
+ def _dig(d: Any, *keys) -> Any:
+ cur = d
+ for k in keys:
+ if isinstance(cur, dict) and k in cur:
+ cur = cur[k]
+ else:
+ return None
+ return cur
+
+ @property
+ def text(self) -> Optional[str]:
+ """
+ Best-effort extraction of ONLY the final answer text.
+ Tries common fields/paths seen in ChatGPT-like payloads.
+ Returns None if not found.
+ """
+ raw = self.raw
+
+ # If API returned a plain string
+ if isinstance(raw, str):
+ return raw.strip() or None
+
+ t = self._dig(raw, "answer")
+ if isinstance(t, str):
+ return t.strip() or None
+
+ t = self._dig(raw, "data", "answer")
+ if isinstance(t, str):
+ return t.strip() or None
+
+ t = self._dig(raw, "message", "content")
+ if isinstance(t, str):
+ return t.strip() or None
+
+ choices = self._dig(raw, "choices")
+ if isinstance(choices, list) and choices:
+ content = self._dig(choices[0], "message", "content")
+ if isinstance(content, str):
+ return content.strip() or None
+
+ content = choices[0].get("text") if isinstance(choices[0], dict) else None
+ if isinstance(content, str):
+ return content.strip() or None
+
+ t = self._dig(raw, "result")
+ if isinstance(t, str):
+ return t.strip() or None
+ t = self._dig(raw, "output")
+ if isinstance(t, str):
+ return t.strip() or None
+
+ for key in ("content", "text", "final", "final_text"):
+ v = self._dig(raw, key)
+ if isinstance(v, str):
+ return v.strip() or None
+
+ return None
+
+ def to_dict(self) -> Dict[str, Any]:
+ return {
+ "prompt": self.prompt,
+ "country": self.country,
+ "usage": self.usage,
+ "snapshot_id": self.snapshot_id,
+ "raw": self.raw,
+ "text": self.text,
+ }
+
+
+class Search:
+ """
+ Namespaced search interface.
+ """
+
+ def __init__(self, client) -> None:
+ self._c = client # root client (reuses session, APIs, zones)
+
+ def __call__(
+ self,
+ query: Union[str, List[str]],
+ search_engine: str = "google",
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "html",
+ async_request: bool = False,
+ max_workers: int = None,
+ timeout: int = None,
+ parse: bool = False,
+ ):
+ return self.web(
+ query=query,
+ search_engine=search_engine,
+ zone=zone,
+ response_format=response_format,
+ method=method,
+ country=country,
+ data_format=data_format,
+ async_request=async_request,
+ max_workers=max_workers,
+ timeout=timeout,
+ parse=parse,
+ )
+
+ # GPT
+ def gpt(
+ self,
+ prompt: Union[str, List[str]],
+ country: Union[str, List[str]] = None,
+ secondary_prompt: Union[str, List[str]] = None,
+ web_search: Union[bool, List[bool]] = False,
+ sync: bool = True,
+ timeout: int = None,
+ ) -> Union[SearchGPTResult, List[SearchGPTResult]]:
+ """
+ Query ChatGPT via Bright Data's dataset API.
+
+        Returns a single SearchGPTResult for a single prompt, or a list of SearchGPTResult objects for multiple prompts.
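+
+        Example (illustrative; assumes this namespace is exposed on the client as `client.search`):
+
+            result = client.search.gpt("Summarize today's AI news", country="US")
+            print(result.text)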
+ """
+ prompts: List[str]
+ if isinstance(prompt, str):
+ prompts = [prompt]
+ elif isinstance(prompt, list) and all(isinstance(p, str) for p in prompt):
+ prompts = prompt
+ else:
+ raise ValidationError("Invalid prompt input: must be a non-empty string or list of strings.")
+ if not prompts:
+ raise ValidationError("At least one prompt is required.")
+
+ # normalization helper
+ def _norm(param, name):
+ if param is None:
+ return [None] * len(prompts)
+ if isinstance(param, list):
+ if len(param) != len(prompts):
+ raise ValidationError(f"{name} list must have the same length as prompts.")
+ return param
+ return [param] * len(prompts)
+
+ countries = _norm(country, "country")
+ secondary_prompts = _norm(secondary_prompt, "secondary_prompt")
+ web_searches = _norm(web_search, "web_search")
+
+ # validation
+ for c in countries:
+ if c and not re.match(r"^[A-Z]{2}$", c):
+ raise ValidationError(f"Invalid country code '{c}'. Must be 2 uppercase letters.")
+ for s in secondary_prompts:
+ if s is not None and not isinstance(s, str):
+ raise ValidationError("Secondary prompts must be strings.")
+ for w in web_searches:
+ if not isinstance(w, bool):
+ raise ValidationError("Web search flags must be boolean.")
+ if timeout is not None and (not isinstance(timeout, int) or timeout <= 0):
+ raise ValidationError("Timeout must be a positive integer.")
+
+ timeout = timeout or (65 if sync else 30)
+
+ # retries around API call
+ max_retries = 3
+ last_err = None
+ for attempt in range(max_retries):
+ try:
+ result = self._c.chatgpt_api.scrape_chatgpt(
+ prompts=prompts,
+ countries=countries,
+ additional_prompts=secondary_prompts,
+ web_searches=web_searches,
+ sync=sync,
+ timeout=timeout,
+ )
+ # Wrap result(s)
+ if not sync:
+ # Async: expect {"snapshot_id": "...", ...}
+ snapshot_id = result.get("snapshot_id") if isinstance(result, dict) else None
+ return SearchGPTResult(raw=result, snapshot_id=snapshot_id)
+
+ if isinstance(result, list):
+ out: List[SearchGPTResult] = []
+ if len(result) == len(prompts):
+ for i, item in enumerate(result):
+ out.append(
+ SearchGPTResult(
+ raw=item,
+ prompt=prompts[i],
+ country=countries[i],
+ usage=None,
+ )
+ )
+ else:
+ for item in result:
+ out.append(SearchGPTResult(raw=item))
+ return out[0] if len(prompts) == 1 and len(out) == 1 else out
+
+ return SearchGPTResult(raw=result, prompt=prompts[0] if len(prompts) == 1 else None)
+
+ except APIError as e:
+ last_err = e
+ if attempt < max_retries - 1:
+ time.sleep(2)
+ continue
+ raise e
+ except Exception as e:
+ if isinstance(e, (ValidationError, APIError)):
+ raise
+ last_err = e
+ if attempt < max_retries - 1:
+ time.sleep(2)
+ continue
+ raise APIError(f"Unexpected error in search.gpt: {e}") from e
+
+ if last_err:
+ raise last_err
+ raise APIError("Unknown error in search.gpt")
+
+ # Web (SERP)
+ def web(
+ self,
+ query: Union[str, List[str]],
+ search_engine: str = "google",
+ zone: str = None,
+ response_format: str = "raw",
+ method: str = "GET",
+ country: str = "",
+ data_format: str = "html",
+ async_request: bool = False,
+ max_workers: int = None,
+ timeout: int = None,
+ parse: bool = False,
+ ):
+ """
+ Web/SERP search wrapper. Thin pass-through to SearchAPI with validation.
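+
+        Example (illustrative; assumes this namespace is exposed on the client as `client.search`):
+
+            serp_html = client.search.web("best selling shoes", search_engine="bing", country="us")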
+ """
+        if not query:
+            raise ValidationError("The 'query' parameter cannot be None or empty.")
+        if isinstance(query, str):
+            if not query.strip():
+                raise ValidationError("The 'query' string cannot be empty or whitespace.")
+        elif isinstance(query, list):
+            if not all(isinstance(q, str) and q.strip() for q in query):
+                raise ValidationError("All queries in the list must be non-empty strings.")
+        else:
+            raise ValidationError("The 'query' parameter must be a string or a list of strings.")
+
+ zone = zone or self._c.serp_zone
+ max_workers = max_workers or self._c.DEFAULT_MAX_WORKERS
+
+ return self._c.search_api.search(
+ query, search_engine, zone, response_format, method, country,
+ data_format, async_request, max_workers, timeout, parse
+ )
+
+ # LinkedIn
+ @property
+ def linkedin(self):
+ """
+ Namespaced LinkedIn search helpers.
+ """
+ return self._c.search_linkedin
diff --git a/src/types/tst b/src/types/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/types/tst
@@ -0,0 +1 @@
+
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000..75a2f6c
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,35 @@
+from .validation import (
+ validate_url, validate_zone_name, validate_country_code,
+ validate_timeout, validate_max_workers, validate_url_list,
+ validate_search_engine, validate_query, validate_response_format,
+ validate_http_method
+)
+from .retry import retry_request
+from .zone_manager import ZoneManager
+from .logging_config import setup_logging, get_logger, log_request
+from .response_validator import safe_json_parse, validate_response_size, check_response_not_empty
+from .parser import parse_content, parse_multiple, extract_structured_data
+
+__all__ = [
+ 'validate_url',
+ 'validate_zone_name',
+ 'validate_country_code',
+ 'validate_timeout',
+ 'validate_max_workers',
+ 'validate_url_list',
+ 'validate_search_engine',
+ 'validate_query',
+ 'validate_response_format',
+ 'validate_http_method',
+ 'retry_request',
+ 'ZoneManager',
+ 'setup_logging',
+ 'get_logger',
+ 'log_request',
+ 'safe_json_parse',
+ 'validate_response_size',
+ 'check_response_not_empty',
+ 'parse_content',
+ 'parse_multiple',
+ 'extract_structured_data'
+]
\ No newline at end of file
diff --git a/src/utils/logging_config.py b/src/utils/logging_config.py
new file mode 100644
index 0000000..89289da
--- /dev/null
+++ b/src/utils/logging_config.py
@@ -0,0 +1,177 @@
+"""
+Structured logging configuration for Bright Data SDK
+"""
+import logging
+import json
+import time
+from typing import Dict, Any
+import uuid
+
+
+class StructuredFormatter(logging.Formatter):
+ """Custom formatter that outputs structured JSON logs"""
+
+ def __init__(self):
+ super().__init__()
+ self.start_time = time.time()
+
+ def format(self, record):
+ log_data = {
+ 'timestamp': self.formatTime(record),
+ 'level': record.levelname,
+ 'logger': record.name,
+ 'message': record.getMessage(),
+ 'module': record.module,
+ 'function': record.funcName,
+ 'line': record.lineno
+ }
+
+ correlation_id = getattr(record, 'correlation_id', None)
+ if correlation_id:
+ log_data['correlation_id'] = correlation_id
+
+ if hasattr(record, 'url'):
+ log_data['url'] = record.url
+ if hasattr(record, 'method'):
+ log_data['method'] = record.method
+ if hasattr(record, 'status_code'):
+ log_data['status_code'] = record.status_code
+ if hasattr(record, 'response_time'):
+ log_data['response_time_ms'] = record.response_time
+
+ if record.exc_info:
+ log_data['exception'] = {
+ 'type': record.exc_info[0].__name__ if record.exc_info[0] else None,
+ 'message': str(record.exc_info[1]) if record.exc_info[1] else None,
+ 'traceback': self.formatException(record.exc_info)
+ }
+
+ log_data = self._sanitize_log_data(log_data)
+
+ return json.dumps(log_data, default=str)
+
+ def _sanitize_log_data(self, log_data: Dict[str, Any]) -> Dict[str, Any]:
+ """Remove or mask sensitive information from log data"""
+ sensitive_keys = ['authorization', 'token', 'api_token', 'password', 'secret']
+
+ def sanitize_value(key: str, value: Any) -> Any:
+ if isinstance(key, str) and any(sensitive in key.lower() for sensitive in sensitive_keys):
+ return "***REDACTED***"
+ elif isinstance(value, str) and len(value) > 20:
+ if value.isalnum() and len(value) > 32:
+ return f"{value[:8]}***REDACTED***{value[-4:]}"
+ return value
+
+ def recursive_sanitize(obj):
+ if isinstance(obj, dict):
+ return {k: recursive_sanitize(sanitize_value(k, v)) for k, v in obj.items()}
+ elif isinstance(obj, list):
+ return [recursive_sanitize(item) for item in obj]
+ else:
+ return obj
+
+ return recursive_sanitize(log_data)
+
+
+def setup_logging(level: str = "INFO", structured: bool = True, verbose: bool = True) -> None:
+ """
+    Set up logging configuration for the SDK
+
+ Args:
+ level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+ structured: Whether to use structured JSON logging
+ verbose: Whether to show verbose logging (default: True)
+ When False, only WARNING and above are shown
+ When True, uses the specified level
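+
+    Example (illustrative):
+
+        setup_logging(level="DEBUG", structured=False)
+        get_logger("client").info("SDK initialized")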
+ """
+ if not verbose:
+ log_level = logging.WARNING
+ else:
+ log_level = getattr(logging, level.upper(), logging.INFO)
+
+ root_logger = logging.getLogger('brightdata')
+ root_logger.handlers.clear()
+
+ handler = logging.StreamHandler()
+ handler.setLevel(log_level)
+
+ if structured:
+ formatter = StructuredFormatter()
+ else:
+ formatter = logging.Formatter(
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+
+ handler.setFormatter(formatter)
+ root_logger.addHandler(handler)
+ root_logger.setLevel(log_level)
+
+ root_logger.propagate = False
+
+
+def get_logger(name: str) -> logging.Logger:
+ """
+ Get a logger instance with the specified name
+
+ Args:
+ name: Logger name
+
+ Returns:
+ Configured logger instance
+ """
+ return logging.getLogger(f'brightdata.{name}')
+
+
+def log_request(logger: logging.Logger, method: str, url: str,
+ status_code: int = None, response_time: float = None,
+ correlation_id: str = None) -> None:
+ """
+ Log HTTP request details
+
+ Args:
+ logger: Logger instance
+ method: HTTP method
+ url: Request URL (will be sanitized)
+ status_code: HTTP response status code
+ response_time: Response time in milliseconds
+ correlation_id: Request correlation ID
+ """
+ extra = {
+ 'method': method,
+ 'url': _sanitize_url(url),
+ 'correlation_id': correlation_id or str(uuid.uuid4())
+ }
+
+ if status_code is not None:
+ extra['status_code'] = status_code
+ if response_time is not None:
+ extra['response_time'] = response_time
+
+ if status_code and status_code >= 400:
+ logger.error(f"HTTP request failed: {method} {_sanitize_url(url)}", extra=extra)
+ else:
+ logger.info(f"HTTP request: {method} {_sanitize_url(url)}", extra=extra)
+
+
+def _sanitize_url(url: str) -> str:
+ """Sanitize URL to remove sensitive query parameters"""
+ try:
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
+
+ parsed = urlparse(url)
+ query_params = parse_qs(parsed.query)
+
+ sensitive_params = ['token', 'api_key', 'secret', 'password']
+ for param in sensitive_params:
+ if param in query_params:
+ query_params[param] = ['***REDACTED***']
+
+ sanitized_query = urlencode(query_params, doseq=True)
+ sanitized = urlunparse((
+ parsed.scheme, parsed.netloc, parsed.path,
+ parsed.params, sanitized_query, parsed.fragment
+ ))
+
+ return sanitized
+ except Exception:
+ return url.split('?')[0] + ('?***PARAMS_REDACTED***' if '?' in url else '')
\ No newline at end of file
diff --git a/src/utils/parser.py b/src/utils/parser.py
new file mode 100644
index 0000000..686ad39
--- /dev/null
+++ b/src/utils/parser.py
@@ -0,0 +1,264 @@
+"""
+Content parsing utilities for Bright Data SDK responses
+
+Provides functions to extract and parse content from scraping and search results.
+"""
+import json
+import re
+from typing import Any, Dict, List, Union, Optional
+
+from bs4 import BeautifulSoup
+
+
+def parse_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+ """
+ Parse content from Bright Data API responses
+
+ Automatically detects and handles both single and multiple results from scrape/search operations.
+ Can be used as a standalone function or called from the client.
+
+ Args:
+ data: Response data from scrape() or search() - can be JSON dict/list or HTML string
+ extract_text: Extract clean text content (default: True)
+ extract_links: Extract all links from content (default: False)
+ extract_images: Extract image URLs from content (default: False)
+
+ Returns:
+ Dict containing parsed content for single results, or List[Dict] for multiple results with keys:
+ - 'type': 'json' or 'html'
+ - 'text': Cleaned text content (if extract_text=True)
+ - 'links': List of extracted links (if extract_links=True)
+ - 'images': List of image URLs (if extract_images=True)
+ - 'title': Page title (if available)
+ - 'raw_length': Length of original content
+ - 'structured_data': Original JSON data (if type='json')
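+
+    Example (illustrative):
+
+        parsed = parse_content("<html><head><title>Demo</title></head><body>Hello</body></html>")
+        print(parsed["type"], parsed["title"])  # -> html Demo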
+ """
+ if _is_multiple_results(data):
+ return parse_multiple(data, extract_text=extract_text, extract_links=extract_links, extract_images=extract_images)
+
+ return _parse_single_content(data, extract_text, extract_links, extract_images)
+
+
+def parse_multiple(data_list: List[Union[str, Dict]], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> List[Dict[str, Any]]:
+ """
+ Parse multiple content items (useful for batch scraping results)
+
+ Args:
+ data_list: List of response data items
+ extract_text: Extract clean text content (default: True)
+ extract_links: Extract all links from content (default: False)
+ extract_images: Extract image URLs from content (default: False)
+
+ Returns:
+ List of parsed content dictionaries
+ """
+ if not isinstance(data_list, list):
+ return []
+
+ return [_parse_single_content(item, extract_text, extract_links, extract_images) for item in data_list]
+
+
+def _is_multiple_results(data: Union[str, Dict, List]) -> bool:
+ """
+ Detect if data contains multiple scraping/search results
+
+ Args:
+ data: Response data to analyze
+
+ Returns:
+ True if data appears to be multiple results, False otherwise
+ """
+ if not isinstance(data, list):
+ return False
+
+ if len(data) <= 1:
+ return False
+
+ multiple_result_indicators = 0
+
+ for item in data[:3]:
+ if isinstance(item, dict):
+ common_keys = {'html', 'body', 'content', 'page_html', 'raw_html', 'url', 'status_code'}
+ if any(key in item for key in common_keys):
+ multiple_result_indicators += 1
+ elif isinstance(item, str) and len(item) > 100:
+            if '<html' in item.lower() or '<!doctype' in item.lower():
+                multiple_result_indicators += 1
+
+    return multiple_result_indicators >= 2
+
+
+def _parse_single_content(data: Union[str, Dict, List], extract_text: bool = True, extract_links: bool = False, extract_images: bool = False) -> Dict[str, Any]:
+ """
+ Parse single content item from Bright Data API responses
+
+ Args:
+ data: Single response data item - can be JSON dict or HTML string
+ extract_text: Extract clean text content (default: True)
+ extract_links: Extract all links from content (default: False)
+ extract_images: Extract image URLs from content (default: False)
+
+ Returns:
+ Dict containing parsed content
+ """
+ result = {
+ 'type': None,
+ 'raw_length': 0,
+ 'title': None
+ }
+
+ if data is None:
+ return result
+
+ if isinstance(data, (dict, list)):
+ result['type'] = 'json'
+ result['structured_data'] = data
+ result['raw_length'] = len(str(data))
+
+ html_content = _extract_html_from_json(data)
+ if html_content and (extract_text or extract_links or extract_images):
+ _parse_html_content(html_content, result, extract_text, extract_links, extract_images)
+
+ result['title'] = _extract_title_from_json(data)
+
+ elif isinstance(data, str):
+ result['type'] = 'html'
+ result['raw_length'] = len(data)
+
+ if extract_text or extract_links or extract_images:
+ _parse_html_content(data, result, extract_text, extract_links, extract_images)
+
+ return result
+
+
+def extract_structured_data(data: Union[str, Dict, List]) -> Optional[Dict]:
+ """
+ Extract structured data (JSON-LD, microdata) from content
+
+ Args:
+ data: Response data
+
+ Returns:
+ Structured data if found, None otherwise
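+
+    Example (illustrative):
+
+        html = '<script type="application/ld+json">{"@type": "Product"}</script>'
+        extract_structured_data(html)  # -> {'json_ld': [{'@type': 'Product'}]}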
+ """
+ html_content = None
+
+ if isinstance(data, str):
+ html_content = data
+ elif isinstance(data, (dict, list)):
+ html_content = _extract_html_from_json(data)
+
+ if not html_content:
+ return None
+
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+
+ scripts = soup.find_all('script', type='application/ld+json')
+ if scripts:
+ structured_data = []
+ for script in scripts:
+ try:
+ data = json.loads(script.string)
+ structured_data.append(data)
+ except json.JSONDecodeError:
+ continue
+ if structured_data:
+ return {'json_ld': structured_data}
+
+ except Exception:
+ pass
+
+ return None
+
+
+def _extract_html_from_json(data: Union[Dict, List]) -> Optional[str]:
+ """Extract HTML content from JSON response structure"""
+ if isinstance(data, dict):
+ html_keys = ['html', 'body', 'content', 'page_html', 'raw_html']
+ for key in html_keys:
+ if key in data and isinstance(data[key], str):
+ return data[key]
+
+ for value in data.values():
+ if isinstance(value, (dict, list)):
+ html = _extract_html_from_json(value)
+ if html:
+ return html
+
+ elif isinstance(data, list):
+ for item in data:
+ if isinstance(item, (dict, list)):
+ html = _extract_html_from_json(item)
+ if html:
+ return html
+
+ return None
+
+
+def _extract_title_from_json(data: Union[Dict, List]) -> Optional[str]:
+ """Extract title from JSON response structure"""
+ if isinstance(data, dict):
+ title_keys = ['title', 'page_title', 'name']
+ for key in title_keys:
+ if key in data and isinstance(data[key], str):
+ return data[key].strip()
+
+ for value in data.values():
+ if isinstance(value, (dict, list)):
+ title = _extract_title_from_json(value)
+ if title:
+ return title
+
+ elif isinstance(data, list):
+ for item in data:
+ if isinstance(item, (dict, list)):
+ title = _extract_title_from_json(item)
+ if title:
+ return title
+
+ return None
+
+
+def _parse_html_content(html: str, result: Dict, extract_text: bool, extract_links: bool, extract_images: bool):
+ """Parse HTML content and update result dictionary"""
+ try:
+ soup = BeautifulSoup(html, 'html.parser')
+
+ if not result.get('title'):
+ title_tag = soup.find('title')
+ if title_tag:
+ result['title'] = title_tag.get_text().strip()
+
+ if extract_text:
+ for script in soup(["script", "style"]):
+ script.decompose()
+
+ text = soup.get_text()
+ lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))  # split on double spaces, not every space
+ result['text'] = '\n'.join(chunk for chunk in chunks if chunk)
+
+ if extract_links:
+ links = []
+ for a_tag in soup.find_all('a', href=True):
+ href = a_tag['href']
+ text = a_tag.get_text().strip()
+ links.append({'url': href, 'text': text})
+ result['links'] = links
+
+ if extract_images:
+ images = []
+ for img_tag in soup.find_all('img', src=True):
+ src = img_tag['src']
+ alt = img_tag.get('alt', '').strip()
+ images.append({'url': src, 'alt': alt})
+ result['images'] = images
+
+ except Exception as e:
+ if extract_text:
+ result['text'] = f"HTML parsing failed: {str(e)}"
+ if extract_links:
+ result['links'] = []
+ if extract_images:
+ result['images'] = []
\ No newline at end of file
diff --git a/src/utils/response_validator.py b/src/utils/response_validator.py
new file mode 100644
index 0000000..83a9aa7
--- /dev/null
+++ b/src/utils/response_validator.py
@@ -0,0 +1,49 @@
+"""
+Minimal response validation utilities for Bright Data SDK
+"""
+import json
+from typing import Any, Dict, Union
+from ..exceptions import ValidationError
+
+
+def safe_json_parse(response_text: str) -> Union[Dict[str, Any], str]:
+ """
+ Safely parse JSON response with minimal validation
+
+ Args:
+ response_text: Raw response text from API
+
+ Returns:
+ Parsed JSON data or original text if parsing fails
+ """
+ if not response_text:
+ return {}
+
+ try:
+ return json.loads(response_text)
+ except (json.JSONDecodeError, TypeError):
+ # Return original text if JSON parsing fails
+ return response_text
+
+
+def validate_response_size(response_text: str, max_size_mb: float = 100.0) -> None:
+ """
+ Quick size check to prevent memory issues
+
+ Args:
+ response_text: Response text to validate
+ max_size_mb: Maximum allowed size in megabytes
+ """
+ if response_text and len(response_text) > (max_size_mb * 1024 * 1024):
+ raise ValidationError(f"Response too large (>{max_size_mb}MB)")
+
+
+def check_response_not_empty(data: Any) -> None:
+ """
+ Minimal check that response contains data
+
+ Args:
+ data: Response data to check
+ """
+ if data is None or (isinstance(data, str) and len(data.strip()) == 0):
+ raise ValidationError("Empty response received")
\ No newline at end of file
diff --git a/src/utils/retry.py b/src/utils/retry.py
new file mode 100644
index 0000000..361645a
--- /dev/null
+++ b/src/utils/retry.py
@@ -0,0 +1,90 @@
+import time
+import random
+import requests
+from functools import wraps
+from ..exceptions import NetworkError, APIError
+
+
+def retry_request(max_retries=3, backoff_factor=1.5, retry_statuses=None, max_backoff=60):
+ """
+ Decorator for retrying requests with exponential backoff and jitter
+
+ Args:
+ max_retries: Maximum number of retry attempts
+ backoff_factor: Exponential backoff multiplier
+ retry_statuses: HTTP status codes that should trigger retries
+ max_backoff: Maximum backoff time in seconds
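+
+    Example (illustrative sketch):
+
+        @retry_request(max_retries=2, backoff_factor=1.0)
+        def fetch(session, url):
+            return session.get(url, timeout=30)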
+ """
+ if retry_statuses is None:
+ retry_statuses = {429, 500, 502, 503, 504}
+
+ def decorator(func):
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ last_exception = None
+
+ for attempt in range(max_retries + 1): # +1 to include initial attempt
+ try:
+ response = func(*args, **kwargs)
+
+ # Check if we should retry based on status code
+ if hasattr(response, 'status_code') and response.status_code in retry_statuses:
+ if attempt >= max_retries:
+ raise APIError(
+ f"Server error after {max_retries} retries: HTTP {response.status_code}",
+ status_code=response.status_code,
+ response_text=getattr(response, 'text', '')
+ )
+
+ # Calculate backoff with jitter
+ backoff_time = min(backoff_factor ** attempt, max_backoff)
+ jitter = backoff_time * 0.1 * random.random() # Add up to 10% jitter
+ total_delay = backoff_time + jitter
+
+ time.sleep(total_delay)
+ continue
+
+ return response
+
+ except requests.exceptions.ConnectTimeout as e:
+ last_exception = NetworkError(f"Connection timeout: {str(e)}")
+ except requests.exceptions.ReadTimeout as e:
+ last_exception = NetworkError(f"Read timeout: {str(e)}")
+ except requests.exceptions.Timeout as e:
+ last_exception = NetworkError(f"Request timeout: {str(e)}")
+ except requests.exceptions.ConnectionError as e:
+ # Handle DNS resolution, connection refused, etc.
+ if "Name or service not known" in str(e):
+ last_exception = NetworkError(f"DNS resolution failed: {str(e)}")
+ elif "Connection refused" in str(e):
+ last_exception = NetworkError(f"Connection refused: {str(e)}")
+ else:
+ last_exception = NetworkError(f"Connection error: {str(e)}")
+ except requests.exceptions.SSLError as e:
+ last_exception = NetworkError(f"SSL/TLS error: {str(e)}")
+ except requests.exceptions.ProxyError as e:
+ last_exception = NetworkError(f"Proxy error: {str(e)}")
+ except requests.exceptions.RequestException as e:
+ last_exception = NetworkError(f"Network error: {str(e)}")
+ except Exception as e:
+ # Catch any other unexpected exceptions
+ last_exception = NetworkError(f"Unexpected error: {str(e)}")
+
+ # If this was the last attempt, raise the exception
+ if attempt >= max_retries:
+ raise last_exception
+
+ # Calculate backoff with jitter for network errors
+ backoff_time = min(backoff_factor ** attempt, max_backoff)
+ jitter = backoff_time * 0.1 * random.random()
+ total_delay = backoff_time + jitter
+
+ time.sleep(total_delay)
+
+ # This should never be reached, but just in case
+ if last_exception:
+ raise last_exception
+ return None
+
+ return wrapper
+ return decorator
\ No newline at end of file
diff --git a/src/utils/tst b/src/utils/tst
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/utils/tst
@@ -0,0 +1 @@
+
diff --git a/src/utils/validation.py b/src/utils/validation.py
new file mode 100644
index 0000000..938cb43
--- /dev/null
+++ b/src/utils/validation.py
@@ -0,0 +1,183 @@
+from urllib.parse import urlparse
+from typing import Union, List
+from ..exceptions import ValidationError
+
+
+def validate_url(url: str) -> None:
+ """Validate URL format with comprehensive checks"""
+ if not isinstance(url, str):
+ raise ValidationError(f"URL must be a string, got {type(url).__name__}")
+
+ if not url.strip():
+ raise ValidationError("URL cannot be empty or whitespace")
+
+ # Check URL length
+ if len(url) > 8192: # Common URL length limit
+ raise ValidationError("URL exceeds maximum length of 8192 characters")
+
+ try:
+ parsed = urlparse(url.strip())
+ if not parsed.scheme:
+ raise ValidationError(f"URL must include a scheme (http/https): {url}")
+ if parsed.scheme.lower() not in ['http', 'https']:
+ raise ValidationError(f"URL scheme must be http or https, got: {parsed.scheme}")
+ if not parsed.netloc:
+ raise ValidationError(f"URL must include a valid domain: {url}")
+ # Check for suspicious characters
+ if any(char in url for char in ['<', '>', '"', "'"]):
+ raise ValidationError("URL contains invalid characters")
+ except Exception as e:
+ if isinstance(e, ValidationError):
+ raise
+ raise ValidationError(f"Invalid URL format '{url}': {str(e)}")
+
+
+def validate_zone_name(zone: str = None) -> None:
+ """Validate zone name format with enhanced checks"""
+ if zone is None:
+ return # Zone can be None (optional parameter)
+
+ if not isinstance(zone, str):
+ raise ValidationError(f"Zone name must be a string, got {type(zone).__name__}")
+
+ zone = zone.strip()
+ if not zone:
+ raise ValidationError("Zone name cannot be empty or whitespace")
+
+ if len(zone) < 3:
+ raise ValidationError("Zone name must be at least 3 characters long")
+
+ if len(zone) > 63:
+ raise ValidationError("Zone name must not exceed 63 characters")
+
+ if not zone.replace('_', '').replace('-', '').isalnum():
+ raise ValidationError("Zone name can only contain letters, numbers, hyphens, and underscores")
+
+ if zone.startswith('-') or zone.endswith('-'):
+ raise ValidationError("Zone name cannot start or end with a hyphen")
+
+ if zone.startswith('_') or zone.endswith('_'):
+ raise ValidationError("Zone name cannot start or end with an underscore")
+
+
+def validate_country_code(country: str) -> None:
+ """Validate ISO country code format"""
+ if not isinstance(country, str):
+ raise ValidationError(f"Country code must be a string, got {type(country).__name__}")
+
+ country = country.strip().lower()
+ if len(country) == 0:
+ return
+
+ if len(country) != 2:
+ raise ValidationError("Country code must be exactly 2 characters (ISO 3166-1 alpha-2) or empty")
+
+ if not country.isalpha():
+ raise ValidationError("Country code must contain only letters")
+
+
+def validate_timeout(timeout: int) -> None:
+ """Validate timeout value"""
+ if timeout is None:
+ return # Timeout can be None (use default)
+
+ if not isinstance(timeout, int):
+ raise ValidationError(f"Timeout must be an integer, got {type(timeout).__name__}")
+
+ if timeout <= 0:
+ raise ValidationError("Timeout must be greater than 0 seconds")
+
+ if timeout > 300: # 5 minutes max
+ raise ValidationError("Timeout cannot exceed 300 seconds (5 minutes)")
+
+
+def validate_max_workers(max_workers: int) -> None:
+ """Validate max_workers parameter"""
+ if max_workers is None:
+ return # Can be None (use default)
+
+ if not isinstance(max_workers, int):
+ raise ValidationError(f"max_workers must be an integer, got {type(max_workers).__name__}")
+
+ if max_workers <= 0:
+ raise ValidationError("max_workers must be greater than 0")
+
+ if max_workers > 50: # Reasonable upper limit
+ raise ValidationError("max_workers cannot exceed 50 (to prevent resource exhaustion)")
+
+
+def validate_url_list(urls: List[str], max_urls: int = 100) -> None:
+ """Validate list of URLs with size limits"""
+ if not isinstance(urls, list):
+ raise ValidationError(f"URL list must be a list, got {type(urls).__name__}")
+
+ if len(urls) == 0:
+ raise ValidationError("URL list cannot be empty")
+
+ if len(urls) > max_urls:
+ raise ValidationError(f"URL list cannot contain more than {max_urls} URLs")
+
+ for i, url in enumerate(urls):
+ try:
+ validate_url(url)
+ except ValidationError as e:
+ raise ValidationError(f"Invalid URL at index {i}: {str(e)}")
+
+
+def validate_search_engine(search_engine: str) -> None:
+ """Validate search engine parameter"""
+ if not isinstance(search_engine, str):
+ raise ValidationError(f"Search engine must be a string, got {type(search_engine).__name__}")
+
+ valid_engines = ['google', 'bing', 'yandex']
+ search_engine = search_engine.strip().lower()
+
+ if search_engine not in valid_engines:
+ raise ValidationError(f"Invalid search engine '{search_engine}'. Valid options: {', '.join(valid_engines)}")
+
+
+def validate_query(query: Union[str, List[str]]) -> None:
+ """Validate search query parameter"""
+ if isinstance(query, str):
+ if not query.strip():
+ raise ValidationError("Search query cannot be empty or whitespace")
+ if len(query) > 2048:
+ raise ValidationError("Search query cannot exceed 2048 characters")
+ elif isinstance(query, list):
+ if len(query) == 0:
+ raise ValidationError("Query list cannot be empty")
+ if len(query) > 50: # Reasonable limit
+ raise ValidationError("Query list cannot contain more than 50 queries")
+ for i, q in enumerate(query):
+ if not isinstance(q, str):
+ raise ValidationError(f"Query at index {i} must be a string, got {type(q).__name__}")
+ if not q.strip():
+ raise ValidationError(f"Query at index {i} cannot be empty or whitespace")
+ if len(q) > 2048:
+ raise ValidationError(f"Query at index {i} cannot exceed 2048 characters")
+ else:
+ raise ValidationError(f"Query must be a string or list of strings, got {type(query).__name__}")
+
+
+def validate_response_format(response_format: str) -> None:
+ """Validate response format parameter"""
+ if not isinstance(response_format, str):
+ raise ValidationError(f"Response format must be a string, got {type(response_format).__name__}")
+
+ valid_formats = ['json', 'raw']
+ response_format = response_format.strip().lower()
+
+ if response_format not in valid_formats:
+ raise ValidationError(f"Invalid response format '{response_format}'. Valid options: {', '.join(valid_formats)}")
+
+
+def validate_http_method(method: str) -> None:
+ """Validate HTTP method parameter"""
+ if not isinstance(method, str):
+ raise ValidationError(f"HTTP method must be a string, got {type(method).__name__}")
+
+ valid_methods = ['GET', 'POST', 'PUT', 'DELETE', 'PATCH']
+ method = method.strip().upper()
+
+ if method not in valid_methods:
+ raise ValidationError(f"Invalid HTTP method '{method}'. Valid options: {', '.join(valid_methods)}")
\ No newline at end of file
diff --git a/src/utils/zone_manager.py b/src/utils/zone_manager.py
new file mode 100644
index 0000000..82a1205
--- /dev/null
+++ b/src/utils/zone_manager.py
@@ -0,0 +1,174 @@
+import requests
+import json
+import logging
+import time
+from ..exceptions import ZoneError, NetworkError, APIError
+from .retry import retry_request
+
+logger = logging.getLogger(__name__)
+
+
+class ZoneManager:
+ """Manages Bright Data zones - creation and validation"""
+
+ def __init__(self, session: requests.Session):
+ self.session = session
+
+ def ensure_required_zones(self, web_unlocker_zone: str, serp_zone: str):
+ """
+ Check if required zones exist and create them if they don't.
+ Raises exceptions on failure instead of silently continuing.
+ """
+ try:
+ logger.info("Checking existing zones...")
+ zones = self._get_zones_with_retry()
+ zone_names = {zone.get('name') for zone in zones}
+ logger.info(f"Found {len(zones)} existing zones")
+
+ zones_to_create = []
+ if web_unlocker_zone not in zone_names:
+ zones_to_create.append((web_unlocker_zone, 'unblocker'))
+ logger.info(f"Need to create web unlocker zone: {web_unlocker_zone}")
+
+ if serp_zone not in zone_names:
+ zones_to_create.append((serp_zone, 'serp'))
+ logger.info(f"Need to create SERP zone: {serp_zone}")
+
+ if not zones_to_create:
+ logger.info("All required zones already exist")
+ return
+
+ for zone_name, zone_type in zones_to_create:
+ logger.info(f"Creating zone: {zone_name} (type: {zone_type})")
+ self._create_zone_with_retry(zone_name, zone_type)
+ logger.info(f"Successfully created zone: {zone_name}")
+
+ self._verify_zones_created([zone[0] for zone in zones_to_create])
+
+ except (ZoneError, NetworkError, APIError):
+ raise
+ except requests.exceptions.RequestException as e:
+ logger.error(f"Network error while ensuring zones exist: {e}")
+ raise NetworkError(f"Failed to ensure zones due to network error: {str(e)}")
+ except json.JSONDecodeError as e:
+ logger.error(f"Invalid JSON response while checking zones: {e}")
+ raise ZoneError(f"Invalid response format from zones API: {str(e)}")
+ except Exception as e:
+ logger.error(f"Unexpected error while ensuring zones exist: {e}")
+ raise ZoneError(f"Unexpected error during zone creation: {str(e)}")
+
+ @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504})
+ def _get_zones_with_retry(self):
+ """Get zones list with retry logic for network issues"""
+ response = self.session.get('https://api.brightdata.com/zone/get_active_zones')
+
+ if response.status_code == 200:
+ try:
+ return response.json() or []
+ except json.JSONDecodeError as e:
+ raise ZoneError(f"Invalid JSON response from zones API: {str(e)}")
+ elif response.status_code == 401:
+ raise ZoneError("Unauthorized (401): Check your API token and ensure it has proper permissions")
+ elif response.status_code == 403:
+ raise ZoneError("Forbidden (403): API token lacks sufficient permissions for zone operations")
+ else:
+ raise ZoneError(f"Failed to list zones ({response.status_code}): {response.text}")
+
+ @retry_request(max_retries=3, backoff_factor=1.5, retry_statuses={429, 500, 502, 503, 504})
+ def _create_zone_with_retry(self, zone_name: str, zone_type: str):
+ """
+ Create a new zone in Bright Data with retry logic
+
+ Args:
+ zone_name: Name for the new zone
+ zone_type: Type of zone ('unblocker' or 'serp')
+ """
+ if zone_type == "serp":
+ plan_config = {
+ "type": "unblocker",
+ "serp": True
+ }
+ else:
+ plan_config = {
+ "type": zone_type
+ }
+
+ payload = {
+ "plan": plan_config,
+ "zone": {
+ "name": zone_name,
+ "type": zone_type
+ }
+ }
+
+ response = self.session.post(
+ 'https://api.brightdata.com/zone',
+ json=payload
+ )
+
+ if response.status_code in [200, 201]:
+ logger.info(f"Zone creation successful: {zone_name}")
+ return response
+ elif response.status_code == 409 or "Duplicate zone name" in response.text or "already exists" in response.text.lower():
+ logger.info(f"Zone {zone_name} already exists - this is expected")
+ return response
+ elif response.status_code == 401:
+ raise ZoneError(f"Unauthorized (401): API token invalid or lacks permissions to create zone '{zone_name}'")
+ elif response.status_code == 403:
+ raise ZoneError(f"Forbidden (403): API token lacks permissions to create zone '{zone_name}'. Note: sdk_unlocker and sdk_serp zones should be allowed for all permissions.")
+ elif response.status_code == 400:
+ raise ZoneError(f"Bad request (400) creating zone '{zone_name}': {response.text}")
+ else:
+ raise ZoneError(f"Failed to create zone '{zone_name}' ({response.status_code}): {response.text}")
+
+ def _verify_zones_created(self, zone_names: list):
+ """
+ Verify that zones were successfully created by checking the zones list
+ """
+ max_attempts = 3
+ for attempt in range(max_attempts):
+ try:
+ logger.info(f"Verifying zone creation (attempt {attempt + 1}/{max_attempts})")
+ time.sleep(1)
+
+ zones = self._get_zones_with_retry()
+ existing_zone_names = {zone.get('name') for zone in zones}
+
+ missing_zones = [name for name in zone_names if name not in existing_zone_names]
+
+ if not missing_zones:
+ logger.info("All zones verified successfully")
+ return
+
+ if attempt == max_attempts - 1:
+ raise ZoneError(f"Zone verification failed: zones {missing_zones} not found after creation")
+
+ logger.warning(f"Zones not yet visible: {missing_zones}. Retrying verification...")
+
+ except (ZoneError, NetworkError):
+ if attempt == max_attempts - 1:
+ raise
+ logger.warning(f"Zone verification attempt {attempt + 1} failed, retrying...")
+ time.sleep(2 ** attempt)
+
+ def _create_zone(self, zone_name: str, zone_type: str):
+ """
+ Legacy method - kept for backward compatibility
+ Use _create_zone_with_retry instead for new code
+ """
+ return self._create_zone_with_retry(zone_name, zone_type)
+
+ def list_zones(self):
+ """
+ List all active zones in your Bright Data account
+
+ Returns:
+ List of zone dictionaries with their configurations
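+
+        Example (illustrative; `session` is an authenticated requests.Session):
+
+            zm = ZoneManager(session)
+            zone_names = [zone.get("name") for zone in zm.list_zones()]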
+ """
+ try:
+ return self._get_zones_with_retry()
+ except (ZoneError, NetworkError):
+ raise
+ except Exception as e:
+ logger.error(f"Unexpected error listing zones: {e}")
+ raise ZoneError(f"Unexpected error while listing zones: {str(e)}")
\ No newline at end of file
diff --git a/tests/test_client.py b/tests/test_client.py
index 51b1315..6dbc285 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -15,11 +15,18 @@
- Testing validation logic and error messages
"""
 import pytest
 import os
 from unittest.mock import patch
 from brightdata import bdclient
from brightdata.exceptions import ValidationError
@@ -116,6 +123,149 @@ def mock_post(*args, **kwargs):
request_data = captured_request.get('json', {})
assert "&brd_json=1" in request_data["url"]
 if __name__ == "__main__":
-    pytest.main([__file__])
\ No newline at end of file
+    pytest.main([__file__])
+
+
+class TestClientSearchGPT:
+ """Tests for the client.search_gpt() function"""
+
+ @pytest.fixture
+ @patch('brightdata.utils.zone_manager.ZoneManager.ensure_required_zones')
+ def client(self, mock_zones):
+ """Create a test client with mocked validation"""
+ with patch.dict(os.environ, {}, clear=True):
+ client = bdclient(api_token="valid_test_token_12345678", auto_create_zones=False)
+ return client
+
+ # VALIDATION TESTS
+
+ def test_prompt_required(self, client):
+ """Ensure ValidationError is raised when prompt is missing"""
+ with pytest.raises(ValidationError, match="prompt is required"):
+ client.search_gpt(prompt=None)
+
+ def test_invalid_country_format(self, client):
+ """Reject invalid country codes"""
+ with pytest.raises(ValidationError, match="must be 2-letter code"):
+ client.search_gpt(prompt="hi", country="USA")
+
+ def test_websearch_bool_validation(self, client):
+ """Reject invalid webSearch parameter type"""
+ with pytest.raises(ValidationError, match="must be a boolean or list of booleans"):
+ client.search_gpt(prompt="hi", webSearch="yes")
+
+ # PARAMETER NORMALIZATION
+
+ def test_normalizes_single_values_to_list(self, client, monkeypatch):
+ """Convert single parameters to list form"""
+ # Mock the session.post to intercept and provide dummy response
+ def dummy_post(url, json=None, timeout=None):
+ from unittest.mock import Mock
+ r = Mock()
+ r.status_code = 200
+ # Build dummy response with normalized lists
+ if isinstance(json, list):
+ prompts = [item.get('prompt', '') for item in json]
+ countries = [item.get('country', '') for item in json]
+ sec_prompts = [item.get('additional_prompt', '') for item in json]
+ web_searches = [item.get('web_search', False) for item in json]
+ else:
+ prompts = [json.get('prompt', '')]
+ countries = [json.get('country', '')]
+ sec_prompts = [json.get('additional_prompt', '')]
+ web_searches = [json.get('web_search', False)]
+ r.json.return_value = {
+ 'prompt': prompts,
+ 'country': countries,
+ 'secondaryPrompt': sec_prompts,
+ 'webSearch': web_searches
+ }
+ return r
+ monkeypatch.setattr(client.search_api.session, 'post', dummy_post)
+ result = client.search_gpt(
+ prompt="hello",
+ country="US",
+ secondaryPrompt="follow up",
+ webSearch=False,
+ sync=True
+ )
+ # The internal normalized payload should contain lists
+ assert isinstance(result["prompt"], list)
+ assert isinstance(result["country"], list)
+ assert isinstance(result["secondaryPrompt"], list)
+ assert isinstance(result["webSearch"], list)
+
+ # MOCKED API CALL TESTS
+
+ def test_sync_request_success(self, client, monkeypatch):
+ """Ensure sync request returns expected payload"""
+ mock_response = {"status": "ok", "data": "result"}
+ captured_payload = {}
+
+ def mock_post(url, json=None, timeout=None):
+ captured_payload.update(json or {})
+ from unittest.mock import Mock
+ r = Mock()
+ r.status_code = 200
+ r.json.return_value = mock_response
+ return r
+
+ monkeypatch.setattr(client.search_api.session, "post", mock_post)
+
+ response = client.search_gpt(prompt="Hello", country="US", sync=True)
+ assert response == mock_response
+ assert captured_payload["url"] == "https://chatgpt.com"
+ assert "prompt" in captured_payload
+ assert "country" in captured_payload
+
+ def test_async_request_timeout(self, client, monkeypatch):
+ """Ensure async mode uses correct timeout"""
+ captured_args = {}
+
+ def mock_post(url, json=None, timeout=None):
+ captured_args["timeout"] = timeout
+ from unittest.mock import Mock
+ r = Mock()
+ r.status_code = 200
+ r.json.return_value = {"id": "s_testid"}
+ return r
+
+ monkeypatch.setattr(client.search_api.session, "post", mock_post)
+
+ client.search_gpt(prompt="Async test", sync=False)
+ assert captured_args["timeout"] == 30 # default async timeout
+
+ # ERROR AND RETRY HANDLING
+
+ def test_retry_on_failure(self, client, monkeypatch):
+ """Test that request is retried on temporary failure"""
+ call_count = {"n": 0}
+
+ def mock_post(url, json=None, timeout=None):
+ call_count["n"] += 1
+ from unittest.mock import Mock
+ r = Mock()
+ r.status_code = 500 if call_count["n"] == 1 else 200
+ r.json.return_value = {"ok": True}
+ return r
+
+ monkeypatch.setattr(client.search_api.session, "post", mock_post)
+ result = client.search_gpt(prompt="retry", sync=True)
+ assert result["ok"] is True
+ assert call_count["n"] == 2 # retried once
+
+ def test_raises_error_after_max_retries(self, client, monkeypatch):
+ """Ensure error is raised after exceeding retries"""
+ def mock_post(url, json=None, timeout=None):
+ from unittest.mock import Mock
+ r = Mock()
+ r.status_code = 500
+ r.json.return_value = {"error": "server error"}
+ return r
+
+ monkeypatch.setattr(client.search_api.session, "post", mock_post)
+ with pytest.raises(RuntimeError, match="Failed after retries"):
+ client.search_gpt(prompt="fail test", sync=True)