Skip to content

failed and reached maximum retries, Assuming the session is blocked based on HTTP status code 403 #1581

@JiuZero

Description

@JiuZero

Is there a way to suppress this kind of error output (the logged tracebacks shown below) when a request fails?

[crawlee.crawlers._playwright._playwright_crawler] ERROR Request to http://xxx.com/en/product_detail.asp&id=36?bigid=2 failed and reached maximum retries
 Traceback (most recent call last):
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_context_pipeline.py", line 100, in __call__
    result = await middleware_instance.action()
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_context_pipeline.py", line 40, in action
    self.output_context = await self.generator.__anext__()
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_playwright\_playwright_crawler.py", line 431, in _handle_status_code_response
    self._raise_for_error_status_code(status_code)
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_basic_crawler.py", line 1519, in _raise_for_error_status_code
    raise HttpClientStatusCodeError('Client error status code returned', status_code)
crawlee.errors.HttpClientStatusCodeError: Client error status code returned (status code: 404).

[BeautifulSoupCrawler] ERROR Request to http://ibmabb.com failed and reached maximum retries
 Traceback (most recent call last):
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_basic_crawler.py", line 1399, in __run_task_function
    await self._run_request_handler(context=context)
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_basic_crawler.py", line 1494, in _run_request_handler
    await wait_for(
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\_utils\wait.py", line 37, in wait_for
    return await asyncio.wait_for(operation(), timeout.total_seconds())
  File "D:\Base\apps\python310\current\lib\asyncio\tasks.py", line 445, in wait_for
    return fut.result()
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_context_pipeline.py", line 100, in __call__
    result = await middleware_instance.action()
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_context_pipeline.py", line 40, in action
    self.output_context = await self.generator.__anext__()
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_abstract_http\_abstract_http_crawler.py", line 246, in _handle_status_code_response
    self._raise_for_session_blocked_status_code(context.session, status_code)
  File "D:\Base\apps\python310\current\lib\site-packages\crawlee\crawlers\_basic\_basic_crawler.py", line 1538, in _raise_for_session_blocked_status_code
    raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')
crawlee.errors.SessionError: Assuming the session is blocked based on HTTP status code 403

My implementation code is as follows:

from typing import Optional, Callable, List, Dict, Any
from dataclasses import dataclass
from enum import Enum
import asyncio
import json
import os
import random
from pathlib import Path
from datetime import timedelta

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    PlaywrightCrawler,
)
from crawlee.storages import Dataset
from crawlee.http_clients import HttpxHttpClient


class CrawlerMode(Enum):
    """Crawler mode: selects which crawler backend is used."""
    # Static-HTML mode; UniversalCrawler builds a BeautifulSoupCrawler for it.
    BEAUTIFULSOUP = "beautifulsoup"
    # Browser mode; UniversalCrawler builds a PlaywrightCrawler for it.
    PLAYWRIGHT = "playwright"


@dataclass
class CrawlerConfig:
    """Crawler configuration consumed by UniversalCrawler."""
    # URLs handed to the crawler's run() call.
    start_urls: List[str]
    # Which crawler backend to build (defaults to BeautifulSoup).
    mode: CrawlerMode = CrawlerMode.BEAUTIFULSOUP
    # Forwarded to the crawler as max_requests_per_crawl.
    max_requests_per_crawl: Optional[int] = 100
    # NOTE(review): not read by UniversalCrawler in the visible code.
    max_concurrency: int = 10
    # User callback; when set it is registered as the router's default handler.
    request_handler: Optional[Callable] = None

    # Playwright-specific settings
    headless: bool = True
    browser_type: str = "chromium"

    # Request settings
    # Proxy address; "http://" is prepended when no known scheme is present.
    proxy: Optional[str] = None
    # Used as the HTTP client timeout and, in seconds, as the fallback
    # request-handler timeout.
    timeout: int = 30
    # Fixed User-Agent; when unset a random one from USER_AGENTS is chosen.
    user_agent: Optional[str] = None

    # Anti-blocking settings (forwarded to the crawler constructor)
    retry_on_blocked: bool = True
    max_session_rotations: int = 5
    max_request_retries: int = 0

    # Advanced Crawlee settings (forwarded to the crawler constructor)
    use_session_pool: bool = True
    keep_alive: bool = False
    configure_logging: bool = True
    request_handler_timeout: Optional[timedelta] = None  # overrides `timeout` for the handler when set

    # Delay settings
    min_delay: float = 1.0  # minimum delay (seconds)
    max_delay: float = 3.0  # maximum delay (seconds)

    # Logging settings
    # NOTE(review): not read by UniversalCrawler in the visible code.
    verbose: bool = False

    # Output settings
    # Directory created on construction; export files are written here.
    output_dir: str = "./crawler_output"
    # When True, run() exports the dataset to JSON/text files after crawling.
    json_output: bool = True


class UniversalCrawler:
    """General-purpose wrapper that builds and runs a Crawlee crawler.

    Depending on ``config.mode`` it creates either a ``PlaywrightCrawler`` or a
    ``BeautifulSoupCrawler``, runs it over ``config.start_urls`` and optionally
    exports the default dataset to files in ``config.output_dir``.

    Attributes:
        config: The CrawlerConfig this instance was created with.
        crawler: The underlying crawler, or None until ``_setup_crawler`` ran.
        results: Initialised to an empty list; not populated by this class.
        last_error: The most recent exception captured by ``run``,
            ``run_sync`` or ``_export_results`` (None if nothing failed).
    """

    # Common desktop User-Agent strings; one is picked at random when the
    # config does not pin a specific User-Agent.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    ]

    # Proxy URL schemes that are passed through without adding "http://".
    _PROXY_SCHEMES = ('http://', 'https://', 'socks4://', 'socks5://')

    def __init__(self, config: "CrawlerConfig"):
        """Store the configuration and make sure the output directory exists."""
        self.config = config
        self.crawler = None
        self.results: List[Dict[str, Any]] = []
        self.last_error: Optional[Exception] = None
        self._setup_output_dir()

    def _setup_output_dir(self):
        """Create the output directory (and parents) if it is missing."""
        Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)

    def _get_random_user_agent(self) -> str:
        """Return the configured User-Agent, or a random one from USER_AGENTS."""
        if self.config.user_agent:
            return self.config.user_agent
        return random.choice(self.USER_AGENTS)

    def _get_proxy_config(self) -> Optional[str]:
        """Normalise the configured proxy into a URL.

        Returns:
            None when no proxy is configured (unset, empty or whitespace
            only); otherwise the proxy string with surrounding whitespace
            removed and ``http://`` prepended if it does not already start
            with one of the known schemes.
        """
        if not self.config.proxy:
            return None

        # Strip first so stray whitespace never ends up inside the URL.
        proxy_url = self.config.proxy.strip()
        if not proxy_url:
            return None

        # Compare case-insensitively so e.g. "HTTP://host" is not prefixed
        # a second time; the original spelling is returned unchanged.
        if not proxy_url.lower().startswith(self._PROXY_SCHEMES):
            proxy_url = f'http://{proxy_url}'

        return proxy_url

    def _get_additional_headers(self) -> Dict[str, str]:
        """Return the extra browser-like HTTP headers sent with requests."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Cache-Control': 'max-age=0',
        }

    async def _delay_between_requests(self):
        """Sleep for a random time between ``min_delay`` and ``max_delay`` seconds."""
        delay = random.uniform(self.config.min_delay, self.config.max_delay)
        await asyncio.sleep(delay)

    def _wrap_request_handler(self, handler: Callable) -> Callable:
        """Wrap ``handler`` so a random delay is awaited before each call.

        Args:
            handler: Async callable taking the crawling context.

        Returns:
            An async callable that sleeps, then awaits ``handler(context)``
            and returns its result.
        """
        async def wrapped_handler(context):
            # Delay first, then run the original handler.
            await self._delay_between_requests()
            return await handler(context)
        return wrapped_handler

    def _setup_crawler(self):
        """Create ``self.crawler`` according to the configuration.

        Builds the keyword arguments shared by both crawler types, then
        constructs either a PlaywrightCrawler or a BeautifulSoupCrawler and
        registers the (delay-wrapped) request handler if one is configured.
        """
        # Options shared by both crawler types.
        # NOTE(review): config.max_concurrency and config.verbose are not
        # forwarded to the crawler here.
        crawler_kwargs = {
            'max_requests_per_crawl': self.config.max_requests_per_crawl,
            # An explicit timedelta wins; otherwise fall back to `timeout` seconds.
            'request_handler_timeout': (
                self.config.request_handler_timeout
                or timedelta(seconds=self.config.timeout)
            ),
            'max_session_rotations': self.config.max_session_rotations,
            'max_request_retries': self.config.max_request_retries,
            'retry_on_blocked': self.config.retry_on_blocked,
            'keep_alive': self.config.keep_alive,
            'use_session_pool': self.config.use_session_pool,
            'configure_logging': self.config.configure_logging,
        }

        # Build the crawler that matches the configured mode.
        if self.config.mode == CrawlerMode.PLAYWRIGHT:
            # Playwright-specific options.
            playwright_options = {
                'headless': self.config.headless,
                'browser_type': self.config.browser_type,
            }

            # Proxy goes into the browser launch options.
            proxy_url = self._get_proxy_config()
            launch_options: Dict[str, Any] = {}
            if proxy_url:
                launch_options['proxy'] = {'server': proxy_url}

            # Additional browser launch flags.
            launch_options['args'] = [
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--disable-web-security',
                '--no-sandbox',
                '--disable-logging',  # disable browser logging
            ]
            playwright_options['browser_launch_options'] = launch_options

            # Browser context options, passed as browser_new_context_options.
            context_options = {
                'user_agent': self._get_random_user_agent(),
                'viewport': {'width': 1920, 'height': 1080},
                'locale': 'en-US',
                'timezone_id': 'America/New_York',
                'extra_http_headers': self._get_additional_headers(),
            }
            playwright_options['browser_new_context_options'] = context_options

            crawler_kwargs.update(playwright_options)
            self.crawler = PlaywrightCrawler(**crawler_kwargs)

        else:
            # BeautifulSoup mode: build the HTTP client first.
            http_client_kwargs: Dict[str, Any] = {
                'timeout': self.config.timeout,
                'follow_redirects': True,
            }

            # NOTE(review): 'proxies' is forwarded to HttpxHttpClient as-is;
            # confirm the installed crawlee/httpx versions accept this keyword.
            proxy_url = self._get_proxy_config()
            if proxy_url:
                http_client_kwargs['proxies'] = proxy_url

            # Custom headers, including the chosen User-Agent.
            headers = {
                'User-Agent': self._get_random_user_agent(),
                **self._get_additional_headers(),
            }
            http_client_kwargs['headers'] = headers
            http_client = HttpxHttpClient(**http_client_kwargs)
            crawler_kwargs['http_client'] = http_client

            self.crawler = BeautifulSoupCrawler(**crawler_kwargs)
        if self.config.request_handler:
            wrapped_handler = self._wrap_request_handler(self.config.request_handler)
            self.crawler.router.default_handler(wrapped_handler)

    async def run(self):
        """Run the crawler without raising to the caller.

        Any exception from crawler setup or from the crawl itself is stored
        in ``self.last_error``. When setup fails nothing is crawled, so the
        method returns without exporting. Otherwise, if ``json_output`` is
        enabled, the dataset is exported even when the crawl raised.
        """
        try:
            self._setup_crawler()
        except Exception as e:
            # Setup errors must honour the same no-raise contract as the crawl.
            self.last_error = e
            return

        try:
            await self.crawler.run(self.config.start_urls)
        except Exception as e:
            self.last_error = e

        # Export the collected data when JSON output is requested.
        if self.config.json_output:
            await self._export_results()

    async def _export_results(self) -> Optional[str]:
        """Export the default dataset to files in the output directory.

        Writes ``crawler_results.json`` (all items), ``crawler_urls.txt``
        (one ``url`` value per item) and ``crawler_stats.json`` (counts and
        file paths).

        Returns:
            The path of the JSON results file, or None if the export failed.
            On failure the exception is stored in ``self.last_error`` unless
            an earlier error is already recorded there.
        """
        try:
            dataset = await Dataset.open()
            data = await dataset.get_data()

            # Save all items as JSON.
            output_file = os.path.join(
                self.config.output_dir,
                'crawler_results.json'
            )

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(data.items, f, ensure_ascii=False, indent=2)

            # Save the list of URLs, one per line.
            txt_file = os.path.join(
                self.config.output_dir,
                'crawler_urls.txt'
            )
            with open(txt_file, 'w', encoding='utf-8') as f:
                for item in data.items:
                    f.write(f"{item.get('url', '')}\n")

            # Save summary statistics.
            stats_file = os.path.join(
                self.config.output_dir,
                'crawler_stats.json'
            )
            stats = {
                'total_items': len(data.items),
                'unique_urls': len(set(item.get('url', '') for item in data.items)),
                'output_file': output_file,
                'txt_file': txt_file,
            }
            with open(stats_file, 'w', encoding='utf-8') as f:
                json.dump(stats, f, ensure_ascii=False, indent=2)

            return output_file

        except Exception as e:
            # Best-effort export: never raise, but do not lose the reason.
            # An earlier (crawl) error is kept because it is the primary one.
            if self.last_error is None:
                self.last_error = e
            return None

    def run_sync(self):
        """Run ``run()`` to completion from synchronous code.

        Returns:
            None. Any exception raised while starting or driving the event
            loop is stored in ``self.last_error`` instead of propagating.
        """
        coro = self.run()
        try:
            return asyncio.run(coro)
        except Exception as e:
            # If asyncio.run refused to start (e.g. a loop is already
            # running), the coroutine was never awaited; close it explicitly.
            # Closing an already finished coroutine is a no-op.
            coro.close()
            self.last_error = e
            return None


def create_crawler(
    start_urls: List[str],
    mode: str = "beautifulsoup",
    max_requests: int = 100,
    max_concurrency: int = 10,
    request_handler: Optional[Callable] = None,
    headless: bool = True,
    browser_type: str = "chromium",
    proxy: Optional[str] = None,
    timeout: int = 30,
    user_agent: Optional[str] = None,
    output_dir: str = "./crawler_output",
    json_output: bool = True,
    retry_on_blocked: bool = True,
    max_session_rotations: int = 5,
    max_request_retries: int = 0,
    min_delay: float = 1.0,
    max_delay: float = 3.0,
    verbose: bool = False,
) -> UniversalCrawler:
    """Build a UniversalCrawler from plain keyword arguments.

    ``mode`` is compared case-insensitively: "playwright" selects
    CrawlerMode.PLAYWRIGHT and any other value selects
    CrawlerMode.BEAUTIFULSOUP. ``max_requests`` is stored as
    ``max_requests_per_crawl``; every other argument is passed to
    CrawlerConfig under its own name.
    """
    # Resolve the textual mode into the enum member.
    if mode.lower() == "playwright":
        selected_mode = CrawlerMode.PLAYWRIGHT
    else:
        selected_mode = CrawlerMode.BEAUTIFULSOUP

    # Collect the settings first, grouped by concern, then build the config.
    settings = dict(
        start_urls=start_urls,
        mode=selected_mode,
        request_handler=request_handler,
        max_requests_per_crawl=max_requests,
        max_concurrency=max_concurrency,
        # Browser
        headless=headless,
        browser_type=browser_type,
        # Requests
        proxy=proxy,
        timeout=timeout,
        user_agent=user_agent,
        # Blocking / retries
        retry_on_blocked=retry_on_blocked,
        max_session_rotations=max_session_rotations,
        max_request_retries=max_request_retries,
        # Delays
        min_delay=min_delay,
        max_delay=max_delay,
        # Logging and output
        verbose=verbose,
        output_dir=output_dir,
        json_output=json_output,
    )
    return UniversalCrawler(CrawlerConfig(**settings))

Metadata

Metadata

Assignees

No one assigned

    Labels

    t-tooling: Issues with this label are in the ownership of the tooling team.

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions