
feat: retryOnBlocked detects blocked webpage #1956

Merged · 6 commits · Jul 19, 2023
27 changes: 27 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -261,6 +261,15 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
statusMessageLoggingInterval?: number;

/**
* If set to `true`, the crawler will automatically try to bypass any detected bot protection.
*
* Currently supports:
* - [**Cloudflare** Bot Management](https://www.cloudflare.com/products/bot-management/)
* - [**Google Search** Rate Limiting](https://www.google.com/sorry/)
*/
retryOnBlocked?: boolean;

/** @internal */
log?: Log;
}
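For context, a minimal usage sketch of the new option (the crawler class, handler body, and URL are illustrative, not part of this PR):

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Retire the session and retry the request whenever known
    // bot-protection markup is detected in the response.
    retryOnBlocked: true,
    async requestHandler({ request, $ }) {
        console.log(`${request.url}: ${$('title').text()}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```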
@@ -387,6 +396,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected crawlingContexts = new Map<string, Context>();
protected autoscaledPoolOptions: AutoscaledPoolOptions;
protected events: EventManager;
protected retryOnBlocked: boolean;
private _closeEvents?: boolean;

protected static optionsShape = {
@@ -412,6 +422,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
useSessionPool: ow.optional.boolean,
statusMessageLoggingInterval: ow.optional.number,

retryOnBlocked: ow.optional.boolean,

// AutoscaledPool shorthands
minConcurrency: ow.optional.number,
maxConcurrency: ow.optional.number,
@@ -443,6 +455,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
maxConcurrency,
maxRequestsPerMinute,

retryOnBlocked = false,

// internal
log = defaultLog.child({ prefix: this.constructor.name }),

@@ -503,6 +517,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
}

this.retryOnBlocked = retryOnBlocked;

this._handlePropertyNameChange({
newName: 'requestHandlerTimeoutSecs',
oldName: 'handleRequestTimeoutSecs',
@@ -526,6 +542,13 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
...sessionPoolOptions,
log,
};
if (this.retryOnBlocked) {
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set.
Please note that the 'retryOnBlocked' feature might not work as expected.`);
}
}
this.useSessionPool = useSessionPool;
this.crawlingContexts = new Map();
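The warning a few lines above fires for configurations like the following hypothetical sketch: retryOnBlocked empties the default blockedStatusCodes so that challenge responses (e.g. Cloudflare's 403 pages) can reach the content-based detector, and re-adding status codes can short-circuit that.

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    retryOnBlocked: true,
    sessionPoolOptions: {
        // Retires the session on the status code alone, before the
        // content-based isRequestBlocked check can inspect the page.
        blockedStatusCodes: [403],
    },
    async requestHandler() { /* ... */ },
});
```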

@@ -593,6 +616,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
}

protected isRequestBlocked(_crawlingContext: Context) {
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
}

private setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
this.log.debug(`${options.isStatusMessageTerminal ? 'Terminal status message' : 'Status message'}: ${message}`);

21 changes: 21 additions & 0 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -37,6 +37,7 @@ import type {
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import ow from 'ow';
import type { Cookie as CookieObject } from '@crawlee/types';
import { RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import type { BrowserLaunchContext } from './browser-launcher';

export interface BrowserCrawlingContext<
@@ -437,6 +438,19 @@ export abstract class BrowserCrawler<
}
}

protected override async isRequestBlocked(crawlingContext: Context): Promise<boolean> {
const { page, response } = crawlingContext;

// Cloudflare-specific heuristic - if we get a 403, wait 5 seconds for the JS challenge to load and resolve.
if (response?.status() === 403) {
    await sleep(5000);
}

return (
await Promise.all(RETRY_CSS_SELECTORS.map((selector) => (page as any).$(selector)))
).some((el) => el !== null);
}

/**
* Wrapper around requestHandler that opens and closes pages etc.
*/
@@ -493,6 +507,13 @@
}
}

if (this.retryOnBlocked) {
if (await this.isRequestBlocked(crawlingContext)) {
session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
    }
}

Review comment (Member):

maybe we should throw RetryRequestError? but that could end up with infinite retries. maybe better to dynamically increase the request.maxRetries instead and have some max, e.g. 10

not sure how easy it is to get around those blocking errors just by picking new session/proxy? it sounds safer to not count this into the retry limit

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
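Since isRequestBlocked is a protected method, a subclass can layer its own heuristics on top of the built-in ones. A hypothetical sketch (the captcha selector is made up):

```ts
import { PlaywrightCrawler } from 'crawlee';
import type { PlaywrightCrawlingContext } from 'crawlee';

class MyCrawler extends PlaywrightCrawler {
    protected override async isRequestBlocked(ctx: PlaywrightCrawlingContext): Promise<boolean> {
        // Run the built-in Cloudflare / Google Search checks first.
        if (await super.isRequestBlocked(ctx)) return true;
        // Illustrative site-specific check for a custom captcha wall.
        return (await ctx.page.$('#custom-captcha-wall')) !== null;
    }
}
```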
1 change: 1 addition & 0 deletions packages/http-crawler/package.json
@@ -57,6 +57,7 @@
"@apify/utilities": "^2.0.0",
"@crawlee/basic": "^3.4.1",
"@crawlee/types": "^3.4.1",
"@crawlee/utils": "^3.4.1",
"@types/content-type": "^1.1.5",
"cheerio": "^1.0.0-rc.12",
"content-type": "^1.0.4",
15 changes: 15 additions & 0 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -22,6 +22,7 @@ import {
Configuration,
RequestState,
} from '@crawlee/basic';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
import type { Awaitable, Dictionary } from '@crawlee/types';
import type { RequestLike, ResponseLike } from 'content-type';
import * as cheerio from 'cheerio';
@@ -464,6 +465,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

if (this.retryOnBlocked && await this.isRequestBlocked(crawlingContext)) {
crawlingContext.session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
}

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
@@ -478,6 +484,15 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
}
}

protected override async isRequestBlocked(crawlingContext: Context) {
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
const $ = await crawlingContext.parseWithCheerio();

return RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
}
return false;
}

protected async _handleNavigation(crawlingContext: Context) {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
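To see what the HTTP-side detection matches, the same check can be run standalone; a small sketch using cheerio (the HTML snippet is made up):

```ts
import * as cheerio from 'cheerio';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';

const html = `
  <html><body>
    <iframe src="https://challenges.cloudflare.com/cdn-cgi/challenge-platform"></iframe>
  </body></html>`;

const $ = cheerio.load(html);
// Mirrors HttpCrawler.isRequestBlocked: any selector hit means "blocked".
const blocked = RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
console.log(blocked); // true - the Cloudflare challenge iframe matches
```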
1 change: 1 addition & 0 deletions packages/utils/src/index.ts
@@ -1,3 +1,4 @@
export * from './internals/blocked';
export * from './internals/cheerio';
export * from './internals/chunk';
export * from './internals/extract-urls';
7 changes: 7 additions & 0 deletions packages/utils/src/internals/blocked.ts
@@ -0,0 +1,7 @@
/**
* CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.
*/
export const RETRY_CSS_SELECTORS = [
    // Cloudflare's JS-challenge / Turnstile iframe.
    'iframe[src^="https://challenges.cloudflare.com"]',
    // Terms-of-service link shown on Google Search's /sorry/ rate-limit interstitial.
    'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]',
];