diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index d9f9d332faed..0bd87c44cfec 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -29,6 +29,7 @@ import { CriticalError, NonRetryableError, RequestQueue, + RequestState, Router, SessionPool, Statistics, @@ -918,6 +919,7 @@ export class BasicCrawler this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, @@ -934,14 +936,17 @@ export class BasicCrawler this._requestFunctionErrorHandler(err as Error, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`, ); + request.state = RequestState.DONE; } catch (secondaryError: any) { if (!secondaryError.triggeredFromUserHandler) { const apifySpecific = process.env.APIFY_IS_AT_HOME @@ -949,6 +954,7 @@ export class BasicCrawler Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), - this.requestHandlerTimeoutMillis, - `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`, - ); + request.state = RequestState.REQUEST_HANDLER; + try { + await addTimeoutToPromise( + () => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)), + this.requestHandlerTimeoutMillis, + `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`, + ); + + request.state = RequestState.DONE; + } catch (e: any) { + request.state = RequestState.ERROR; + throw e; + } tryCancel(); if (session) session.markGood(); @@ -530,6 +539,7 @@ export abstract class BrowserCrawler< const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request); + crawlingContext.request.state = RequestState.BEFORE_NAV; await this._executeHooks(this.preNavigationHooks, crawlingContext, gotoOptions); tryCancel(); @@ -542,10 +552,12 @@ export 
abstract class BrowserCrawler< } catch (error) { await this._handleNavigationTimeout(crawlingContext, error as Error); + crawlingContext.request.state = RequestState.ERROR; throw error; } - tryCancel(); + + crawlingContext.request.state = RequestState.AFTER_NAV; await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions); } diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index cee570e2c26b..6104301a6b30 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -28,8 +28,19 @@ const requestOptionalPredicates = { keepUrlFragment: ow.optional.boolean, useExtendedUniqueKey: ow.optional.boolean, skipNavigation: ow.optional.boolean, + state: ow.optional.number.greaterThanOrEqual(0).lessThanOrEqual(6), }; +export enum RequestState { + UNPROCESSED, + BEFORE_NAV, + AFTER_NAV, + REQUEST_HANDLER, + DONE, + ERROR_HANDLER, + ERROR, +} + /** * Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata. * The `Request` object also stores information about errors that occurred during processing of the request. @@ -111,6 +122,9 @@ export class Request { */ handledAt?: string; + /** Describes the request's current lifecycle stage. */ + state: RequestState = RequestState.UNPROCESSED; + /** * `Request` parameters including the URL, HTTP method and headers, and others. 
*/ diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index ef13072a8b2f..eb0af16c0abd 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -18,6 +18,7 @@ import { Router, validators, Configuration, + RequestState, } from '@crawlee/basic'; import type { Awaitable, Dictionary } from '@crawlee/types'; import type { RequestLike, ResponseLike } from 'content-type'; @@ -452,11 +453,18 @@ export class HttpCrawler Promise.resolve(this.requestHandler(crawlingContext)), - this.userRequestHandlerTimeoutMillis, - `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`, - ); + request.state = RequestState.REQUEST_HANDLER; + try { + await addTimeoutToPromise( + () => Promise.resolve(this.requestHandler(crawlingContext)), + this.userRequestHandlerTimeoutMillis, + `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`, + ); + request.state = RequestState.DONE; + } catch (e: any) { + request.state = RequestState.ERROR; + throw e; + } } protected async _handleNavigation(crawlingContext: Context) { @@ -464,6 +472,7 @@ export class HttpCrawler { +export function injectJQuery(page: Page, options?: { surviveNavigations?: boolean }): Promise { ow(page, ow.object.validate(validators.browserPage)); - return injectFile(page, jqueryPath, { surviveNavigations: true }); + return injectFile(page, jqueryPath, { surviveNavigations: options?.surviveNavigations ?? 
true }); } export interface DirectNavigationOptions { @@ -734,7 +735,14 @@ export interface PlaywrightContextUtils { export function registerUtilsToContext(context: PlaywrightCrawlingContext): void { context.injectFile = (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options); - context.injectJQuery = () => injectJQuery(context.page); + context.injectJQuery = (async () => { + if (context.request.state === RequestState.BEFORE_NAV) { + log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.'); + await injectJQuery(context.page); + return; + } + await injectJQuery(context.page, { surviveNavigations: false }); + }); context.blockRequests = (options?: BlockRequestsOptions) => blockRequests(context.page, options); context.parseWithCheerio = () => parseWithCheerio(context.page); context.infiniteScroll = (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options); diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts index f8af5d36863e..5f2122603e58 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts @@ -26,7 +26,7 @@ import type { ProtocolMapping } from 'devtools-protocol/types/protocol-mapping.j import type { Page, HTTPResponse, ResponseForRequest, HTTPRequest as PuppeteerRequest } from 'puppeteer'; import log_ from '@apify/log'; import type { Request } from '@crawlee/browser'; -import { KeyValueStore, validators } from '@crawlee/browser'; +import { KeyValueStore, RequestState, validators } from '@crawlee/browser'; import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types'; import type { CheerioRoot } from '@crawlee/utils'; import * as cheerio from 'cheerio'; @@ -144,7 +144,7 @@ export async function injectFile(page: Page, 
filePath: string, options: InjectFi * other libraries included by the page that use the same variable name (e.g. another version of jQuery). * This can affect functionality of page's scripts. * - * The injected jQuery will survive page navigations and reloads. + * The injected jQuery will survive page navigations and reloads by default. * * **Example usage:** * ```javascript @@ -159,10 +159,11 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi * function in any way. * * @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object. + * @param [options.surviveNavigations] Opt-out option to disable the JQuery reinjection after navigation. */ -export function injectJQuery(page: Page): Promise { +export function injectJQuery(page: Page, options?: { surviveNavigations?: boolean }): Promise { ow(page, ow.object.validate(validators.browserPage)); - return injectFile(page, jqueryPath, { surviveNavigations: true }); + return injectFile(page, jqueryPath, { surviveNavigations: options?.surviveNavigations ?? true }); } /** @@ -911,7 +912,14 @@ export interface PuppeteerContextUtils { /** @internal */ export function registerUtilsToContext(context: PuppeteerCrawlingContext): void { context.injectFile = (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options); - context.injectJQuery = () => injectJQuery(context.page); + context.injectJQuery = (async () => { + if (context.request.state === RequestState.BEFORE_NAV) { + log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. 
Use it in a postNavigationHook or a requestHandler instead.'); + await injectJQuery(context.page); + return; + } + await injectJQuery(context.page, { surviveNavigations: false }); + }); context.parseWithCheerio = () => parseWithCheerio(context.page); context.enqueueLinksByClickingElements = (options: Omit) => enqueueLinksByClickingElements({ page: context.page, diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index 6e831957fe09..4a5688a411b9 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -19,7 +19,7 @@ import { MissingRouteError, } from '@crawlee/basic'; import { - AutoscaledPool, + AutoscaledPool, RequestState, } from '@crawlee/core'; import express from 'express'; import type { Dictionary } from '@crawlee/utils'; @@ -328,6 +328,35 @@ describe('BasicCrawler', () => { expect(await requestList.isEmpty()).toBe(true); }); + test('should correctly track request.state', async () => { + const sources = [ + { url: 'http://example.com/1' }, + ]; + const requestList = await RequestList.open(null, sources); + const requestStates: RequestState[] = []; + + const requestHandler: RequestHandler = async ({ request }) => { + requestStates.push(request.state); + throw new Error('Error'); + }; + + const errorHandler: RequestHandler = async ({ request }) => { + requestStates.push(request.state); + }; + + const basicCrawler = new BasicCrawler({ + requestList, + maxConcurrency: 1, + maxRequestRetries: 1, + requestHandler, + errorHandler, + }); + + await basicCrawler.run(); + + expect(requestStates).toEqual([RequestState.REQUEST_HANDLER, RequestState.ERROR_HANDLER, RequestState.REQUEST_HANDLER]); + }); + test('should use errorHandler', async () => { const sources = [{ url: 'http://example.com/', label: 'start' }]; diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts index 56095a79f7cf..c781000f3b40 100644 --- 
a/test/core/crawlers/browser_crawler.test.ts +++ b/test/core/crawlers/browser_crawler.test.ts @@ -13,6 +13,7 @@ import { ProxyConfiguration, Request, RequestList, + RequestState, Session, } from '@crawlee/puppeteer'; import { gotScraping } from 'got-scraping'; @@ -260,6 +261,51 @@ describe('BrowserCrawler', () => { expect(result[0]).toBe(serverAddress); }); + test('should correctly track request.state', async () => { + const sources = [ + { url: `${serverAddress}/?q=1` }, + ]; + const requestList = await RequestList.open(null, sources); + const requestStates: RequestState[] = []; + + const browserCrawler = new BrowserCrawlerTest({ + browserPoolOptions: { + browserPlugins: [puppeteerPlugin], + }, + requestList, + preNavigationHooks: [ + async ({ request }) => { + requestStates.push(request.state); + }, + ], + postNavigationHooks: [ + async ({ request }) => { + requestStates.push(request.state); + }, + ], + requestHandler: async ({ request }) => { + requestStates.push(request.state); + throw new Error('Error'); + }, + maxRequestRetries: 1, + errorHandler: async ({ request }) => { + requestStates.push(request.state); + }, + }); + + await browserCrawler.run(); + + expect(requestStates).toEqual([ + RequestState.BEFORE_NAV, + RequestState.AFTER_NAV, + RequestState.REQUEST_HANDLER, + RequestState.ERROR_HANDLER, + RequestState.BEFORE_NAV, + RequestState.AFTER_NAV, + RequestState.REQUEST_HANDLER, + ]); + }); + test('should allow modifying gotoOptions by pre navigation hooks', async () => { + const requestList = await RequestList.open({ sources: [