fix: injectJQuery in context does not survive navs (#1661)
barjin committed Nov 11, 2022
1 parent 22467ca commit 493a7cf
Showing 8 changed files with 155 additions and 22 deletions.
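
For context, a minimal usage sketch of the patched context helper called from a request handler; the URL and the handler body are illustrative and not taken from this commit:

```typescript
import { PlaywrightCrawler } from 'crawlee';

const crawler = new PlaywrightCrawler({
    async requestHandler({ injectJQuery, page, log }) {
        // After this commit, the context helper injects jQuery into the current
        // page only (surviveNavigations: false), so it is not re-injected into
        // pages visited by later navigations.
        await injectJQuery();
        const title = await page.evaluate(() => (globalThis as any).$('title').text() as string);
        log.info(`Page title: ${title}`);
    },
});

await crawler.run(['https://example.com']);
```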
6 changes: 6 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -29,6 +29,7 @@ import {
CriticalError,
NonRetryableError,
RequestQueue,
RequestState,
Router,
SessionPool,
Statistics,
@@ -918,6 +919,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.crawlingContexts.set(crawlingContext.id, crawlingContext);

try {
request.state = RequestState.REQUEST_HANDLER;
await addTimeoutToPromise(
() => this._runRequestHandler(crawlingContext),
this.requestHandlerTimeoutMillis,
@@ -934,21 +936,25 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.handledRequestsCount++;

// reclaim session if request finishes successfully
request.state = RequestState.DONE;
session?.markGood();
} catch (err) {
try {
request.state = RequestState.ERROR_HANDLER;
await addTimeoutToPromise(
() => this._requestFunctionErrorHandler(err as Error, crawlingContext, source),
this.internalTimeoutMillis,
`Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
);
request.state = RequestState.DONE;
} catch (secondaryError: any) {
if (!secondaryError.triggeredFromUserHandler) {
const apifySpecific = process.env.APIFY_IS_AT_HOME
? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.` : '';
this.log.exception(secondaryError as Error, 'An exception occurred during handling of failed request. '
+ `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
}
request.state = RequestState.ERROR;
throw secondaryError;
}
// decrease the session score if the request fails (but the error handler did not throw)
24 changes: 18 additions & 6 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -22,6 +22,7 @@ import {
Configuration,
BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
BasicCrawler,
RequestState,
} from '@crawlee/basic';
import type {
BrowserController,
@@ -486,11 +487,19 @@ export abstract class BrowserCrawler<
}
}

await addTimeoutToPromise(
() => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)),
this.requestHandlerTimeoutMillis,
`requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`,
);
request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
() => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)),
this.requestHandlerTimeoutMillis,
`requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`,
);

request.state = RequestState.DONE;
} catch (e: any) {
request.state = RequestState.ERROR;
throw e;
}
tryCancel();

if (session) session.markGood();
@@ -530,6 +539,7 @@ export abstract class BrowserCrawler<

const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);

crawlingContext.request.state = RequestState.BEFORE_NAV;
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotoOptions);
tryCancel();

@@ -542,10 +552,12 @@
} catch (error) {
await this._handleNavigationTimeout(crawlingContext, error as Error);

crawlingContext.request.state = RequestState.ERROR;
throw error;
}

tryCancel();

crawlingContext.request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions);
}

14 changes: 14 additions & 0 deletions packages/core/src/request.ts
@@ -28,8 +28,19 @@ const requestOptionalPredicates = {
keepUrlFragment: ow.optional.boolean,
useExtendedUniqueKey: ow.optional.boolean,
skipNavigation: ow.optional.boolean,
state: ow.optional.number.greaterThanOrEqual(0).lessThanOrEqual(6),
};

export enum RequestState {
UNPROCESSED,
BEFORE_NAV,
AFTER_NAV,
REQUEST_HANDLER,
DONE,
ERROR_HANDLER,
ERROR,
}

/**
* Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata.
* The `Request` object also stores information about errors that occurred during processing of the request.
@@ -111,6 +122,9 @@ export class Request<UserData extends Dictionary = Dictionary> {
*/
handledAt?: string;

/** Describes the request's current lifecycle stage. */
state: RequestState = RequestState.UNPROCESSED;

/**
* `Request` parameters including the URL, HTTP method and headers, and others.
*/
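
The enum is also exported to user land, so hooks and handlers can inspect it; a small hypothetical helper (not part of this commit) that turns the lifecycle stages into readable labels:

```typescript
import { RequestState, type Request } from '@crawlee/core';

// Hypothetical helper: maps the new lifecycle field to a human-readable label,
// e.g. for custom logging inside hooks or handlers.
function describeRequestState(request: Request): string {
    switch (request.state) {
        case RequestState.UNPROCESSED: return 'not processed yet';
        case RequestState.BEFORE_NAV: return 'running preNavigationHooks';
        case RequestState.AFTER_NAV: return 'running postNavigationHooks';
        case RequestState.REQUEST_HANDLER: return 'running the requestHandler';
        case RequestState.DONE: return 'handled successfully';
        case RequestState.ERROR_HANDLER: return 'running the errorHandler';
        case RequestState.ERROR: return 'failed';
        default: return 'unknown';
    }
}
```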
20 changes: 15 additions & 5 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -18,6 +18,7 @@ import {
Router,
validators,
Configuration,
RequestState,
} from '@crawlee/basic';
import type { Awaitable, Dictionary } from '@crawlee/types';
import type { RequestLike, ResponseLike } from 'content-type';
@@ -452,18 +453,26 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

return addTimeoutToPromise(
() => Promise.resolve(this.requestHandler(crawlingContext)),
this.userRequestHandlerTimeoutMillis,
`requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`,
);
request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
() => Promise.resolve(this.requestHandler(crawlingContext)),
this.userRequestHandlerTimeoutMillis,
`requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`,
);
request.state = RequestState.DONE;
} catch (e: any) {
request.state = RequestState.ERROR;
throw e;
}
}

protected async _handleNavigation(crawlingContext: Context) {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
const preNavigationHooksCookies = this._getCookieHeaderFromRequest(request);

request.state = RequestState.BEFORE_NAV;
// Execute pre navigation hooks before applying session pool cookies,
// as they may also set cookies in the session
await this._executeHooks(this.preNavigationHooks, crawlingContext, gotOptions);
@@ -482,6 +491,7 @@
);
tryCancel();

request.state = RequestState.AFTER_NAV;
await this._executeHooks(this.postNavigationHooks, crawlingContext, gotOptions);
tryCancel();
}
18 changes: 13 additions & 5 deletions packages/playwright-crawler/src/internals/utils/playwright-utils.ts
@@ -25,7 +25,7 @@ import type { Page, Response, Route } from 'playwright';
import { LruCache } from '@apify/datastructures';
import log_ from '@apify/log';
import type { Request } from '@crawlee/browser';
import { validators, KeyValueStore } from '@crawlee/browser';
import { validators, KeyValueStore, RequestState } from '@crawlee/browser';
import type { CheerioRoot, Dictionary } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { BatchAddRequestsResult } from '@crawlee/types';
@@ -112,7 +112,7 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi
* other libraries included by the page that use the same variable name (e.g. another version of jQuery).
* This can affect functionality of page's scripts.
*
* The injected jQuery will survive page navigations and reloads.
* The injected jQuery will survive page navigations and reloads by default.
*
* **Example usage:**
* ```javascript
@@ -127,10 +127,11 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi
* function in any way.
*
* @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
* @param [options.surviveNavigations] Set to `false` to disable re-injecting jQuery after each navigation. Defaults to `true`.
*/
export function injectJQuery(page: Page): Promise<unknown> {
export function injectJQuery(page: Page, options?: { surviveNavigations?: boolean }): Promise<unknown> {
ow(page, ow.object.validate(validators.browserPage));
return injectFile(page, jqueryPath, { surviveNavigations: true });
return injectFile(page, jqueryPath, { surviveNavigations: options?.surviveNavigations ?? true });
}

export interface DirectNavigationOptions {
@@ -734,7 +735,14 @@ export interface PlaywrightContextUtils {

export function registerUtilsToContext(context: PlaywrightCrawlingContext): void {
context.injectFile = (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options);
context.injectJQuery = () => injectJQuery(context.page);
context.injectJQuery = (async () => {
if (context.request.state === RequestState.BEFORE_NAV) {
log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.');
await injectJQuery(context.page);
return;
}
await injectJQuery(context.page, { surviveNavigations: false });
});
context.blockRequests = (options?: BlockRequestsOptions) => blockRequests(context.page, options);
context.parseWithCheerio = () => parseWithCheerio(context.page);
context.infiniteScroll = (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options);
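
Outside a crawler, the new parameter can be passed to the standalone utility directly; a sketch assuming direct page management via `launchPlaywright` (the URL is a placeholder):

```typescript
import { launchPlaywright, playwrightUtils } from 'crawlee';

const browser = await launchPlaywright();
const page = await browser.newPage();
await page.goto('https://example.com');

// Inject jQuery into the current page only; it will not be re-injected
// after a later page.goto().
await playwrightUtils.injectJQuery(page, { surviveNavigations: false });

const heading = await page.evaluate(() => (globalThis as any).$('h1').text() as string);
console.log(heading);

await browser.close();
```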
18 changes: 13 additions & 5 deletions packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts
@@ -26,7 +26,7 @@ import type { ProtocolMapping } from 'devtools-protocol/types/protocol-mapping.j
import type { Page, HTTPResponse, ResponseForRequest, HTTPRequest as PuppeteerRequest } from 'puppeteer';
import log_ from '@apify/log';
import type { Request } from '@crawlee/browser';
import { KeyValueStore, validators } from '@crawlee/browser';
import { KeyValueStore, RequestState, validators } from '@crawlee/browser';
import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
import type { CheerioRoot } from '@crawlee/utils';
import * as cheerio from 'cheerio';
@@ -144,7 +144,7 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi
* other libraries included by the page that use the same variable name (e.g. another version of jQuery).
* This can affect functionality of page's scripts.
*
* The injected jQuery will survive page navigations and reloads.
* The injected jQuery will survive page navigations and reloads by default.
*
* **Example usage:**
* ```javascript
@@ -159,10 +159,11 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi
* function in any way.
*
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
* @param [options.surviveNavigations] Set to `false` to disable re-injecting jQuery after each navigation. Defaults to `true`.
*/
export function injectJQuery(page: Page): Promise<unknown> {
export function injectJQuery(page: Page, options?: { surviveNavigations?: boolean }): Promise<unknown> {
ow(page, ow.object.validate(validators.browserPage));
return injectFile(page, jqueryPath, { surviveNavigations: true });
return injectFile(page, jqueryPath, { surviveNavigations: options?.surviveNavigations ?? true });
}

/**
@@ -911,7 +912,14 @@ export interface PuppeteerContextUtils {
/** @internal */
export function registerUtilsToContext(context: PuppeteerCrawlingContext): void {
context.injectFile = (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options);
context.injectJQuery = () => injectJQuery(context.page);
context.injectJQuery = (async () => {
if (context.request.state === RequestState.BEFORE_NAV) {
log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.');
await injectJQuery(context.page);
return;
}
await injectJQuery(context.page, { surviveNavigations: false });
});
context.parseWithCheerio = () => parseWithCheerio(context.page);
context.enqueueLinksByClickingElements = (options: Omit<EnqueueLinksByClickingElementsOptions, 'page' | 'requestQueue'>) => enqueueLinksByClickingElements({
page: context.page,
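
The context-level behavior is the same for Puppeteer; a hedged sketch of where the helper is now expected to be called (the URL and handlers are illustrative):

```typescript
import { PuppeteerCrawler } from 'crawlee';

const crawler = new PuppeteerCrawler({
    preNavigationHooks: [
        async ({ injectJQuery }) => {
            // Still possible, but now logs a warning and falls back to the
            // navigation-surviving injection, since the page has not navigated yet.
            await injectJQuery();
        },
    ],
    async requestHandler({ injectJQuery, page, log }) {
        // Recommended place: jQuery is injected into the already loaded page only.
        await injectJQuery();
        const linkCount = await page.evaluate(() => (globalThis as any).$('a').length as number);
        log.info(`Found ${linkCount} links.`);
    },
});

await crawler.run(['https://example.com']);
```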
31 changes: 30 additions & 1 deletion test/core/crawlers/basic_crawler.test.ts
@@ -19,7 +19,7 @@ import {
MissingRouteError,
} from '@crawlee/basic';
import {
AutoscaledPool,
AutoscaledPool, RequestState,
} from '@crawlee/core';
import express from 'express';
import type { Dictionary } from '@crawlee/utils';
@@ -328,6 +328,35 @@ describe('BasicCrawler', () => {
expect(await requestList.isEmpty()).toBe(true);
});

test('should correctly track request.state', async () => {
const sources = [
{ url: 'http://example.com/1' },
];
const requestList = await RequestList.open(null, sources);
const requestStates: RequestState[] = [];

const requestHandler: RequestHandler = async ({ request }) => {
requestStates.push(request.state);
throw new Error('Error');
};

const errorHandler: RequestHandler = async ({ request }) => {
requestStates.push(request.state);
};

const basicCrawler = new BasicCrawler({
requestList,
maxConcurrency: 1,
maxRequestRetries: 1,
requestHandler,
errorHandler,
});

await basicCrawler.run();

expect(requestStates).toEqual([RequestState.REQUEST_HANDLER, RequestState.ERROR_HANDLER, RequestState.REQUEST_HANDLER]);
});

test('should use errorHandler', async () => {
const sources = [{ url: 'http://example.com/', label: 'start' }];

46 changes: 46 additions & 0 deletions test/core/crawlers/browser_crawler.test.ts
@@ -13,6 +13,7 @@ import {
ProxyConfiguration,
Request,
RequestList,
RequestState,
Session,
} from '@crawlee/puppeteer';
import { gotScraping } from 'got-scraping';
@@ -260,6 +261,51 @@ describe('BrowserCrawler', () => {
expect(result[0]).toBe(serverAddress);
});

test('should correctly track request.state', async () => {
const sources = [
{ url: `${serverAddress}/?q=1` },
];
const requestList = await RequestList.open(null, sources);
const requestStates: RequestState[] = [];

const browserCrawler = new BrowserCrawlerTest({
browserPoolOptions: {
browserPlugins: [puppeteerPlugin],
},
requestList,
preNavigationHooks: [
async ({ request }) => {
requestStates.push(request.state);
},
],
postNavigationHooks: [
async ({ request }) => {
requestStates.push(request.state);
},
],
requestHandler: async ({ request }) => {
requestStates.push(request.state);
throw new Error('Error');
},
maxRequestRetries: 1,
errorHandler: async ({ request }) => {
requestStates.push(request.state);
},
});

await browserCrawler.run();

expect(requestStates).toEqual([
RequestState.BEFORE_NAV,
RequestState.AFTER_NAV,
RequestState.REQUEST_HANDLER,
RequestState.ERROR_HANDLER,
RequestState.BEFORE_NAV,
RequestState.AFTER_NAV,
RequestState.REQUEST_HANDLER,
]);
});

test('should allow modifying gotoOptions by pre navigation hooks', async () => {
const requestList = await RequestList.open({
sources: [
