feat(Request): add requestHandlerTimeoutSecs and requestTimeoutSecs options #1560

Draft · wants to merge 5 commits into master
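This PR lets individual requests override the crawler-wide timeouts via two new `Request` options, `requestTimeoutSecs` and `requestHandlerTimeoutSecs`. A minimal usage sketch of the intended behavior (the URLs and timeout values are illustrative, not taken from the PR):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    // Crawler-wide default; applies when a request does not set its own value.
    requestHandlerTimeoutSecs: 60,
    requestHandler: async ({ request }) => {
        // ... process request.url
    },
});

await crawler.run([
    // Overrides both crawler-level timeouts for this request only.
    { url: 'https://example.com/slow', requestHandlerTimeoutSecs: 120, requestTimeoutSecs: 30 },
    // Falls back to the crawler defaults.
    { url: 'https://example.com/fast' },
]);
```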
21 changes: 13 additions & 8 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -821,13 +821,18 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

tryCancel();

+ // A request can define its own request-specific timeouts. If they are set, those are
+ // used instead; otherwise, the crawler-level defaults apply.
+ request!.requestTimeoutSecs ??= this.internalTimeoutMillis / 1e3;
+ request!.requestHandlerTimeoutSecs ??= this.requestHandlerTimeoutMillis / 1e3;

if (this.useSessionPool) {
await this._timeoutAndRetry(
async () => {
session = await this.sessionPool!.getSession();
},
- this.internalTimeoutMillis,
- `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
+ request!.requestTimeoutSecs * 1e3,
+ `Fetching session timed out after ${request!.requestTimeoutSecs} seconds.`,
);
}

@@ -888,14 +893,14 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
try {
await addTimeoutToPromise(
() => this._runRequestHandler(crawlingContext),
- this.requestHandlerTimeoutMillis,
- `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`,
+ request.requestHandlerTimeoutSecs * 1e3,
+ `requestHandler timed out after ${request.requestHandlerTimeoutSecs} seconds (${request.id}).`,
);

await this._timeoutAndRetry(
() => source.markRequestHandled(request!),
- this.internalTimeoutMillis,
- `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
+ request.requestTimeoutSecs * 1e3,
+ `Marking request ${request.url} (${request.id}) as handled timed out after ${request.requestTimeoutSecs} seconds.`,
);

this.stats.finishJob(statisticsId);
@@ -907,8 +912,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
try {
await addTimeoutToPromise(
() => this._requestFunctionErrorHandler(err as Error, crawlingContext, source),
- this.internalTimeoutMillis,
- `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
+ request.requestTimeoutSecs * 1e3,
+ `Handling request failure of ${request.url} (${request.id}) timed out after ${request.requestTimeoutSecs} seconds.`,
);
} catch (secondaryError: any) {
if (!secondaryError.triggeredFromUserHandler) {
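The fallback in `basic-crawler.ts` relies on nullish assignment: a per-request value (in seconds) wins when present, otherwise the crawler-level millisecond default is converted. A standalone sketch of the same pattern, with illustrative values:

```ts
// Per-request seconds win; otherwise convert the crawler-level ms default.
const internalTimeoutMillis = 30_000;

const plain: { requestTimeoutSecs?: number } = {};
plain.requestTimeoutSecs ??= internalTimeoutMillis / 1e3;
console.log(plain.requestTimeoutSecs); // 30

const overridden: { requestTimeoutSecs?: number } = { requestTimeoutSecs: 5 };
overridden.requestTimeoutSecs ??= internalTimeoutMillis / 1e3;
console.log(overridden.requestTimeoutSecs); // 5 - already set, so ??= is a no-op
```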
11 changes: 8 additions & 3 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -11,6 +11,7 @@ import type {
RequestHandler,
ErrorHandler,
EnqueueLinksOptions,
+ Request,
} from '@crawlee/basic';
import {
cookieStringToToughCookie,
@@ -482,10 +483,12 @@
}
}

+ request.requestHandlerTimeoutSecs ??= this.requestHandlerTimeoutMillis / 1e3;

await addTimeoutToPromise(
() => Promise.resolve(this.userProvidedRequestHandler(crawlingContext)),
- this.requestHandlerTimeoutMillis,
- `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds.`,
+ request.requestHandlerTimeoutSecs * 1e3,
+ `requestHandler timed out after ${request.requestHandlerTimeoutSecs} seconds.`,
);
tryCancel();

@@ -522,7 +525,9 @@
}

protected async _handleNavigation(crawlingContext: Context) {
- const gotoOptions = { timeout: this.navigationTimeoutMillis } as unknown as GoToOptions;
+ crawlingContext.request.requestTimeoutSecs ??= this.navigationTimeoutMillis / 1e3;
+
+ const gotoOptions = { timeout: crawlingContext.request.requestTimeoutSecs * 1e3 } as unknown as GoToOptions;

const preNavigationHooksCookies = this._getCookieHeaderFromRequest(crawlingContext.request);

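In the browser crawler, the per-request value now also drives the navigation timeout carried by `gotoOptions`. A sketch of the resulting behavior, assuming a Playwright-style `page` (the helper function below is hypothetical, not part of the PR):

```ts
import type { Page } from 'playwright';

// Hypothetical helper: per-request seconds fall back to the crawler's
// millisecond navigation default, then feed page.goto, which expects ms.
async function navigateWithTimeout(
    page: Page,
    url: string,
    requestTimeoutSecs: number | undefined,
    navigationTimeoutMillis: number,
) {
    const timeoutSecs = requestTimeoutSecs ?? navigationTimeoutMillis / 1e3;
    return page.goto(url, { timeout: timeoutSecs * 1e3 });
}
```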
29 changes: 28 additions & 1 deletion packages/core/src/request.ts
@@ -28,6 +28,8 @@ const requestOptionalPredicates = {
keepUrlFragment: ow.optional.boolean,
useExtendedUniqueKey: ow.optional.boolean,
skipNavigation: ow.optional.boolean,
+ requestHandlerTimeoutSecs: ow.optional.number,
+ requestTimeoutSecs: ow.optional.number,
};

/**
@@ -111,6 +113,16 @@ export class Request<UserData extends Dictionary = Dictionary> {
*/
handledAt?: string;

+ /**
+  * The number of seconds after which the request times out if it has not succeeded yet.
+  */
+ requestTimeoutSecs?: number;
+
+ /**
+  * The number of seconds after which the `requestHandler` times out if it is still running.
+  */
+ requestHandlerTimeoutSecs?: number;

/**
* `Request` parameters including the URL, HTTP method and headers, and others.
*/
@@ -150,6 +162,8 @@
keepUrlFragment = false,
useExtendedUniqueKey = false,
skipNavigation,
+ requestHandlerTimeoutSecs,
+ requestTimeoutSecs,
} = options as RequestOptions & { loadedUrl?: string; retryCount?: number; errorMessages?: string[]; handledAt?: string | Date };

let {
@@ -171,6 +185,8 @@
this.errorMessages = [...errorMessages];
this.headers = { ...headers };
this.handledAt = handledAt as unknown instanceof Date ? (handledAt as Date).toISOString() : handledAt!;
+ this.requestHandlerTimeoutSecs = requestHandlerTimeoutSecs;
+ this.requestTimeoutSecs = requestTimeoutSecs;

if (label) {
userData.label = label;
@@ -399,12 +415,23 @@ export interface RequestOptions<UserData extends Dictionary = Dictionary> {
*/
skipNavigation?: boolean;

+ /**
+  * Determines a request-specific number of seconds before the request times out.
+  * Overrides the default `navigationTimeoutSecs` provided to the crawler.
+  */
+ requestTimeoutSecs?: number;
+
+ /**
+  * Determines a request-specific number of seconds before the request handler times out.
+  * Overrides the default `requestHandlerTimeoutSecs` provided to the crawler.
+  */
+ requestHandlerTimeoutSecs?: number;

/** @internal */
id?: string;

/** @internal */
handledAt?: string;

}

export interface PushErrorMessageOptions {
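With the new predicates in `requestOptionalPredicates`, the options are validated at construction time like the other optional fields. A minimal sketch (the import path assumes the `@crawlee/core` package this diff modifies):

```ts
import { Request } from '@crawlee/core';

const request = new Request({
    url: 'https://example.com',
    requestTimeoutSecs: 30,         // ow.optional.number - a non-numeric value would throw
    requestHandlerTimeoutSecs: 120, // ow.optional.number
});

console.log(request.requestTimeoutSecs, request.requestHandlerTimeoutSecs); // 30 120
```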
12 changes: 8 additions & 4 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -452,10 +452,12 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

+ crawlingContext.request.requestHandlerTimeoutSecs ??= this.userRequestHandlerTimeoutMillis / 1e3;

return addTimeoutToPromise(
() => Promise.resolve(this.requestHandler(crawlingContext)),
- this.userRequestHandlerTimeoutMillis,
- `requestHandler timed out after ${this.userRequestHandlerTimeoutMillis / 1000} seconds.`,
+ crawlingContext.request.requestHandlerTimeoutSecs * 1e3,
+ `requestHandler timed out after ${crawlingContext.request.requestHandlerTimeoutSecs} seconds.`,
);
}

@@ -475,10 +477,12 @@

const proxyUrl = crawlingContext.proxyInfo?.url;

+ crawlingContext.request.requestTimeoutSecs ??= this.navigationTimeoutMillis / 1e3;

crawlingContext.response = await addTimeoutToPromise(
() => this._requestFunction({ request, session, proxyUrl, gotOptions }),
- this.navigationTimeoutMillis,
- `request timed out after ${this.navigationTimeoutMillis / 1000} seconds.`,
+ crawlingContext.request.requestTimeoutSecs * 1e3,
+ `request timed out after ${crawlingContext.request.requestTimeoutSecs} seconds.`,
);
tryCancel();

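All of these call sites wrap the work in `addTimeoutToPromise` from `@apify/timeout`, with the same signature the diff already uses. A standalone sketch of its behavior with a per-request timeout:

```ts
import { addTimeoutToPromise } from '@apify/timeout';

const requestTimeoutSecs = 2;

// Rejects with the given message if the wrapped promise is still
// pending after the per-request timeout (converted to milliseconds).
await addTimeoutToPromise(
    () => new Promise<void>((resolve) => setTimeout(resolve, 5_000)),
    requestTimeoutSecs * 1e3,
    `request timed out after ${requestTimeoutSecs} seconds.`,
).catch((err) => console.error(err.message)); // request timed out after 2 seconds.
```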
23 changes: 23 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
@@ -1213,4 +1213,27 @@ describe('BasicCrawler', () => {
expect(crawler).toBeTruthy();
});
});

it('Should use the requestHandlerTimeoutSecs provided within the request instead of the crawler one', async () => {
const timeStart = performance.now();

const crawler = new BasicCrawler({
requestHandlerTimeoutSecs: 99999,
maxRequestRetries: 1,
requestHandler: async () => {
await new Promise((resolve) => setTimeout(resolve, 5e4));
},
errorHandler: (_, error) => {
expect(error.message.includes('timed out after 2 seconds')).toBeTruthy();
},
});

await crawler.run([{
url: 'https://google.com',
requestHandlerTimeoutSecs: 2,
}]);

// With the 2-second per-request timeout, the whole run should finish in well under 10 seconds.
expect(performance.now() - timeStart).toBeLessThan(1e4);
});
});
48 changes: 48 additions & 0 deletions test/core/crawlers/cheerio_crawler.test.ts
@@ -1318,6 +1318,54 @@ describe('CheerioCrawler', () => {
expect(cheerioCrawler.requestHandler).toBeUndefined();
});
});

test('Should use the requestHandlerTimeoutSecs provided within the request instead of the crawler one', async () => {
const timeStart = performance.now();

const crawler = new CheerioCrawler({
requestHandlerTimeoutSecs: 99999,
maxRequestRetries: 1,
requestHandler: async () => {
await new Promise((resolve) => setTimeout(resolve, 5e4));
},
errorHandler: (_, error) => {
expect(error.message.includes('timed out after 2 seconds')).toBeTruthy();
},
});

await crawler.run([{
url: 'https://google.com',
requestHandlerTimeoutSecs: 2,
}]);

// With the 2-second per-request timeout, the whole run should finish in well under 10 seconds.
expect(performance.now() - timeStart).toBeLessThan(1e4);
});

test('Should use the requestTimeoutSecs provided within the request instead of the crawler one', async () => {
const timeStart = performance.now();

const crawler = new CheerioCrawler({
navigationTimeoutSecs: 99999,
maxRequestRetries: 1,
requestHandler: async () => {
log.info('Hey');
},
errorHandler: (_, error) => {
expect(error.message.includes('timed out after 1 seconds')).toBeTruthy();
},
});

await crawler.run([{
// https://deelay.me/
// Google with a huge delay
url: 'https://deelay.me/200000/https://google.com',
requestTimeoutSecs: 1,
}]);

// With the 1-second per-request timeout, the whole run should finish in well under 15 seconds.
expect(performance.now() - timeStart).toBeLessThan(15e3);
});
});

async function getRequestListForMock(port: number, mockData: Dictionary, pathName = 'mock') {