Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Request): add requestHandlerTimeoutSecs and requestTimeoutSecs options #1560

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected async _runTaskFunction() {
const source = this.requestQueue || this.requestList || await this.getRequestQueue();

let request: Request | null | undefined;
let request: Request & { requestHandlerTimeoutMillis?: number; requestTimeoutMillis?: number } | null | undefined;
let session: Session | undefined;

await this._timeoutAndRetry(
Expand All @@ -821,13 +821,18 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

tryCancel();

// A request can have some request-specific timeouts defined. We check if they have been defined, and if so,
// use those timeouts instead. However; if they aren't present, the defaults will be used.
const requestTimeoutMillis = request?.requestTimeoutMillis ?? this.internalTimeoutMillis;
const requestHandlerTimeoutMillis = request?.requestHandlerTimeoutMillis ?? this.requestHandlerTimeoutMillis;
Copy link
Contributor

@szmarczak szmarczak Sep 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not

            request.requestHandlerTimeoutSecs ??= this.requestHandlerTimeoutMillis / 1000;
            [...]
            await this._timeoutAndRetry(
                () => source.markRequestHandled(request!),
                request.requestHandlerTimeoutSecs * 1000,
                `Marking request ${request.url} (${request.id}) as handled timed out after ${requestTimeoutMillis / 1e3} seconds.`,
            );

This way we don't need those helpers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes true, nullish coalescing reassignment. I rarely use it, but this is definitely a good use-case


if (this.useSessionPool) {
await this._timeoutAndRetry(
async () => {
session = await this.sessionPool!.getSession();
},
this.internalTimeoutMillis,
`Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
requestTimeoutMillis,
`Fetching session timed out after ${requestTimeoutMillis / 1e3} seconds.`,
);
}

Expand Down Expand Up @@ -888,14 +893,14 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
try {
await addTimeoutToPromise(
() => this._runRequestHandler(crawlingContext),
this.requestHandlerTimeoutMillis,
`requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`,
requestHandlerTimeoutMillis,
`requestHandler timed out after ${requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`,
);

await this._timeoutAndRetry(
() => source.markRequestHandled(request!),
this.internalTimeoutMillis,
`Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
requestTimeoutMillis,
`Marking request ${request.url} (${request.id}) as handled timed out after ${requestTimeoutMillis / 1e3} seconds.`,
);

this.stats.finishJob(statisticsId);
Expand All @@ -907,8 +912,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
try {
await addTimeoutToPromise(
() => this._requestFunctionErrorHandler(err as Error, crawlingContext, source),
this.internalTimeoutMillis,
`Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
requestTimeoutMillis,
`Handling request failure of ${request.url} (${request.id}) timed out after ${requestTimeoutMillis / 1e3} seconds.`,
);
} catch (secondaryError: any) {
if (!secondaryError.triggeredFromUserHandler) {
Expand Down
41 changes: 40 additions & 1 deletion packages/core/src/request.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@ export class Request<UserData extends Dictionary = Dictionary> {
*/
handledAt?: string;

/**
* The number of seconds to timeout after if the request hasn't succeeded yet.
*/
requestTimeoutSecs?: number;

/**
* The number of seconds to timeout after if the `requestHandler` function runs for too long.
*/
requestHandlerTimeoutSecs?: number;

/**
* `Request` parameters including the URL, HTTP method and headers, and others.
*/
Expand Down Expand Up @@ -150,6 +160,8 @@ export class Request<UserData extends Dictionary = Dictionary> {
keepUrlFragment = false,
useExtendedUniqueKey = false,
skipNavigation,
requestHandlerTimeoutSecs,
requestTimeoutSecs,
} = options as RequestOptions & { loadedUrl?: string; retryCount?: number; errorMessages?: string[]; handledAt?: string | Date };

let {
Expand All @@ -172,6 +184,22 @@ export class Request<UserData extends Dictionary = Dictionary> {
this.headers = { ...headers };
this.handledAt = handledAt as unknown instanceof Date ? (handledAt as Date).toISOString() : handledAt!;

this.requestHandlerTimeoutSecs = requestHandlerTimeoutSecs;
this.requestTimeoutSecs = requestTimeoutSecs;

Object.defineProperty(
this,
'requestHandlerTimeoutMillis',
// If zero was provided, this will still return null.
{ enumerable: false, value: requestHandlerTimeoutSecs ? requestHandlerTimeoutSecs * 1e3 : null },
);

Object.defineProperty(
this,
'requestTimeoutMillis',
{ enumerable: false, value: requestTimeoutSecs ? requestTimeoutSecs * 1e3 : null },
);
szmarczak marked this conversation as resolved.
Show resolved Hide resolved

if (label) {
userData.label = label;
}
Expand Down Expand Up @@ -399,12 +427,23 @@ export interface RequestOptions<UserData extends Dictionary = Dictionary> {
*/
skipNavigation?: boolean;

/**
* Determines a request-specific number of seconds before the request times out.
* Overrides the default `navigationTimeoutSecs` provided to the crawler.
*/
requestTimeoutSecs?: number;

/**
* Determines a request-specific number of seconds before the request handler times out.
* Overrides the default `requestHandlerTimeoutSecs` provided to the crawler.
*/
requestHandlerTimeoutSecs?: number;

/** @internal */
id?: string;

/** @internal */
handledAt?: string;

}

export interface PushErrorMessageOptions {
Expand Down