Skip to content

Commit

Permalink
chore: change implementation to call the crawler.setStatusMessage from inside the callback
Browse files Browse the repository at this point in the history
  • Loading branch information
B4nan committed Jul 20, 2023
1 parent 3bcf74b commit 56e61a0
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 20 deletions.
5 changes: 5 additions & 0 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@
"no-console": "error",
"no-underscore-dangle": 0,
"no-void": 0,
"max-len": ["error", {
"code": 160,
"ignoreUrls": true,
"ignoreComments": true
}],
"import/order": ["error", {
"groups": ["builtin", "external", ["parent", "sibling"], "index", "object"],
"alphabetize": { "order": "asc", "caseInsensitive": true },
Expand Down
41 changes: 26 additions & 15 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,16 @@ export interface StatusMessageCallbackParams<
Context extends CrawlingContext = BasicCrawlingContext,
Crawler extends BasicCrawler<any> = BasicCrawler<Context>,
> {
crawler: Crawler;
state: StatisticState;
crawler: Crawler;
previousState: StatisticState;
message: string;
}

export type StatusMessageCallback<
Context extends CrawlingContext = BasicCrawlingContext,
Crawler extends BasicCrawler<any> = BasicCrawler<Context>,
> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<string | undefined | null>;
> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;

export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCrawlingContext> {
/**
Expand Down Expand Up @@ -277,15 +278,23 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
statusMessageLoggingInterval?: number;

/**
* Allows overriding the default status message. When the callback returns `null` or `undefined`, the default message will be used as a fallback.
* Allows overriding the default status message. The callback needs to call `crawler.setStatusMessage()` explicitly.
* The default status message is provided to the callback as the `message` property of its parameters.
*
* ```ts
* const crawler = new CheerioCrawler({
* statusMessageCallback: async (ctx) => {
* return ctx.crawler.setStatusMessage(`this is status message from ${new Date().toISOString()}`, { level: 'INFO' }); // log level defaults to 'DEBUG'
* },
* statusMessageLoggingInterval: 1, // defaults to 10s
* async requestHandler({ $, enqueueLinks, request, log }) {
* // ...
* },
* });
* ```
*/
statusMessageCallback?: StatusMessageCallback;

/**
* Allows overriding the default status message. When the callback returns `null` or `undefined`, the default message will be used as a fallback.
*/
statusMessageLogLevel?: LogLevel.DEBUG | LogLevel.INFO | LogLevel.WARNING | LogLevel.ERROR;

/**
* If set to `true`, the crawler will automatically try to bypass any detected bot protection.
*
Expand Down Expand Up @@ -417,7 +426,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected handledRequestsCount: number;
protected statusMessageLoggingInterval: number;
protected statusMessageCallback?: StatusMessageCallback;
protected statusMessageLogLevel?: 'DEBUG' | 'INFO' | 'WARNING' | 'ERROR';
protected sessionPoolOptions: SessionPoolOptions;
protected useSessionPool: boolean;
protected crawlingContexts = new Map<string, Context>();
Expand Down Expand Up @@ -450,7 +458,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

statusMessageLoggingInterval: ow.optional.number,
statusMessageCallback: ow.optional.function,
statusMessageLogLevel: ow.optional.string.oneOf(['DEBUG', 'INFO', 'WARNING', 'ERROR']),

retryOnBlocked: ow.optional.boolean,

Expand Down Expand Up @@ -504,15 +511,13 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

statusMessageLoggingInterval = 10,
statusMessageCallback,
statusMessageLogLevel = 'DEBUG',
} = options;

this.requestList = requestList;
this.requestQueue = requestQueue;
this.log = log;
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
this.statusMessageLogLevel = statusMessageLogLevel;
this.events = config.getEventManager();

this._handlePropertyNameChange({
Expand Down Expand Up @@ -654,7 +659,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
}

private setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
/**
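* Logs the given status message at the requested level (`DEBUG` when no level is given).
* When no `statusMessageCallback` is provided, the crawler calls this method periodically, every `statusMessageLoggingInterval` seconds.
* When a `statusMessageCallback` is provided, the crawler calls the callback instead, and the callback is expected to call this method itself.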
*/
setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
this.log.internal(LogLevel[options.level as 'DEBUG' ?? 'DEBUG'], message, data);

Expand Down Expand Up @@ -690,8 +698,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests.`;
}

message = await this.statusMessageCallback?.({ crawler: this as any, state: this.stats.state, previousState }) ?? message;
await this.setStatusMessage(message, { level: this.statusMessageLogLevel });
if (this.statusMessageCallback) {
return this.statusMessageCallback({ crawler: this as any, state: this.stats.state, previousState, message });
}

await this.setStatusMessage(message);
};

const interval = setInterval(log, this.statusMessageLoggingInterval * 1e3);
Expand Down
4 changes: 1 addition & 3 deletions packages/types/src/storages.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import type { LogLevel } from '@apify/log';

import type { AllowedHttpMethods, Dictionary } from './utility-types';

/**
Expand Down Expand Up @@ -309,7 +307,7 @@ export interface RequestQueueOptions {

export interface SetStatusMessageOptions {
isStatusMessageTerminal?: boolean;
level?: LogLevel.DEBUG | LogLevel.INFO | LogLevel.WARNING | LogLevel.ERROR | 'DEBUG' | 'INFO' | 'WARNING' | 'ERROR';
level?: 'DEBUG' | 'INFO' | 'WARNING' | 'ERROR';
}

/**
Expand Down
5 changes: 3 additions & 2 deletions test/e2e/cheerio-default/actor/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ const mainOptions = {

await Actor.main(async () => {
const crawler = new CheerioCrawler({
statusMessageCallback: async () => `this is status message from ${new Date().toISOString()}`,
statusMessageLogLevel: 'INFO',
statusMessageCallback: async (ctx) => {
return ctx.crawler.setStatusMessage(`this is status message from ${new Date().toISOString()}`, { level: 'INFO' });
},
statusMessageLoggingInterval: 1,
async requestHandler({ $, enqueueLinks, request, log }) {
const { url } = request;
Expand Down

0 comments on commit 56e61a0

Please sign in to comment.