Skip to content

Commit

Permalink
feat: Sitemap-based request list implementation (#2498)
Browse files Browse the repository at this point in the history
This introduces an alternative RequestList implementation based on
sitemaps. It should be possible to use this in tandem with
RequestProvider in BasicCrawler, just like with the current RequestList.

---------

Co-authored-by: Jindřich Bär <jindrichbar@gmail.com>
  • Loading branch information
janbuchar and barjin committed Jul 4, 2024
1 parent bf01cbd commit 7bf8f0b
Show file tree
Hide file tree
Showing 10 changed files with 1,218 additions and 196 deletions.
10 changes: 5 additions & 5 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ import type {
DatasetExportOptions,
FinalStatistics,
GetUserDataFromRequest,
IRequestList,
ProxyInfo,
Request,
RequestList,
RequestOptions,
RequestProvider,
RouterHandler,
Expand Down Expand Up @@ -171,7 +171,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
* > Alternatively, the `requests` parameter of {@apilink BasicCrawler.run|`crawler.run()`} can be used to enqueue the initial requests -
* it is a shortcut for calling `crawler.addRequests()` before `crawler.run()`.
*/
requestList?: RequestList;
requestList?: IRequestList;

/**
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
Expand Down Expand Up @@ -445,7 +445,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
* A reference to the underlying {@apilink RequestList} class that manages the crawler's {@apilink Request|requests}.
* Only available if used by the crawler.
*/
requestList?: RequestList;
requestList?: IRequestList;

/**
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
Expand Down Expand Up @@ -1170,7 +1170,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
* adding it back to the queue after the timeout passes. Returns `true` if the request
* should be ignored and will be reclaimed to the queue once ready.
*/
protected delayRequest(request: Request, source: RequestList | RequestProvider) {
protected delayRequest(request: Request, source: IRequestList | RequestProvider) {
const domain = getDomain(request.url);

if (!domain || !request) {
Expand Down Expand Up @@ -1415,7 +1415,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected async _requestFunctionErrorHandler(
error: Error,
crawlingContext: Context,
source: RequestList | RequestProvider,
source: IRequestList | RequestProvider,
): Promise<void> {
const { request } = crawlingContext;
request.pushErrorMessage(error);
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/storages/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ export { RequestQueue as RequestQueueV2 } from './request_queue_v2';
export * from './storage_manager';
export * from './utils';
export * from './access_checking';
export * from './sitemap_request_list';
113 changes: 93 additions & 20 deletions packages/core/src/storages/request_list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,80 @@ export const REQUESTS_PERSISTENCE_KEY = 'REQUEST_LIST_REQUESTS';

const CONTENT_TYPE_BINARY = 'application/octet-stream';

/**
* Represents a static list of URLs to crawl.
*
* This is the contract implemented by {@apilink RequestList}; any other implementation
* of this interface can be passed wherever an `IRequestList` is accepted.
*/
export interface IRequestList {
/**
* Returns the total number of unique requests present in the list.
*/
length(): number;

/**
* Resolves to `true` if all requests were already handled and there are no more left.
*/
isFinished(): Promise<boolean>;

/**
* Resolves to `true` if the next call to {@apilink IRequestList.fetchNextRequest} function
* would return `null`, otherwise it resolves to `false`.
* Note that even if the list is empty, there might be some pending requests currently being processed.
*/
isEmpty(): Promise<boolean>;

/**
* Returns the number of handled requests.
*/
handledCount(): number;

/**
* Persists the current state of the `IRequestList` into the default {@apilink KeyValueStore}.
* The state is persisted automatically in regular intervals, but calling this method manually
* is useful in cases where you want to have the most current state available after you pause
* or stop fetching its requests - for example, after you pause or abort a crawl, or just before
* a server migration.
*/
persistState(): Promise<void>;

/**
* Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
* using the {@apilink IRequestList.reclaimRequest} function, if there is any.
* Otherwise it gets the next request from sources.
*
* The function's `Promise` resolves to `null` if there are no more
* requests to process.
*/
fetchNextRequest(): Promise<Request | null>;

/**
* Iterates over the {@apilink Request|requests} to process. Requests previously reclaimed
* using the {@apilink IRequestList.reclaimRequest} function, if there are any, come first;
* otherwise the next request from sources is yielded.
*
* The iteration ends when there are no more requests to process - unlike
* {@apilink IRequestList.fetchNextRequest}, no `null` value is ever yielded.
*
* Can be used to iterate over the `IRequestList` instance in a `for await .. of` loop.
* Provides an alternative for the repeated use of `fetchNextRequest`.
*/
[Symbol.asyncIterator](): AsyncGenerator<Request>;

/**
* Reclaims request to the list if its processing failed.
* The request will become available in the next `this.fetchNextRequest()`.
*/
reclaimRequest(request: Request): Promise<void>;

/**
* Marks request as handled after successful processing.
*/
markRequestHandled(request: Request): Promise<void>;

/**
* NOTE(review): presumably holds the unique keys of requests that were fetched
* but not yet handled or reclaimed - confirm against the implementations.
*
* @internal
*/
inProgress: Set<string>;
}

export interface RequestListOptions {
/**
* An array of sources of URLs for the {@apilink RequestList}. It can be either an array of strings,
Expand Down Expand Up @@ -229,7 +303,7 @@ export interface RequestListOptions {
* ```
* @category Sources
*/
export class RequestList {
export class RequestList implements IRequestList {
private log = log.child({ prefix: 'RequestList' });

/**
Expand Down Expand Up @@ -431,11 +505,7 @@ export class RequestList {
}

/**
* Persists the current state of the `RequestList` into the default {@apilink KeyValueStore}.
* The state is persisted automatically in regular intervals, but calling this method manually
* is useful in cases where you want to have the most current state available after you pause
* or stop fetching its requests. For example after you pause or abort a crawl. Or just before
* a server migration.
* @inheritDoc
*/
async persistState(): Promise<void> {
if (!this.persistStateKey) {
Expand Down Expand Up @@ -570,9 +640,7 @@ export class RequestList {
}

/**
* Resolves to `true` if the next call to {@apilink RequestList.fetchNextRequest} function
* would return `null`, otherwise it resolves to `false`.
* Note that even if the list is empty, there might be some pending requests currently being processed.
* @inheritDoc
*/
async isEmpty(): Promise<boolean> {
this._ensureIsInitialized();
Expand All @@ -581,7 +649,7 @@ export class RequestList {
}

/**
* Returns `true` if all requests were already handled and there are no more left.
* @inheritDoc
*/
async isFinished(): Promise<boolean> {
this._ensureIsInitialized();
Expand All @@ -590,12 +658,7 @@ export class RequestList {
}

/**
* Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
* using the {@apilink RequestList.reclaimRequest} function, if there is any.
* Otherwise it gets the next request from sources.
*
* The function's `Promise` resolves to `null` if there are no more
* requests to process.
* @inheritDoc
*/
async fetchNextRequest(): Promise<Request | null> {
this._ensureIsInitialized();
Expand All @@ -621,6 +684,17 @@ export class RequestList {
return null;
}

/**
 * @inheritDoc
 */
async *[Symbol.asyncIterator]() {
    // Prime the loop with the first request; a falsy result means the list is exhausted.
    let next = await this.fetchNextRequest();
    while (next) {
        yield next;
        // Only ask for another request once the consumer resumes the generator.
        next = await this.fetchNextRequest();
    }
}

private ensureRequest(requestLike: Request | RequestOptions, index: number): Request {
if (requestLike instanceof Request) {
return requestLike;
Expand All @@ -631,7 +705,7 @@ export class RequestList {
}

/**
* Marks request as handled after successful processing.
* @inheritDoc
*/
async markRequestHandled(request: Request): Promise<void> {
const { uniqueKey } = request;
Expand All @@ -645,8 +719,7 @@ export class RequestList {
}

/**
* Reclaims request to the list if its processing failed.
* The request will become available in the next `this.fetchNextRequest()`.
* @inheritDoc
*/
async reclaimRequest(request: Request): Promise<void> {
const { uniqueKey } = request;
Expand Down Expand Up @@ -798,7 +871,7 @@ export class RequestList {
}

/**
* Returns number of handled requests.
* @inheritDoc
*/
handledCount(): number {
this._ensureIsInitialized();
Expand Down
Loading

0 comments on commit 7bf8f0b

Please sign in to comment.