Skip to content

Commit

Permalink
Implement a basic SitemapRequestList
Browse files Browse the repository at this point in the history
  • Loading branch information
janbuchar committed May 24, 2024
1 parent 265df3f commit d616a9e
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 0 deletions.
1 change: 1 addition & 0 deletions packages/core/src/storages/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ export { RequestQueue as RequestQueueV2 } from './request_queue_v2';
export * from './storage_manager';
export * from './utils';
export * from './access_checking';
export * from './sitemap_request_list';
90 changes: 90 additions & 0 deletions packages/core/src/storages/sitemap_request_list.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import { Sitemap } from '@crawlee/utils';

Check failure on line 1 in packages/core/src/storages/sitemap_request_list.ts

View workflow job for this annotation

GitHub Actions / Lint

There should be at least one empty line between import groups
import { Request } from '../request';
import { IRequestList } from './request_list';

Check failure on line 3 in packages/core/src/storages/sitemap_request_list.ts

View workflow job for this annotation

GitHub Actions / Lint

All imports in the declaration are only used as types. Use `import type`

Check failure on line 3 in packages/core/src/storages/sitemap_request_list.ts

View workflow job for this annotation

GitHub Actions / Lint

`./request_list` import should occur before import of `../request`

/**
 * A request list backed by the URLs of one or more sitemaps.
 *
 * All URLs are loaded up front by {@link SitemapRequestList.open} and kept in memory.
 * Requests are handed out in insertion order; a request that was reclaimed is handed
 * out again before any request that has not been fetched yet.
 */
export class SitemapRequestList implements IRequestList {
    /**
     * Unique keys of requests that were fetched and not yet marked as handled.
     * A reclaimed request stays in this set until it is eventually marked as handled.
     */
    inProgress = new Set<string>();

    /** Unique keys of requests that were reclaimed and are waiting to be fetched again. */
    reclaimed = new Set<string>();

    /** All requests of the list, in insertion order, with at most one request per unique key. */
    requests: Request[] = [];

    /** Maps the unique key of a request to its position in {@link SitemapRequestList.requests}. */
    uniqueKeyToIndex = new Map<string, number>();

    /** Index into `requests` of the next request that has never been fetched. */
    private nextIndex = 0;

    /**
     * Builds the list from an already loaded sitemap. Use {@link SitemapRequestList.open} instead.
     *
     * @param sitemap Loaded sitemap whose `urls` become the requests of this list.
     */
    private constructor(sitemap: Sitemap) {
        for (const url of sitemap.urls) {
            this.addRequest(url);
        }
    }

    /**
     * Loads the given sitemap(s) and creates a request list from the URLs they contain.
     *
     * @param options.sitemapUrls URL, or list of URLs, of the sitemaps to load.
     * @returns A list populated with one request per distinct unique key.
     */
    static async open({ sitemapUrls }: { sitemapUrls: string | string[] }): Promise<SitemapRequestList> {
        return new SitemapRequestList(await Sitemap.load(sitemapUrls));
    }

    /** Returns the total number of requests in the list, regardless of their state. */
    length(): number {
        return this.requests.length;
    }

    /** Resolves to `true` when every request was fetched and none of them is still in progress. */
    async isFinished(): Promise<boolean> {
        return this.inProgress.size === 0 && this.nextIndex >= this.requests.length;
    }

    /** Resolves to `true` when there is nothing left to fetch - neither reclaimed nor fresh requests. */
    async isEmpty(): Promise<boolean> {
        return this.reclaimed.size === 0 && this.nextIndex >= this.requests.length;
    }

    /** Returns the number of requests that were fetched and are no longer in progress. */
    handledCount(): number {
        return this.nextIndex - this.inProgress.size;
    }

    /**
     * Persistence is not supported yet.
     *
     * @throws Error Always.
     */
    async persistState(): Promise<void> {
        throw new Error('SitemapRequestList persistence is not yet implemented.');
    }

    /**
     * Returns the next request to process, or `null` when there is currently nothing to fetch.
     *
     * @throws Error If a reclaimed unique key does not belong to any request of this list.
     */
    async fetchNextRequest(): Promise<Request | null> {
        // Try to return a reclaimed request first. The key stays in `inProgress`,
        // so only the `reclaimed` marker needs to be cleared.
        const nextReclaimed = this.reclaimed.values().next();
        if (!nextReclaimed.done) {
            const uniqueKey = nextReclaimed.value;
            this.reclaimed.delete(uniqueKey);

            const index = this.uniqueKeyToIndex.get(uniqueKey);
            const reclaimedRequest = index === undefined ? undefined : this.requests[index];
            if (reclaimedRequest === undefined) {
                throw new Error(`The reclaimed request is not part of this list (uniqueKey: ${uniqueKey})`);
            }

            return reclaimedRequest;
        }

        // Otherwise return the next request that has not been fetched yet.
        // The bound is strict: `requests[requests.length]` does not exist.
        if (this.nextIndex < this.requests.length) {
            const request = this.requests[this.nextIndex];
            this.nextIndex += 1;

            this.inProgress.add(request.uniqueKey);

            return request;
        }

        return null;
    }

    /**
     * Returns a request that is being processed back to the list so that it is fetched again.
     *
     * @param request A request previously returned by `fetchNextRequest`.
     * @throws Error If the request is not in progress or was already reclaimed.
     */
    async reclaimRequest(request: Request): Promise<void> {
        this.ensureInProgressAndNotReclaimed(request.uniqueKey);
        this.reclaimed.add(request.uniqueKey);
    }

    /**
     * Marks a request that is being processed as handled.
     *
     * @param request A request previously returned by `fetchNextRequest`.
     * @throws Error If the request is not in progress or is waiting to be fetched again after a reclaim.
     */
    async markRequestHandled(request: Request): Promise<void> {
        this.ensureInProgressAndNotReclaimed(request.uniqueKey);
        this.inProgress.delete(request.uniqueKey);
    }

    /**
     * Verifies that the request with the given unique key is currently held by a consumer.
     *
     * @throws Error If the key is not in progress, or if it is in the reclaimed set.
     */
    private ensureInProgressAndNotReclaimed(uniqueKey: string): void {
        if (!this.inProgress.has(uniqueKey)) {
            throw new Error(`The request is not being processed (uniqueKey: ${uniqueKey})`);
        }
        if (this.reclaimed.has(uniqueKey)) {
            throw new Error(`The request was already reclaimed (uniqueKey: ${uniqueKey})`);
        }
    }

    /**
     * Appends a request for the given URL and records its position for lookups by unique key.
     *
     * A URL whose unique key is already present is skipped: `inProgress` and `reclaimed`
     * track requests by unique key only, so two requests sharing a key could not be told apart.
     */
    private addRequest(url: string): void {
        const request = new Request({ url });
        if (this.uniqueKeyToIndex.has(request.uniqueKey)) {
            return;
        }

        this.uniqueKeyToIndex.set(request.uniqueKey, this.requests.length);
        this.requests.push(request);
    }
}

0 comments on commit d616a9e

Please sign in to comment.