feat: adaptive playwright crawler (#2316)
This uses the newly added restricted crawling contexts to execute request handlers. It allows us to compare browser-based and HTTP-only request handler runs for the same request and to switch to HTTP-only crawling on sites that we predict to be static. More information will be added here.

The intended usage is as follows:

```ts
import { AdaptivePlaywrightCrawler } from 'crawlee';

const startUrls = [{url: 'https://warehouse-theme-metal.myshopify.com/collections', label: 'START'}];

const crawler = new AdaptivePlaywrightCrawler({
    requestHandler: async ({ request, enqueueLinks, pushData, querySelector }) => {
        console.log(`Processing: ${request.url} (${request.label})`);

        if (request.label === 'DETAIL') {
            const urlPart = request.url.split('/').slice(-1); // ['sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440']
            const manufacturer = urlPart[0].split('-')[0]; // 'sennheiser'

            const title = (await querySelector('.product-meta h1')).text();
            const sku = (await querySelector('span.product-meta__sku-number')).text();

            const $prices = await querySelector('span.price');
            const currentPriceString = $prices.filter(':contains("$")').first().text();

            const rawPrice = currentPriceString.split('$')[1];
            const price = Number(rawPrice.replaceAll(',', ''));

            const inStockElements = await querySelector('span.product-form__inventory');
            const inStock = inStockElements.filter(':contains("In stock")').length > 0;

            const results = {
                url: request.url,
                manufacturer,
                title,
                sku,
                currentPrice: price,
                availableInStock: inStock,
            };

            await pushData(results);
        } else if (request.label === 'CATEGORY') {
            await enqueueLinks({
                selector: '.product-item > a',
                label: 'DETAIL', // <= note the different label
            });

            await enqueueLinks({
                selector: 'a.pagination__next',
                label: 'CATEGORY', // <= note the same label
            });
        } else if (request.label === 'START') {
            await enqueueLinks({
                selector: '.collection-block-item',
                label: 'CATEGORY',
            });
        }
    },
    renderingTypeDetectionRatio: 0.1,
    maxRequestsPerCrawl: 100,
    maxRequestRetries: 0,
    minConcurrency: 1,
    maxConcurrency: 1,
    headless: true,
});

await crawler.run(startUrls);

```

When handling a request from the queue, the crawler

1. tries to predict the rendering type (static vs. client-only) based on the URL, the label and potentially other criteria, using a logistic regression model that is updated on the fly,
2. for static pages, performs an HTTP-only scrape, and the request handler works with a Cheerio-based portadom instance,
3. for client-only pages, performs a Playwright scrape, and the request handler receives a portadom instance backed by Playwright locators (so it implicitly waits for content to appear),
4. for a configurable percentage of requests (and also whenever the prediction is not confident enough), performs a rendering type detection: both an HTTP-only and a Playwright scrape are run and the results are compared. If (and only if) the HTTP-only scrape behaves the same, we conclude that the page is static and update the logistic regression model. A simplified sketch of this flow follows below.
janbuchar committed Feb 21, 2024
1 parent 28aba08 commit 8e4218a
Showing 17 changed files with 1,025 additions and 25 deletions.
1 change: 1 addition & 0 deletions package.json
@@ -66,6 +66,7 @@
"@types/htmlparser2": "^3.10.3",
"@types/inquirer": "^8.2.1",
"@types/is-ci": "^3.0.1",
"@types/lodash.isequal": "^4.5.8",
"@types/lodash.merge": "^4.6.7",
"@types/mime-types": "^2.1.1",
"@types/node": "^20.0.0",
2 changes: 1 addition & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
@@ -427,7 +427,7 @@ export interface CrawlerExperiments {
* @category Crawlers
*/
export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
private static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';

/**
* A reference to the underlying {@apilink Statistics} class that collects and logs run statistics for requests.
3 changes: 2 additions & 1 deletion packages/browser-crawler/package.json
@@ -59,6 +59,7 @@
"@crawlee/types": "3.7.3",
"@crawlee/utils": "3.7.3",
"ow": "^0.28.1",
"tslib": "^2.4.0"
"tslib": "^2.4.0",
"type-fest": "^4.0.0"
}
}
5 changes: 3 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -40,6 +40,7 @@ import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import type { Cookie as CookieObject } from '@crawlee/types';
import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import ow from 'ow';
import type { ReadonlyDeep } from 'type-fest';

import type { BrowserLaunchContext } from './browser-launcher';

@@ -749,7 +750,7 @@ export abstract class BrowserCrawler<

/** @internal */
interface EnqueueLinksInternalOptions {
options?: EnqueueLinksOptions;
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
page: CommonPage;
requestQueue: RequestProvider;
originalRequestUrl: string;
@@ -786,7 +787,7 @@ export async function browserCrawlerEnqueueLinks({
* @ignore
*/
// eslint-disable-next-line @typescript-eslint/ban-types
async function extractUrlsFromPage(page: { $$eval: Function }, selector: string, baseUrl: string): Promise<string[]> {
export async function extractUrlsFromPage(page: { $$eval: Function }, selector: string, baseUrl: string): Promise<string[]> {
const urls = await page.$$eval(selector, (linkEls: HTMLLinkElement[]) => linkEls.map((link) => link.getAttribute('href')).filter((href) => !!href)) ?? [];
const [base] = await page.$$eval('base', (els: HTMLLinkElement[]) => els.map((el) => el.getAttribute('href')));
const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl);
152 changes: 144 additions & 8 deletions packages/core/src/crawlers/crawler_commons.ts
@@ -1,16 +1,18 @@
import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Response as GotResponse, OptionsInit } from 'got-scraping';
import type { ReadonlyDeep } from 'type-fest';

import type { Configuration } from '../configuration';
import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links';
import type { Log } from '../log';
import type { ProxyInfo } from '../proxy_configuration';
import type { Request, Source } from '../request';
import type { Session } from '../session_pool/session';
import type { RequestQueueOperationOptions, Dataset, KeyValueStore } from '../storages';
import type { RequestQueueOperationOptions, Dataset, RecordOptions } from '../storages';
import { KeyValueStore } from '../storages';

// eslint-disable-next-line @typescript-eslint/ban-types
export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary> extends Record<string & {}, unknown>{
export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary> extends Record<string, unknown>{
/**
* The original {@apilink Request} object.
*/
@@ -23,7 +25,7 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
*
* @param [data] Data to be pushed to the default dataset.
*/
pushData(data: Parameters<Dataset['pushData']>[0], datasetIdOrName?: string): Promise<void>;
pushData(data: ReadonlyDeep<Parameters<Dataset['pushData']>[0]>, datasetIdOrName?: string): Promise<void>;

/**
* This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue}
@@ -49,7 +51,7 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
*
* @param [options] All `enqueueLinks()` parameters are passed via an options object.
*/
enqueueLinks: (options?: Omit<EnqueueLinksOptions, 'requestQueue'>) => Promise<unknown>;
enqueueLinks: (options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>>) => Promise<unknown>;

/**
* Add requests directly to the request queue.
@@ -58,8 +60,8 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction
* @param options Options for the request queue
*/
addRequests: (
requestsLike: (string | Source)[],
options?: RequestQueueOperationOptions,
requestsLike: ReadonlyDeep<(string | Source)[]>,
options?: ReadonlyDeep<RequestQueueOperationOptions>,
) => Promise<void>;

/**
@@ -115,7 +117,9 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
* @param [options] All `enqueueLinks()` parameters are passed via an options object.
* @returns Promise that resolves to {@apilink BatchAddRequestsResult} object.
*/
enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
enqueueLinks(
options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>
): Promise<BatchAddRequestsResult>;

/**
* Get a key-value store with given name or id, or the default one for the crawler.
@@ -141,3 +145,135 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
*/
sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
}

/**
* A partial implementation of {@apilink RestrictedCrawlingContext} that stores parameters of calls to context methods for later inspection.
*
* @experimental
*/
export class RequestHandlerResult {
private _keyValueStoreChanges: Record<string, Record<string, { changedValue: unknown; options?: RecordOptions }>> = {};
private pushDataCalls: Parameters<RestrictedCrawlingContext['pushData']>[] = [];
private addRequestsCalls: Parameters<RestrictedCrawlingContext['addRequests']>[] = [];
private enqueueLinksCalls: Parameters<RestrictedCrawlingContext['enqueueLinks']>[] = [];

constructor(private config: Configuration, private crawleeStateKey: string) {}

/**
* A record of calls to {@apilink RestrictedCrawlingContext.pushData}, {@apilink RestrictedCrawlingContext.addRequests}, {@apilink RestrictedCrawlingContext.enqueueLinks} made by a request handler.
*/
get calls(): ReadonlyDeep<{
pushData: Parameters<RestrictedCrawlingContext['pushData']>[];
addRequests: Parameters<RestrictedCrawlingContext['addRequests']>[];
enqueueLinks: Parameters<RestrictedCrawlingContext['enqueueLinks']>[];
}> {
return { pushData: this.pushDataCalls, addRequests: this.addRequestsCalls, enqueueLinks: this.enqueueLinksCalls };
}

/**
* A record of changes made to key-value stores by a request handler.
*/
get keyValueStoreChanges(): ReadonlyDeep<Record<string, Record<string, { changedValue: unknown; options?: RecordOptions }>>> {
return this._keyValueStoreChanges;
}

/**
* Items added to datasets by a request handler.
*/
get datasetItems(): ReadonlyDeep<{ item: Dictionary; datasetIdOrName?: string }[]> {
return this.pushDataCalls.flatMap(([data, datasetIdOrName]) => (Array.isArray(data) ? data : [data]).map((item) => ({ item, datasetIdOrName })));
}

/**
* URLs enqueued to the request queue by a request handler, either via {@apilink RestrictedCrawlingContext.addRequests} or {@apilink RestrictedCrawlingContext.enqueueLinks}
*/
get enqueuedUrls(): ReadonlyDeep<{ url: string; label?: string }[]> {
const result: {url: string; label? : string}[] = [];

for (const [options] of this.enqueueLinksCalls) {
result.push(...(options?.urls?.map((url) => ({ url, label: options?.label })) ?? []));
}

for (const [requests] of this.addRequestsCalls) {
for (const request of requests) {
if (typeof request === 'object' && (!('requestsFromUrl' in request) || request.requestsFromUrl !== undefined) && request.url !== undefined) {
result.push({ url: request.url, label: request.label });
} else if (typeof request === 'string') {
result.push({ url: request });
}
}
}

return result;
}

/**
* URL lists enqueued to the request queue by a request handler via {@apilink RestrictedCrawlingContext.addRequests} using the `requestsFromUrl` option.
*/
get enqueuedUrlLists(): ReadonlyDeep<{ listUrl: string; label? : string }[]> {
const result: {listUrl: string; label? : string}[] = [];

for (const [requests] of this.addRequestsCalls) {
for (const request of requests) {
if (typeof request === 'object' && 'requestsFromUrl' in request && request.requestsFromUrl !== undefined) {
result.push({ listUrl: request.requestsFromUrl, label: request.label });
}
}
}

return result;
}

pushData: RestrictedCrawlingContext['pushData'] = async (data, datasetIdOrName) => {
this.pushDataCalls.push([data, datasetIdOrName]);
};

enqueueLinks: RestrictedCrawlingContext['enqueueLinks'] = async (options) => {
this.enqueueLinksCalls.push([options]);
};

addRequests: RestrictedCrawlingContext['addRequests'] = async (requests, options = {}) => {
this.addRequestsCalls.push([requests, options]);
};

useState: RestrictedCrawlingContext['useState'] = async (defaultValue) => {
const store = await this.getKeyValueStore(undefined);
return await store.getAutoSavedValue(this.crawleeStateKey, defaultValue);
};

getKeyValueStore: RestrictedCrawlingContext['getKeyValueStore'] = async (idOrName) => {
const store = await KeyValueStore.open(idOrName, { config: this.config });

return {
id: this.idOrDefault(idOrName),
name: idOrName,
getValue: async (key) => this.getKeyValueStoreChangedValue(idOrName, key) ?? await store.getValue(key),
getAutoSavedValue: async <T extends Dictionary = Dictionary>(key: string, defaultValue: T = {} as T) => {
let value = this.getKeyValueStoreChangedValue(idOrName, key);
if (value === null) {
value = await store.getValue(key) ?? defaultValue;
this.setKeyValueStoreChangedValue(idOrName, key, value);
}

return value as T;
},
setValue: async (key, value, options) => {
this.setKeyValueStoreChangedValue(idOrName, key, value, options);
},
};
};

private idOrDefault = (idOrName?: string): string => idOrName ?? this.config.get('defaultKeyValueStoreId');

private getKeyValueStoreChangedValue = (idOrName: string | undefined, key: string) => {
const id = this.idOrDefault(idOrName);
this._keyValueStoreChanges[id] ??= {};
return this.keyValueStoreChanges[id][key]?.changedValue ?? null;
};

private setKeyValueStoreChangedValue = (idOrName: string | undefined, key: string, changedValue: unknown, options?: RecordOptions) => {
const id = this.idOrDefault(idOrName);
this._keyValueStoreChanges[id] ??= {};
this._keyValueStoreChanges[id][key] = { changedValue, options };
};
}
4 changes: 2 additions & 2 deletions packages/core/src/crawlers/statistics.ts
@@ -93,8 +93,8 @@ export class Statistics {
*/
private readonly config: Configuration;

private keyValueStore?: KeyValueStore = undefined;
private persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`;
protected keyValueStore?: KeyValueStore = undefined;
protected persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`;
private logIntervalMillis: number;
private logMessage: string;
private listener: () => Promise<void>;
10 changes: 5 additions & 5 deletions packages/core/src/enqueue_links/enqueue_links.ts
@@ -21,7 +21,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
limit?: number;

/** An array of URLs to enqueue. */
urls?: string[];
urls?: Readonly<string[]>;

/** A request queue to which the URLs will be enqueued. */
requestQueue?: RequestProvider;
@@ -60,7 +60,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
* If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the function
* enqueues the links with the same subdomain.
*/
globs?: GlobInput[];
globs?: Readonly<GlobInput[]>;

/**
* An array of glob pattern strings, regexp patterns or plain objects
@@ -72,7 +72,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
* Glob matching is always case-insensitive.
* If you need case-sensitive matching, provide a regexp.
*/
exclude?: (GlobInput | RegExpInput)[];
exclude?: Readonly<(GlobInput | RegExpInput)[]>;

/**
* An array of regular expressions or plain objects
@@ -84,7 +84,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
* If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the function
* enqueues the links with the same subdomain.
*/
regexps?: RegExpInput[];
regexps?: Readonly<RegExpInput[]>;

/**
* *NOTE:* In future versions of SDK the options will be removed.
@@ -104,7 +104,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
*
* @deprecated prefer using `globs` or `regexps` instead
*/
pseudoUrls?: PseudoUrlInput[];
pseudoUrls?: Readonly<PseudoUrlInput[]>;

/**
* Just before a new {@apilink Request} is constructed and enqueued to the {@apilink RequestQueue}, this function can be used
6 changes: 3 additions & 3 deletions packages/core/src/enqueue_links/shared.ts
@@ -52,7 +52,7 @@ export function updateEnqueueLinksPatternCache(item: GlobInput | RegExpInput | P
* to construct RegExps from PseudoUrl strings.
* @ignore
*/
export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: PseudoUrlInput[]): RegExpObject[] {
export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: Readonly<PseudoUrlInput[]>): RegExpObject[] {
return pseudoUrls.map((item) => {
// Get pseudoUrl object from cache.
let regexpObject = enqueueLinksPatternCache.get(item);
@@ -76,7 +76,7 @@ export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: PseudoUrlInput[
* to construct Glob objects from Glob pattern strings.
* @ignore
*/
export function constructGlobObjectsFromGlobs(globs: GlobInput[]): GlobObject[] {
export function constructGlobObjectsFromGlobs(globs: Readonly<GlobInput[]>): GlobObject[] {
return globs
.filter((glob) => {
// Skip possibly nullish, empty strings
@@ -126,7 +126,7 @@ export function validateGlobPattern(glob: string): string {
* to check RegExps input and return valid RegExps.
* @ignore
*/
export function constructRegExpObjectsFromRegExps(regexps: RegExpInput[]): RegExpObject[] {
export function constructRegExpObjectsFromRegExps(regexps: Readonly<RegExpInput[]>): RegExpObject[] {
return regexps.map((item) => {
// Get regexp object from cache.
let regexpObject = enqueueLinksPatternCache.get(item);
5 changes: 5 additions & 0 deletions packages/playwright-crawler/package.json
@@ -55,14 +55,19 @@
"dependencies": {
"@apify/datastructures": "^2.0.0",
"@apify/log": "^2.4.0",
"@apify/timeout": "^0.3.1",
"@crawlee/browser": "3.7.3",
"@crawlee/browser-pool": "3.7.3",
"@crawlee/core": "3.7.3",
"@crawlee/types": "3.7.3",
"@crawlee/utils": "3.7.3",
"cheerio": "^1.0.0-rc.12",
"idcac-playwright": "^0.1.2",
"jquery": "^3.6.0",
"lodash.isequal": "^4.5.0",
"ml-logistic-regression": "^2.0.0",
"ow": "^0.28.1",
"string-comparison": "^1.3.0",
"tslib": "^2.4.0"
},
"peerDependencies": {
2 changes: 2 additions & 0 deletions packages/playwright-crawler/src/index.ts
@@ -1,7 +1,9 @@
export * from '@crawlee/browser';
export * from './internals/playwright-crawler';
export * from './internals/playwright-launcher';
export * from './internals/adaptive-playwright-crawler';

export * as playwrightUtils from './internals/utils/playwright-utils';
export * as playwrightClickElements from './internals/enqueue-links/click-elements';
export type { DirectNavigationOptions as PlaywrightDirectNavigationOptions } from './internals/utils/playwright-utils';
export type { RenderingType } from './internals/utils/rendering-type-prediction';
