diff --git a/package.json b/package.json
index 5027cd1e8c1..aaca3261b30 100644
--- a/package.json
+++ b/package.json
@@ -66,6 +66,7 @@
         "@types/htmlparser2": "^3.10.3",
         "@types/inquirer": "^8.2.1",
         "@types/is-ci": "^3.0.1",
+        "@types/lodash.isequal": "^4.5.8",
         "@types/lodash.merge": "^4.6.7",
         "@types/mime-types": "^2.1.1",
         "@types/node": "^20.0.0",
diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index d966a670baf..cccf3809580 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -427,7 +427,7 @@ export interface CrawlerExperiments {
  * @category Crawlers
  */
 export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
-    private static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
+    protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
 
     /**
      * A reference to the underlying {@apilink Statistics} class that collects and logs run statistics for requests.
diff --git a/packages/browser-crawler/package.json b/packages/browser-crawler/package.json
index 470817b8f3d..93fbf170796 100644
--- a/packages/browser-crawler/package.json
+++ b/packages/browser-crawler/package.json
@@ -59,6 +59,7 @@
         "@crawlee/types": "3.7.3",
         "@crawlee/utils": "3.7.3",
         "ow": "^0.28.1",
-        "tslib": "^2.4.0"
+        "tslib": "^2.4.0",
+        "type-fest": "^4.0.0"
     }
 }
diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts
index 6b29e7bd43f..3d07c0ac674 100644
--- a/packages/browser-crawler/src/internals/browser-crawler.ts
+++ b/packages/browser-crawler/src/internals/browser-crawler.ts
@@ -40,6 +40,7 @@ import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
 import type { Cookie as CookieObject } from '@crawlee/types';
 import { CLOUDFLARE_RETRY_CSS_SELECTORS, RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
 import ow from 'ow';
+import type { ReadonlyDeep } from 'type-fest';
 
 import type { BrowserLaunchContext } from './browser-launcher';
 
@@ -749,7 +750,7 @@ export abstract class BrowserCrawler<
 
 /** @internal */
 interface EnqueueLinksInternalOptions {
-    options?: EnqueueLinksOptions;
+    options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
     page: CommonPage;
     requestQueue: RequestProvider;
     originalRequestUrl: string;
@@ -786,7 +787,7 @@ export async function browserCrawlerEnqueueLinks({
  * @ignore
  */
 // eslint-disable-next-line @typescript-eslint/ban-types
-async function extractUrlsFromPage(page: { $$eval: Function }, selector: string, baseUrl: string): Promise<string[]> {
+export async function extractUrlsFromPage(page: { $$eval: Function }, selector: string, baseUrl: string): Promise<string[]> {
     const urls = await page.$$eval(selector, (linkEls: HTMLLinkElement[]) => linkEls.map((link) => link.getAttribute('href')).filter((href) => !!href)) ?? [];
     const [base] = await page.$$eval('base', (els: HTMLLinkElement[]) => els.map((el) => el.getAttribute('href')));
     const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl);
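Since `extractUrlsFromPage` is now exported from `@crawlee/browser`, it can be reused outside the crawler internals (the adaptive crawler later in this diff does exactly that). A rough standalone sketch, assuming a plain Playwright page; the target URL is illustrative:

```ts
import { chromium } from 'playwright';
import { extractUrlsFromPage } from '@crawlee/browser';

const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://example.com');

// Collects the href attributes of all matched anchors and resolves them
// against the page's base URL (honouring any <base> element).
const urls = await extractUrlsFromPage(page, 'a', page.url());
console.log(urls);

await browser.close();
```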
diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts
index b42e4393307..e7a5509d2c2 100644
--- a/packages/core/src/crawlers/crawler_commons.ts
+++ b/packages/core/src/crawlers/crawler_commons.ts
@@ -1,16 +1,18 @@
 import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
 // @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
 import type { Response as GotResponse, OptionsInit } from 'got-scraping';
+import type { ReadonlyDeep } from 'type-fest';
 
+import type { Configuration } from '../configuration';
 import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links';
 import type { Log } from '../log';
 import type { ProxyInfo } from '../proxy_configuration';
 import type { Request, Source } from '../request';
 import type { Session } from '../session_pool/session';
-import type { RequestQueueOperationOptions, Dataset, KeyValueStore } from '../storages';
+import type { RequestQueueOperationOptions, Dataset, RecordOptions } from '../storages';
+import { KeyValueStore } from '../storages';
 
-// eslint-disable-next-line @typescript-eslint/ban-types
-export interface RestrictedCrawlingContext extends Record<string & {}, unknown> {
+export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary> extends Record<PropertyKey, unknown> {
     /**
      * The original {@apilink Request} object.
      */
@@ -23,7 +25,7 @@ export interface RestrictedCrawlingContext
-    pushData(data: Parameters<Dataset['pushData']>[0], datasetIdOrName?: string): Promise<void>;
+    pushData(data: ReadonlyDeep<Parameters<Dataset['pushData']>[0]>, datasetIdOrName?: string): Promise<void>;
 
     /**
      * This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue}
@@ -49,7 +51,7 @@ export interface RestrictedCrawlingContext
-    enqueueLinks: (options?: Omit<EnqueueLinksOptions, 'requestQueue'>) => Promise<unknown>;
+    enqueueLinks: (options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>>) => Promise<unknown>;
 
     /**
      * Add requests directly to the request queue.
@@ -58,8 +60,8 @@ export interface RestrictedCrawlingContext
     addRequests: (
-        requests: (string | Source)[],
-        options?: RequestQueueOperationOptions,
+        requests: ReadonlyDeep<(string | Source)[]>,
+        options?: ReadonlyDeep<RequestQueueOperationOptions>,
     ) => Promise<void>;
 
     /**
@@ -115,7 +117,9 @@ export interface CrawlingContext
-    enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
+    enqueueLinks(
+        options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>,
+    ): Promise<BatchAddRequestsResult>;
 
     /**
      * Get a key-value store with given name or id, or the default one for the crawler.
@@ -141,3 +145,135 @@ export interface CrawlingContext
     sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
 }
+
+/**
+ * A partial implementation of {@apilink RestrictedCrawlingContext} that stores parameters of calls to context methods for later inspection.
+ *
+ * @experimental
+ */
+export class RequestHandlerResult {
+    private _keyValueStoreChanges: Record<string, Record<string, { changedValue: unknown; options?: RecordOptions }>> = {};
+
+    private pushDataCalls: Parameters<RestrictedCrawlingContext['pushData']>[] = [];
+
+    private addRequestsCalls: Parameters<RestrictedCrawlingContext['addRequests']>[] = [];
+
+    private enqueueLinksCalls: Parameters<RestrictedCrawlingContext['enqueueLinks']>[] = [];
+
+    constructor(private config: Configuration, private crawleeStateKey: string) {}
+
+    /**
+     * A record of calls to {@apilink RestrictedCrawlingContext.pushData}, {@apilink RestrictedCrawlingContext.addRequests}, {@apilink RestrictedCrawlingContext.enqueueLinks} made by a request handler.
+     */
+    get calls(): ReadonlyDeep<{
+        pushData: Parameters<RestrictedCrawlingContext['pushData']>[];
+        addRequests: Parameters<RestrictedCrawlingContext['addRequests']>[];
+        enqueueLinks: Parameters<RestrictedCrawlingContext['enqueueLinks']>[];
+    }> {
+        return { pushData: this.pushDataCalls, addRequests: this.addRequestsCalls, enqueueLinks: this.enqueueLinksCalls };
+    }
+
+    /**
+     * A record of changes made to key-value stores by a request handler.
+     */
+    get keyValueStoreChanges(): ReadonlyDeep<Record<string, Record<string, { changedValue: unknown; options?: RecordOptions }>>> {
+        return this._keyValueStoreChanges;
+    }
+
+    /**
+     * Items added to datasets by a request handler.
+     */
+    get datasetItems(): ReadonlyDeep<{ item: Dictionary; datasetIdOrName?: string }[]> {
+        return this.pushDataCalls.flatMap(([data, datasetIdOrName]) => (Array.isArray(data) ? data : [data]).map((item) => ({ item, datasetIdOrName })));
+    }
+
+    /**
+     * URLs enqueued to the request queue by a request handler, either via {@apilink RestrictedCrawlingContext.addRequests} or {@apilink RestrictedCrawlingContext.enqueueLinks}.
+     */
+    get enqueuedUrls(): ReadonlyDeep<{ url: string; label?: string }[]> {
+        const result: { url: string; label?: string }[] = [];
+
+        for (const [options] of this.enqueueLinksCalls) {
+            result.push(...(options?.urls?.map((url) => ({ url, label: options?.label })) ?? []));
+        }
+
+        for (const [requests] of this.addRequestsCalls) {
+            for (const request of requests) {
+                // Plain URLs only - requests that carry `requestsFromUrl` are covered by `enqueuedUrlLists` instead.
+                if (typeof request === 'object' && (!('requestsFromUrl' in request) || request.requestsFromUrl === undefined) && request.url !== undefined) {
+                    result.push({ url: request.url, label: request.label });
+                } else if (typeof request === 'string') {
+                    result.push({ url: request });
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * URL lists enqueued to the request queue by a request handler via {@apilink RestrictedCrawlingContext.addRequests} using the `requestsFromUrl` option.
+     */
+    get enqueuedUrlLists(): ReadonlyDeep<{ listUrl: string; label?: string }[]> {
+        const result: { listUrl: string; label?: string }[] = [];
+
+        for (const [requests] of this.addRequestsCalls) {
+            for (const request of requests) {
+                if (typeof request === 'object' && 'requestsFromUrl' in request && request.requestsFromUrl !== undefined) {
+                    result.push({ listUrl: request.requestsFromUrl, label: request.label });
+                }
+            }
+        }
+
+        return result;
+    }
+
+    pushData: RestrictedCrawlingContext['pushData'] = async (data, datasetIdOrName) => {
+        this.pushDataCalls.push([data, datasetIdOrName]);
+    };
+
+    enqueueLinks: RestrictedCrawlingContext['enqueueLinks'] = async (options) => {
+        this.enqueueLinksCalls.push([options]);
+    };
+
+    addRequests: RestrictedCrawlingContext['addRequests'] = async (requests, options = {}) => {
+        this.addRequestsCalls.push([requests, options]);
+    };
+
+    useState: RestrictedCrawlingContext['useState'] = async (defaultValue) => {
+        const store = await this.getKeyValueStore(undefined);
+        return await store.getAutoSavedValue(this.crawleeStateKey, defaultValue);
+    };
+
+    getKeyValueStore: RestrictedCrawlingContext['getKeyValueStore'] = async (idOrName) => {
+        const store = await KeyValueStore.open(idOrName, { config: this.config });
+
+        return {
+            id: this.idOrDefault(idOrName),
+            name: idOrName,
+            getValue: async (key) => this.getKeyValueStoreChangedValue(idOrName, key) ?? await store.getValue(key),
+            getAutoSavedValue: async <T extends Dictionary = Dictionary>(key: string, defaultValue: T = {} as T) => {
+                let value = this.getKeyValueStoreChangedValue(idOrName, key);
+                if (value === null) {
+                    value = await store.getValue(key) ?? defaultValue;
+                    this.setKeyValueStoreChangedValue(idOrName, key, value);
+                }
+
+                return value as T;
+            },
+            setValue: async (key, value, options) => {
+                this.setKeyValueStoreChangedValue(idOrName, key, value, options);
+            },
+        };
+    };
+
+    private idOrDefault = (idOrName?: string): string => idOrName ?? this.config.get('defaultKeyValueStoreId');
+
+    private getKeyValueStoreChangedValue = (idOrName: string | undefined, key: string) => {
+        const id = this.idOrDefault(idOrName);
+        this._keyValueStoreChanges[id] ??= {};
+        return this.keyValueStoreChanges[id][key]?.changedValue ?? null;
+    };
+
+    private setKeyValueStoreChangedValue = (idOrName: string | undefined, key: string, changedValue: unknown, options?: RecordOptions) => {
+        const id = this.idOrDefault(idOrName);
+        this._keyValueStoreChanges[id] ??= {};
+        this._keyValueStoreChanges[id][key] = { changedValue, options };
+    };
+}
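The point of `RequestHandlerResult` is that a handler's side effects are only recorded, so the crawler can inspect or replay them later. A minimal sketch of that recording flow, using only the API added above (the data values are made up, and this assumes an async context):

```ts
import { Configuration, RequestHandlerResult } from '@crawlee/core';

// 'CRAWLEE_STATE' matches BasicCrawler.CRAWLEE_STATE_KEY from this diff.
const result = new RequestHandlerResult(Configuration.getGlobalConfig(), 'CRAWLEE_STATE');

// A request handler would receive these methods via its crawling context;
// nothing is written to storage yet, the calls are just captured.
await result.pushData({ title: 'Example' });
await result.enqueueLinks({ urls: ['https://example.com/next'], label: 'DETAIL' });

// Afterwards, the crawler (or a result checker) can inspect what the handler did.
console.log(result.datasetItems); // [{ item: { title: 'Example' }, datasetIdOrName: undefined }]
console.log(result.enqueuedUrls); // [{ url: 'https://example.com/next', label: 'DETAIL' }]
```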
diff --git a/packages/core/src/crawlers/statistics.ts b/packages/core/src/crawlers/statistics.ts
index 556c268d96a..345b2859dc1 100644
--- a/packages/core/src/crawlers/statistics.ts
+++ b/packages/core/src/crawlers/statistics.ts
@@ -93,8 +93,8 @@ export class Statistics {
      */
     private readonly config: Configuration;
 
-    private keyValueStore?: KeyValueStore = undefined;
-    private persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`;
+    protected keyValueStore?: KeyValueStore = undefined;
+    protected persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`;
 
     private logIntervalMillis: number;
     private logMessage: string;
     private listener: () => Promise<void>;
diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts
index 7faffae6eb9..7ed778f53a5 100644
--- a/packages/core/src/enqueue_links/enqueue_links.ts
+++ b/packages/core/src/enqueue_links/enqueue_links.ts
@@ -21,7 +21,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
     limit?: number;
 
     /** An array of URLs to enqueue. */
-    urls?: string[];
+    urls?: Readonly<string[]>;
 
     /** A request queue to which the URLs will be enqueued. */
     requestQueue?: RequestProvider;
@@ -60,7 +60,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      * If `globs` is an empty array or `undefined`, and `regexps` are also not defined, then the function
      * enqueues the links with the same subdomain.
      */
-    globs?: GlobInput[];
+    globs?: Readonly<GlobInput[]>;
 
     /**
      * An array of glob pattern strings, regexp patterns or plain objects
@@ -72,7 +72,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      * Glob matching is always case-insensitive.
      * If you need case-sensitive matching, provide a regexp.
      */
-    exclude?: (GlobInput | RegExpInput)[];
+    exclude?: Readonly<(GlobInput | RegExpInput)[]>;
 
     /**
      * An array of regular expressions or plain objects
@@ -84,7 +84,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      * If `regexps` is an empty array or `undefined`, and `globs` are also not defined, then the function
      * enqueues the links with the same subdomain.
      */
-    regexps?: RegExpInput[];
+    regexps?: Readonly<RegExpInput[]>;
 
     /**
      * *NOTE:* In future versions of SDK the options will be removed.
@@ -104,7 +104,7 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
      *
      * @deprecated prefer using `globs` or `regexps` instead
      */
-    pseudoUrls?: PseudoUrlInput[];
+    pseudoUrls?: Readonly<PseudoUrlInput[]>;
 
     /**
      * Just before a new {@apilink Request} is constructed and enqueued to the {@apilink RequestQueue}, this function can be used
diff --git a/packages/core/src/enqueue_links/shared.ts b/packages/core/src/enqueue_links/shared.ts
index eb862631b32..88149060fa8 100644
--- a/packages/core/src/enqueue_links/shared.ts
+++ b/packages/core/src/enqueue_links/shared.ts
@@ -52,7 +52,7 @@ export function updateEnqueueLinksPatternCache(item: GlobInput | RegExpInput | P
  * to construct RegExps from PseudoUrl strings.
  * @ignore
  */
-export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: PseudoUrlInput[]): RegExpObject[] {
+export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: Readonly<PseudoUrlInput[]>): RegExpObject[] {
     return pseudoUrls.map((item) => {
         // Get pseudoUrl object from cache.
         let regexpObject = enqueueLinksPatternCache.get(item);
@@ -76,7 +76,7 @@ export function constructRegExpObjectsFromPseudoUrls(pseudoUrls: PseudoUrlInput[
  * to construct Glob objects from Glob pattern strings.
  * @ignore
  */
-export function constructGlobObjectsFromGlobs(globs: GlobInput[]): GlobObject[] {
+export function constructGlobObjectsFromGlobs(globs: Readonly<GlobInput[]>): GlobObject[] {
     return globs
         .filter((glob) => {
             // Skip possibly nullish, empty strings
@@ -126,7 +126,7 @@ export function validateGlobPattern(glob: string): string {
  * to check RegExps input and return valid RegExps.
  * @ignore
 */
-export function constructRegExpObjectsFromRegExps(regexps: RegExpInput[]): RegExpObject[] {
+export function constructRegExpObjectsFromRegExps(regexps: Readonly<RegExpInput[]>): RegExpObject[] {
     return regexps.map((item) => {
         // Get regexp object from cache.
         let regexpObject = enqueueLinksPatternCache.get(item);
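The switch to `Readonly` element types means callers can now pass frozen or `as const` arrays to `enqueueLinks` without type errors, which the deep-readonly crawling context relies on. A small illustration of what now compiles (my own example, not part of the PR):

```ts
import type { EnqueueLinksOptions } from '@crawlee/core';

// A frozen array has the type `readonly string[]`, which the previous
// `urls?: string[]` signature rejected but `urls?: Readonly<string[]>` accepts.
const urls = Object.freeze(['https://example.com/a', 'https://example.com/b']);

const options: Pick<EnqueueLinksOptions, 'urls' | 'globs'> = {
    urls,
    globs: ['https://example.com/**'] as const,
};
```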
diff --git a/packages/playwright-crawler/package.json b/packages/playwright-crawler/package.json
index e77e6a3e4f0..86100767064 100644
--- a/packages/playwright-crawler/package.json
+++ b/packages/playwright-crawler/package.json
@@ -55,14 +55,19 @@
     "dependencies": {
         "@apify/datastructures": "^2.0.0",
         "@apify/log": "^2.4.0",
+        "@apify/timeout": "^0.3.1",
         "@crawlee/browser": "3.7.3",
         "@crawlee/browser-pool": "3.7.3",
+        "@crawlee/core": "3.7.3",
         "@crawlee/types": "3.7.3",
         "@crawlee/utils": "3.7.3",
         "cheerio": "^1.0.0-rc.12",
         "idcac-playwright": "^0.1.2",
         "jquery": "^3.6.0",
+        "lodash.isequal": "^4.5.0",
+        "ml-logistic-regression": "^2.0.0",
         "ow": "^0.28.1",
+        "string-comparison": "^1.3.0",
         "tslib": "^2.4.0"
     },
     "peerDependencies": {
diff --git a/packages/playwright-crawler/src/index.ts b/packages/playwright-crawler/src/index.ts
index f265ec1891c..8b2efafca29 100644
--- a/packages/playwright-crawler/src/index.ts
+++ b/packages/playwright-crawler/src/index.ts
@@ -1,7 +1,9 @@
 export * from '@crawlee/browser';
 export * from './internals/playwright-crawler';
 export * from './internals/playwright-launcher';
+export * from './internals/adaptive-playwright-crawler';
 export * as playwrightUtils from './internals/utils/playwright-utils';
 export * as playwrightClickElements from './internals/enqueue-links/click-elements';
 
 export type { DirectNavigationOptions as PlaywrightDirectNavigationOptions } from './internals/utils/playwright-utils';
+export type { RenderingType } from './internals/utils/rendering-type-prediction';
diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts
new file mode 100644
index 00000000000..02270b99b2f
--- /dev/null
+++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts
@@ -0,0 +1,341 @@
+import { addTimeoutToPromise } from '@apify/timeout';
+import { extractUrlsFromPage } from '@crawlee/browser';
+import type { RestrictedCrawlingContext, StatisticState, StatisticPersistedState } from '@crawlee/core';
+import { Configuration, RequestHandlerResult, Statistics } from '@crawlee/core';
+import type { Awaitable, Dictionary } from '@crawlee/types';
+import { extractUrlsFromCheerio } from '@crawlee/utils';
+import { load, type Cheerio, type Element } from 'cheerio';
+import isEqual from 'lodash.isequal';
+
+import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext } from './playwright-crawler';
+import { PlaywrightCrawler } from './playwright-crawler';
+import { RenderingTypePredictor, type RenderingType } from './utils/rendering-type-prediction';
+
+type Result<TResult> = { result: TResult; ok: true } | { error: unknown; ok: false };
+
+interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState {
+    httpOnlyRequestHandlerRuns?: number;
+    browserRequestHandlerRuns?: number;
+    renderingTypeMispredictions?: number;
+}
+
+interface AdaptivePlaywrightCrawlerPersistedStatisticState extends StatisticPersistedState {
+    httpOnlyRequestHandlerRuns?: number;
+    browserRequestHandlerRuns?: number;
+    renderingTypeMispredictions?: number;
+}
+
+class AdaptivePlaywrightCrawlerStatistics extends Statistics {
+    override state: AdaptivePlaywrightCrawlerStatisticState = null as any; // this needs to be assigned for a valid override, but the initialization is done by a reset() call from the parent constructor
+
+    override reset(): void {
+        super.reset();
+        this.state.httpOnlyRequestHandlerRuns = 0;
+        this.state.browserRequestHandlerRuns = 0;
+        this.state.renderingTypeMispredictions = 0;
+    }
+
+    protected override async _maybeLoadStatistics(): Promise<void> {
+        await super._maybeLoadStatistics();
+        const savedState = await this.keyValueStore?.getValue<AdaptivePlaywrightCrawlerPersistedStatisticState>(this.persistStateKey);
+
+        if (!savedState) {
+            return;
+        }
+
+        this.state.httpOnlyRequestHandlerRuns = savedState.httpOnlyRequestHandlerRuns;
+        this.state.browserRequestHandlerRuns = savedState.browserRequestHandlerRuns;
+        this.state.renderingTypeMispredictions = savedState.renderingTypeMispredictions;
+    }
+
+    trackHttpOnlyRequestHandlerRun(): void {
+        this.state.httpOnlyRequestHandlerRuns ??= 0;
+        this.state.httpOnlyRequestHandlerRuns += 1;
+    }
+
+    trackBrowserRequestHandlerRun(): void {
+        this.state.browserRequestHandlerRuns ??= 0;
+        this.state.browserRequestHandlerRuns += 1;
+    }
+
+    trackRenderingTypeMisprediction(): void {
+        this.state.renderingTypeMispredictions ??= 0;
+        this.state.renderingTypeMispredictions += 1;
+    }
+}
+
+interface AdaptivePlaywrightCrawlerContext extends RestrictedCrawlingContext {
+    /**
+     * Waits for an element matching the selector to appear and returns a Cheerio object of matched elements.
+     */
+    querySelector: (selector: string, timeoutMs?: number) => Awaitable<Cheerio<Element>>;
+}
+
+export interface AdaptivePlaywrightCrawlerOptions extends Omit<PlaywrightCrawlerOptions, 'requestHandler'> {
+    /**
+     * Function that is called to process each request.
+     *
+     * The function receives the {@apilink AdaptivePlaywrightCrawlerContext} as an argument, and it must refrain from calling code with side effects,
+     * other than the methods of the crawling context. Any other side effects may be invoked repeatedly by the crawler, which can lead to inconsistent results.
+     *
+     * The function must return a promise, which is then awaited by the crawler.
+     *
+     * If the function throws an exception, the crawler will try to re-crawl the
+     * request later, up to `option.maxRequestRetries` times.
+     */
+    requestHandler: (crawlingContext: AdaptivePlaywrightCrawlerContext) => Awaitable<void>;
+
+    /**
+     * Specifies the frequency of rendering type detection checks - 0.1 means roughly 10% of requests.
+     */
+    renderingTypeDetectionRatio: number;
+
+    /**
+     * An optional callback that is called on dataset items found by the request handler in plain HTTP mode.
+     * If it returns false, the request is retried in a browser.
+     * If no callback is specified, every dataset item is considered valid.
+     */
+    resultChecker?: (result: RequestHandlerResult) => boolean;
+
+    /**
+     * An optional callback used in rendering type detection. On each detection, the result of the plain HTTP run is compared to that of the browser one.
+     * If the callback returns true, the results are considered equal and the target site is considered static.
+     * If no result comparator is specified, but there is a `resultChecker`, any site where the `resultChecker` returns true is considered static.
+     * If neither `resultComparator` nor `resultChecker` are specified, a deep comparison of returned dataset items is used as a default.
+     */
+    resultComparator?: (resultA: RequestHandlerResult, resultB: RequestHandlerResult) => boolean;
+
+    /**
+     * A custom rendering type predictor.
+     */
+    renderingTypePredictor?: Pick<RenderingTypePredictor, 'predict' | 'storeResult'>;
+}
+
+/**
+ * An extension of {@apilink PlaywrightCrawler} that uses a more limited request handler interface so that it is able to switch to HTTP-only crawling when it detects it may be possible.
+ *
+ * **Example usage:**
+ *
+ * ```javascript
+ * const crawler = new AdaptivePlaywrightCrawler({
+ *     renderingTypeDetectionRatio: 0.1,
+ *     async requestHandler({ querySelector, pushData, enqueueLinks, request, log }) {
+ *         // This function is called to extract data from a single web page.
+ *         const $prices = await querySelector('span.price');
+ *
+ *         await pushData({
+ *             url: request.url,
+ *             price: $prices.filter(':contains("$")').first().text(),
+ *         });
+ *
+ *         await enqueueLinks({ selector: '.pagination a' });
+ *     },
+ * });
+ *
+ * await crawler.run([
+ *     'http://www.example.com/page-1',
+ *     'http://www.example.com/page-2',
+ * ]);
+ * ```
+ *
+ * @experimental
+ */
+export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
+    private adaptiveRequestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'];
+    private renderingTypePredictor: NonNullable<AdaptivePlaywrightCrawlerOptions['renderingTypePredictor']>;
+    private resultChecker: NonNullable<AdaptivePlaywrightCrawlerOptions['resultChecker']>;
+    private resultComparator: NonNullable<AdaptivePlaywrightCrawlerOptions['resultComparator']>;
+    override readonly stats: AdaptivePlaywrightCrawlerStatistics;
+
+    constructor(
+        {
+            requestHandler,
+            renderingTypeDetectionRatio,
+            renderingTypePredictor,
+            resultChecker,
+            resultComparator,
+            statisticsOptions,
+            ...options
+        }: AdaptivePlaywrightCrawlerOptions,
+        override readonly config = Configuration.getGlobalConfig(),
+    ) {
+        super(options, config);
+        this.adaptiveRequestHandler = requestHandler;
+        this.renderingTypePredictor = renderingTypePredictor ?? new RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio });
+        this.resultChecker = resultChecker ?? (() => true);
+
+        if (resultComparator !== undefined) {
+            this.resultComparator = resultComparator;
+        } else if (resultChecker !== undefined) {
+            this.resultComparator = (resultA, resultB) => this.resultChecker(resultA) && this.resultChecker(resultB);
+        } else {
+            this.resultComparator = (resultA, resultB) => {
+                return resultA.datasetItems.length === resultB.datasetItems.length
+                    && resultA.datasetItems.every((itemA, i) => {
+                        const itemB = resultB.datasetItems[i];
+                        return isEqual(itemA, itemB);
+                    });
+            };
+        }
+
+        this.stats = new AdaptivePlaywrightCrawlerStatistics({
+            logMessage: `${this.log.getOptions().prefix} request statistics:`,
+            config,
+            ...statisticsOptions,
+        });
+    }
+
+    protected override async _runRequestHandler(crawlingContext: PlaywrightCrawlingContext): Promise<void> {
+        const url = new URL(crawlingContext.request.loadedUrl ?? crawlingContext.request.url);
+
+        const renderingTypePrediction = this.renderingTypePredictor.predict(url, crawlingContext.request.label);
+        const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation;
+
+        if (!shouldDetectRenderingType) {
+            crawlingContext.log.info(`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`);
+        }
+
+        if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
+            crawlingContext.log.info(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
+            this.stats.trackHttpOnlyRequestHandlerRun();
+
+            const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
+
+            if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
+                crawlingContext.log.info(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
+                await this.commitResult(crawlingContext, plainHTTPRun.result);
+                return;
+            }
+
+            if (!plainHTTPRun.ok) {
+                crawlingContext.log.exception(plainHTTPRun.error as Error, `HTTP-only request handler failed for ${crawlingContext.request.url}`);
+            } else {
+                crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`);
+                this.stats.trackRenderingTypeMisprediction();
+            }
+        }
+
+        crawlingContext.log.info(`Running browser request handler for ${crawlingContext.request.url}`);
+        this.stats.trackBrowserRequestHandlerRun();
+
+        const browserRun = await this.runRequestHandlerInBrowser(crawlingContext);
+        if (!browserRun.ok) {
+            throw browserRun.error;
+        }
+
+        await this.commitResult(crawlingContext, browserRun.result);
+
+        if (shouldDetectRenderingType) {
+            crawlingContext.log.info(`Detecting rendering type for ${crawlingContext.request.url}`);
+            const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
+
+            const detectionResult: RenderingType = (() => {
+                if (!plainHTTPRun.ok) {
+                    return 'clientOnly';
+                }
+
+                if (this.resultComparator(plainHTTPRun.result, browserRun.result)) {
+                    return 'static';
+                }
+
+                return 'clientOnly';
+            })();
+
+            crawlingContext.log.info(`Detected rendering type ${detectionResult} for ${crawlingContext.request.url}`);
+            this.renderingTypePredictor.storeResult(url, crawlingContext.request.label, detectionResult);
+        }
+    }
+
+    protected async commitResult(
+        crawlingContext: PlaywrightCrawlingContext,
+        { calls, keyValueStoreChanges }: RequestHandlerResult,
+    ): Promise<void> {
+        await Promise.all([
+            ...calls.pushData.map(async (params) => crawlingContext.pushData(...params)),
+            ...calls.enqueueLinks.map(async (params) => await crawlingContext.enqueueLinks(...params)),
+            ...calls.addRequests.map(async (params) => crawlingContext.addRequests(...params)),
+            ...Object.entries(keyValueStoreChanges).map(async ([storeIdOrName, changes]) => {
+                const store = await crawlingContext.getKeyValueStore(storeIdOrName);
+                await Promise.all(
+                    Object.entries(changes).map(async ([key, { changedValue, options }]) => store.setValue(key, changedValue, options)),
+                );
+            }),
+        ]);
+    }
+
+    protected async runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise<Result<RequestHandlerResult>> {
+        const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
+
+        try {
+            await super._runRequestHandler.call(
+                new Proxy(this, {
+                    get: (target, propertyName, receiver) => {
+                        // Intercept the crawler's own request handler and substitute the adaptive one,
+                        // backed by the recording RequestHandlerResult context.
+                        if (propertyName === 'userProvidedRequestHandler') {
+                            return (playwrightContext: PlaywrightCrawlingContext) =>
+                                this.adaptiveRequestHandler({
+                                    request: crawlingContext.request,
+                                    log: crawlingContext.log,
+                                    querySelector: async (selector, timeoutMs) => {
+                                        const locator = playwrightContext.page.locator(selector).first();
+                                        await locator.waitFor({ timeout: timeoutMs });
+                                        return (await playwrightContext.parseWithCheerio())(selector) as Cheerio<Element>;
+                                    },
+                                    enqueueLinks: async (options = {}) => {
+                                        const selector = options.selector ?? 'a';
+                                        const locator = playwrightContext.page.locator(selector).first();
+                                        await locator.waitFor();
+
+                                        const urls = await extractUrlsFromPage(
+                                            playwrightContext.page,
+                                            selector,
+                                            options.baseUrl ?? playwrightContext.request.loadedUrl ?? playwrightContext.request.url,
+                                        );
+                                        await result.enqueueLinks({ ...options, urls });
+                                    },
+                                    addRequests: result.addRequests,
+                                    pushData: result.pushData,
+                                    useState: result.useState,
+                                    getKeyValueStore: result.getKeyValueStore,
+                                });
+                        }
+                        return Reflect.get(target, propertyName, receiver);
+                    },
+                }),
+                crawlingContext,
+            );
+            return { result, ok: true };
+        } catch (error) {
+            return { error, ok: false };
+        }
+    }
+
+    protected async runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext): Promise<Result<RequestHandlerResult>> {
+        const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY);
+
+        const response = await crawlingContext.sendRequest({});
+        const loadedUrl = response.url;
+        crawlingContext.request.loadedUrl = loadedUrl;
+        const $ = load(response.body);
+
+        try {
+            await addTimeoutToPromise(
+                async () => this.adaptiveRequestHandler({
+                    request: crawlingContext.request,
+                    log: crawlingContext.log,
+                    querySelector: (selector) => $(selector) as Cheerio<Element>,
+                    enqueueLinks: async (options: Parameters<RestrictedCrawlingContext['enqueueLinks']>[0] = {}) => {
+                        const urls = extractUrlsFromCheerio($, options.selector, options.baseUrl ?? loadedUrl);
+                        await result.enqueueLinks({ ...options, urls });
+                    },
+                    addRequests: result.addRequests,
+                    pushData: result.pushData,
+                    useState: result.useState,
+                    getKeyValueStore: result.getKeyValueStore,
+                }),
+                this.requestHandlerTimeoutInnerMillis,
+                'Request handler timed out',
+            );
+
+            return { result, ok: true };
+        } catch (error) {
+            return { error, ok: false };
+        }
+    }
+}
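Putting the pieces together, here is a hedged usage sketch of the new crawler with a custom `resultChecker`, so suspicious HTTP-only runs fall back to the browser; the `title` field and URLs are assumptions for illustration:

```ts
import { AdaptivePlaywrightCrawler } from 'crawlee';

const crawler = new AdaptivePlaywrightCrawler({
    renderingTypeDetectionRatio: 0.1,
    // Reject extractions with missing titles - such results trigger a browser retry
    // and count as a rendering type misprediction in the statistics.
    resultChecker: (result) => result.datasetItems.length > 0
        && result.datasetItems.every(({ item }) => typeof item.title === 'string' && item.title.length > 0),
    async requestHandler({ request, querySelector, pushData }) {
        await pushData({ url: request.url, title: (await querySelector('h1')).text() });
    },
});

await crawler.run(['https://example.com']);
```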
diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts
index 174a42fd398..10677d3d0e8 100644
--- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts
+++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts
@@ -32,6 +32,7 @@ import { getInjectableScript as getCookieClosingScript } from 'idcac-playwright';
 import ow from 'ow';
 import type { Page, Response, Route } from 'playwright';
 
+import { RenderingTypePredictor } from './rendering-type-prediction';
 import type { EnqueueLinksByClickingElementsOptions } from '../enqueue-links/click-elements';
 import { enqueueLinksByClickingElements } from '../enqueue-links/click-elements';
 import type { PlaywrightCrawlingContext } from '../playwright-crawler';
@@ -789,4 +790,5 @@ export const playwrightUtils = {
     saveSnapshot,
     compileScript,
     closeCookieModals,
+    RenderingTypePredictor,
 };
diff --git a/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts b/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts
new file mode 100644
index 00000000000..9938a61c2cf
--- /dev/null
+++ b/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts
@@ -0,0 +1,117 @@
+import LogisticRegression from 'ml-logistic-regression';
+import { Matrix } from 'ml-matrix';
+import stringComparison from 'string-comparison';
+
+export type RenderingType = 'clientOnly' | 'static';
+
+type URLComponents = string[];
+
+const urlComponents = (url: URL): URLComponents => {
+    return [url.hostname, ...url.pathname.split('/')];
+};
+
+const calculateUrlSimilarity = (a: URLComponents, b: URLComponents): number | undefined => {
+    const values: number[] = [];
+
+    // URLs on different hostnames are never considered similar.
+    if (a[0] !== b[0]) {
+        return 0;
+    }
+
+    // Compare path segments pairwise - a Jaro-Winkler similarity above 0.8 counts as a match.
+    for (let i = 1; i < Math.max(a.length, b.length); i++) {
+        values.push(stringComparison.jaroWinkler.similarity(a[i] ?? '', b[i] ?? '') > 0.8 ? 1 : 0);
+    }
+
+    return sum(values) / Math.max(a.length, b.length);
+};
+
+const sum = (values: number[]) => values.reduce((acc, value) => acc + value);
+const mean = (values: number[]) => (values.length > 0 ? sum(values) / values.length : undefined);
+
+type FeatureVector = [staticResultsSimilarity: number, clientOnlyResultsSimilarity: number];
+
+export interface RenderingTypePredictorOptions {
+    /** A number between 0 and 1 that determines the desired ratio of rendering type detections. */
+    detectionRatio: number;
+}
+
+/**
+ * Stores rendering type information for previously crawled URLs, predicts the rendering type for URLs that have yet to be crawled,
+ * and recommends when rendering type detection should be performed.
+ *
+ * @experimental
+ */
+export class RenderingTypePredictor {
+    private renderingTypeDetectionResults = new Map<RenderingType, Map<string | undefined, URLComponents[]>>();
+    private detectionRatio: number;
+    private logreg: LogisticRegression;
+
+    constructor({ detectionRatio }: RenderingTypePredictorOptions) {
+        this.detectionRatio = detectionRatio;
+        this.logreg = new LogisticRegression({ numSteps: 1000, learningRate: 0.05 });
+    }
+
+    /**
+     * Predict the rendering type for a given URL and request label.
+     */
+    public predict(url: URL, label: string | undefined): { renderingType: RenderingType; detectionProbabilityRecommendation: number } {
+        // Before the first detection result is stored, always recommend detection.
+        if (this.logreg.classifiers.length === 0) {
+            return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 };
+        }
+
+        const urlFeature = new Matrix([this.calculateFeatureVector(urlComponents(url), label)]);
+        const [prediction] = this.logreg.predict(urlFeature);
+        const scores = [this.logreg.classifiers[0].testScores(urlFeature), this.logreg.classifiers[1].testScores(urlFeature)];
+
+        return {
+            renderingType: prediction === 1 ? 'static' : 'clientOnly',
+            detectionProbabilityRecommendation: Math.abs(scores[0] - scores[1]) < 0.1 ? 1 : this.detectionRatio * Math.max(1, 5 - this.resultCount(label)),
+        };
+    }
+
+    /**
+     * Store the rendering type for a given URL and request label. This updates the underlying prediction model, which may be costly.
+     */
+    public storeResult(url: URL, label: string | undefined, renderingType: RenderingType) {
+        if (!this.renderingTypeDetectionResults.has(renderingType)) {
+            this.renderingTypeDetectionResults.set(renderingType, new Map());
+        }
+
+        if (!this.renderingTypeDetectionResults.get(renderingType)!.has(label)) {
+            this.renderingTypeDetectionResults.get(renderingType)!.set(label, []);
+        }
+
+        this.renderingTypeDetectionResults.get(renderingType)!.get(label)!.push(urlComponents(url));
+        this.retrain();
+    }
+
+    private resultCount(label: string | undefined): number {
+        return Array.from(this.renderingTypeDetectionResults.values())
+            .map((results) => results.get(label)?.length ?? 0)
+            .reduce((acc, value) => acc + value, 0);
+    }
+
+    protected calculateFeatureVector(url: URLComponents, label: string | undefined): FeatureVector {
+        return [
+            mean((this.renderingTypeDetectionResults.get('static')?.get(label) ?? []).map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0)) ?? 0,
+            mean((this.renderingTypeDetectionResults.get('clientOnly')?.get(label) ?? []).map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0)) ?? 0,
+        ];
+    }
+
+    protected retrain(): void {
+        // Seed the training set with two artificial examples so that both classes are always present.
+        const X: FeatureVector[] = [
+            [0, 1],
+            [1, 0],
+        ];
+        const Y: number[] = [0, 1];
+
+        for (const [renderingType, urlsByLabel] of this.renderingTypeDetectionResults.entries()) {
+            for (const [label, urls] of urlsByLabel) {
+                for (const url of urls) {
+                    X.push(this.calculateFeatureVector(url, label));
+                    Y.push(renderingType === 'static' ? 1 : 0);
+                }
+            }
+        }
+
+        this.logreg.train(new Matrix(X), Matrix.columnVector(Y));
+    }
+}
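The predictor can also be exercised on its own via the `playwrightUtils` export added above. A sketch under the assumption that similar paths on one hostname share a rendering type; the URLs are made up, and the exact scores depend on the trained model:

```ts
import { playwrightUtils } from 'crawlee';

// Detections are stored per request label; the feature vector for a URL is its
// average path similarity to previously seen 'static' and 'clientOnly' URLs.
const predictor = new playwrightUtils.RenderingTypePredictor({ detectionRatio: 0.1 });

predictor.storeResult(new URL('https://example.com/blog/post-1'), undefined, 'static');
predictor.storeResult(new URL('https://example.com/app/dashboard'), undefined, 'clientOnly');

// A URL whose path resembles the known static pages should lean towards 'static',
// and the recommendation tells the crawler how often to re-check the prediction.
const { renderingType, detectionProbabilityRecommendation } = predictor.predict(
    new URL('https://example.com/blog/post-2'),
    undefined,
);
console.log(renderingType, detectionProbabilityRecommendation);
```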
diff --git a/packages/playwright-crawler/src/logistic-regression.d.ts b/packages/playwright-crawler/src/logistic-regression.d.ts
new file mode 100644
index 00000000000..c4b501a8575
--- /dev/null
+++ b/packages/playwright-crawler/src/logistic-regression.d.ts
@@ -0,0 +1,22 @@
+declare module 'ml-logistic-regression' {
+    import Matrix from 'ml-matrix';
+
+    class LogisticRegressionTwoClasses {
+        testScores(Xtest: Matrix): number;
+    }
+
+    export default class LogisticRegression {
+        classifiers: LogisticRegressionTwoClasses[];
+
+        constructor(
+            options: Partial<{
+                numSteps: number;
+                learningRate: number;
+            }>,
+        );
+
+        train(X: Matrix, Y: Matrix): void;
+
+        predict(Xtest: Matrix): number[];
+    }
+}
diff --git a/test/core/crawlers/adaptive_playwright_crawler.test.ts b/test/core/crawlers/adaptive_playwright_crawler.test.ts
new file mode 100644
index 00000000000..73dbd389252
--- /dev/null
+++ b/test/core/crawlers/adaptive_playwright_crawler.test.ts
@@ -0,0 +1,264 @@
+import type { Server } from 'http';
+import type { AddressInfo } from 'net';
+
+import type { AdaptivePlaywrightCrawlerOptions } from '@crawlee/playwright';
+import {
+    AdaptivePlaywrightCrawler, RequestList,
+} from '@crawlee/playwright';
+import express from 'express';
+import { startExpressAppPromise } from 'test/shared/_helper';
+import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator';
+
+describe('AdaptivePlaywrightCrawler', () => {
+    // Set up an express server that will serve test pages
+    const HOSTNAME = '127.0.0.1';
+    let port: number;
+    let server: Server;
+
+    beforeAll(async () => {
+        const app = express();
+        server = await startExpressAppPromise(app, 0);
+        port = (server.address() as AddressInfo).port;
+
+        app.get('/static', (_req, res) => {
+            res.send(`
+                <html>
+                    <head>
+                        <title>Example Domain</title>
+                    </head>
+                    <body>
+                        <h1>Heading</h1>
+                        <a href="/static?q=1">Link 1</a>
+                        <a href="/static?q=2">Link 2</a>
+                        <a href="/static?q=3">Link 3</a>
+                        <a href="/static?q=4">Link 4</a>
+                        <a href="/static?q=5">Link 5</a>
+                    </body>
+                </html>
+            `);
+            res.status(200);
+        });
+
+        app.get('/dynamic', (_req, res) => {
+            res.send(`
+                <html>
+                    <head>
+                        <title>Example Domain</title>
+                        <script type="text/javascript">
+                            setTimeout(() => {
+                                document.body.innerHTML = [
+                                    '<h1>Heading</h1>',
+                                    '<a href="/static?q=1">Link 1</a>',
+                                    '<a href="/static?q=2">Link 2</a>',
+                                    '<a href="/static?q=3">Link 3</a>',
+                                    '<a href="/static?q=4">Link 4</a>',
+                                    '<a href="/static?q=5">Link 5</a>',
+                                ].join('');
+                            }, 100);
+                        </script>
+                    </head>
+                    <body></body>
+                </html>
+            `);
+            res.status(200);
+        });
+    });
+    afterAll(async () => {
+        server.close();
+    });
+
+    // Set up local storage emulator
+    const localStorageEmulator = new MemoryStorageEmulator();
+
+    beforeEach(async () => {
+        await localStorageEmulator.init();
+    });
+    afterAll(async () => {
+        await localStorageEmulator.destroy();
+    });
+
+    // Test setup helpers
+    const makeOneshotCrawler = async (
+        options: Required<Pick<AdaptivePlaywrightCrawlerOptions, 'requestHandler'>> & Partial<AdaptivePlaywrightCrawlerOptions>,
+        sources: string[],
+    ) => new AdaptivePlaywrightCrawler({
+        renderingTypeDetectionRatio: 0.1,
+        maxConcurrency: 1,
+        maxRequestRetries: 0,
+        maxRequestsPerCrawl: 1,
+        requestList: await RequestList.open({ sources }),
+        ...options,
+    });
+
+    const makeRiggedRenderingTypePredictor = (prediction: { detectionProbabilityRecommendation: number; renderingType: 'clientOnly' | 'static' }) => ({
+        predict: vi.fn((_url: URL) => prediction),
+        storeResult: vi.fn((_url: URL, _label: string | undefined, _renderingType: string) => {}),
+    });
+
+    describe('should detect page rendering type', () => {
+        test.each([['/static', 'static'], ['/dynamic', 'clientOnly']] as const)('for %s', async (path, expectedType) => {
+            const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 1, renderingType: 'clientOnly' });
+            const url = new URL(`http://${HOSTNAME}:${port}${path}`);
+
+            const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ pushData, querySelector }) => {
+                await pushData({
+                    heading: (await querySelector('h1')).text(),
+                });
+            });
+
+            const crawler = await makeOneshotCrawler(
+                {
+                    requestHandler,
+                    renderingTypePredictor,
+                },
+                [url.toString()],
+            );
+
+            await crawler.run();
+
+            // Check the detection result
+            expect(renderingTypePredictor.predict).toHaveBeenCalledWith(url, undefined);
+            expect(renderingTypePredictor.storeResult).toHaveBeenCalledWith(url, undefined, expectedType);
+
+            // Check that the request handler was called twice (once per rendering mode)
+            expect(requestHandler).toHaveBeenCalledTimes(2);
+
+            // Check that only one item was added to the dataset
+            expect(await localStorageEmulator.getDatasetItems()).toEqual([{ heading: 'Heading' }]);
+        });
+    });
+
+    test('should not store detection results on non-detection runs', async () => {
+        const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' });
+        const url = new URL(`http://${HOSTNAME}:${port}/static`);
+
+        const crawler = await makeOneshotCrawler(
+            {
+                requestHandler: async () => {},
+                renderingTypePredictor,
+            },
+            [url.toString()],
+        );
+
+        await crawler.run();
+
+        expect(renderingTypePredictor.predict).toHaveBeenCalledWith(url, undefined);
+        expect(renderingTypePredictor.storeResult).not.toHaveBeenCalled();
+    });
+
+    test('should retry with browser if result checker returns false', async () => {
+        const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' });
+        const url = new URL(`http://${HOSTNAME}:${port}/dynamic`);
+
+        const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ pushData, querySelector }) => {
+            await pushData({
+                heading: (await querySelector('h1')).text(),
+            });
+        });
+
+        const resultChecker: AdaptivePlaywrightCrawlerOptions['resultChecker'] = vi.fn(
+            (result) => result.datasetItems.length > 0
+                && result.datasetItems.every(({ item }) => item.heading?.length > 0),
+        );
+
+        const crawler = await makeOneshotCrawler(
+            {
+                requestHandler,
+                renderingTypePredictor,
+                resultChecker,
+            },
+            [url.toString()],
+        );
+
+        await crawler.run();
+
+        expect(requestHandler).toHaveBeenCalledTimes(2);
+        expect(resultChecker).toHaveBeenCalledTimes(1);
+    });
+
+    describe('should enqueue links correctly', () => {
+        test.each([['/static', 'static'], ['/dynamic', 'clientOnly']] as const)('for %s', async (path, renderingType) => {
+            const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType });
+            const url = new URL(`http://${HOSTNAME}:${port}${path}`);
+
+            const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ enqueueLinks }) => {
+                await enqueueLinks();
+            });
+
+            const crawler = await makeOneshotCrawler(
+                {
+                    requestHandler,
+                    renderingTypePredictor,
+                },
+                [url.toString()],
+            );
+
+            await crawler.run();
+
+            const enqueuedUrls = (await localStorageEmulator.getRequestQueueItems()).map((item) => item.url);
+            expect(new Set(enqueuedUrls)).toEqual(new Set([
+                `http://${HOSTNAME}:${port}/static?q=1`,
+                `http://${HOSTNAME}:${port}/static?q=2`,
+                `http://${HOSTNAME}:${port}/static?q=3`,
+                `http://${HOSTNAME}:${port}/static?q=4`,
+                `http://${HOSTNAME}:${port}/static?q=5`,
+            ]));
+        });
+    });
+
+    test('should persist crawler state', async () => {
+        const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' });
+
+        const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ useState }) => {
+            const state = await useState({ count: 0 });
+            state.count += 1;
+        });
+
+        const crawler = await makeOneshotCrawler(
+            {
+                requestHandler,
+                renderingTypePredictor,
+                maxRequestsPerCrawl: 3,
+            },
+            [
+                `http://${HOSTNAME}:${port}/static?q=1`,
+                `http://${HOSTNAME}:${port}/static?q=2`,
+                `http://${HOSTNAME}:${port}/static?q=3`,
+            ],
+        );
+
+        await crawler.run();
+        const state = await localStorageEmulator.getState();
+        expect(state.value).toEqual({ count: 3 });
+    });
+
+    test('should persist key-value store changes', async () => {
+        const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' });
+
+        const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ request, getKeyValueStore }) => {
+            const store = await getKeyValueStore();
+            const search = new URLSearchParams(new URL(request.url).search);
+            await store.setValue(search.get('q')!, { content: 42 });
+        });
+
+        const crawler = await makeOneshotCrawler(
+            {
+                requestHandler,
+                renderingTypePredictor,
+                maxRequestsPerCrawl: 3,
+            },
+            [
+                `http://${HOSTNAME}:${port}/static?q=1`,
+                `http://${HOSTNAME}:${port}/static?q=2`,
+                `http://${HOSTNAME}:${port}/static?q=3`,
+            ],
+        );
+
+        await crawler.run();
+        const store = localStorageEmulator.getKeyValueStore();
+
+        expect((await store.getRecord('1')).value).toEqual({ content: 42 });
+        expect((await store.getRecord('2')).value).toEqual({ content: 42 });
+        expect((await store.getRecord('3')).value).toEqual({ content: 42 });
+    });
+});
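The assertions above rely on the inspection helpers added to `MemoryStorageEmulator` in the next file. A condensed sketch of how a test can use them on its own (standalone, outside the suite above):

```ts
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator';

const emulator = new MemoryStorageEmulator();
await emulator.init();

// ... run a crawler against the emulated storage ...

// Reads the auto-saved crawler state record (stored under the 'CRAWLEE_STATE' key).
const state = await emulator.getState();

// Lists items pushed to the default dataset and requests waiting in the default queue.
const items = await emulator.getDatasetItems();
const queued = await emulator.getRequestQueueItems();

await emulator.destroy();
```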
diff --git a/test/shared/MemoryStorageEmulator.ts b/test/shared/MemoryStorageEmulator.ts
index 5a4f0374dfa..0f2b69b16a6 100644
--- a/test/shared/MemoryStorageEmulator.ts
+++ b/test/shared/MemoryStorageEmulator.ts
@@ -11,20 +11,50 @@ import { StorageEmulator } from './StorageEmulator';
 const LOCAL_EMULATION_DIR = resolve(__dirname, '..', 'tmp', 'memory-emulation-dir');
 
 export class MemoryStorageEmulator extends StorageEmulator {
+    private storage!: MemoryStorage;
+
     override async init({ dirName = cryptoRandomObjectId(10), persistStorage = false }: MemoryEmulatorOptions = {}) {
         await super.init();
         const localStorageDir = resolve(LOCAL_EMULATION_DIR, dirName);
         this.localStorageDirectories.push(localStorageDir);
         await ensureDir(localStorageDir);
 
-        const storage = new MemoryStorage({ localDataDirectory: localStorageDir, persistStorage, writeMetadata: false });
-        Configuration.getGlobalConfig().useStorageClient(storage);
+        this.storage = new MemoryStorage({ localDataDirectory: localStorageDir, persistStorage, writeMetadata: false });
+
+        Configuration.getGlobalConfig().useStorageClient(this.storage);
         log.debug(`Initialized emulated memory storage in folder ${localStorageDir}`);
     }
 
     static override toString() {
         return '@crawlee/memory-storage';
     }
+
+    getDataset(id?: string) {
+        return this.storage.dataset(id ?? Configuration.getGlobalConfig().get('defaultDatasetId'));
+    }
+
+    async getDatasetItems(id?: string) {
+        const dataset = this.getDataset(id);
+        return (await dataset.listItems()).items;
+    }
+
+    getRequestQueue(id?: string) {
+        return this.storage.requestQueue(id ?? Configuration.getGlobalConfig().get('defaultRequestQueueId'));
+    }
+
+    async getRequestQueueItems(id?: string) {
+        const requestQueue = this.getRequestQueue(id);
+        const { items: heads } = await requestQueue.listHead();
+        return heads;
+    }
+
+    getKeyValueStore(id?: string) {
+        return this.storage.keyValueStore(id ?? Configuration.getGlobalConfig().get('defaultKeyValueStoreId'));
+    }
+
+    async getState() {
+        return await this.getKeyValueStore().getRecord('CRAWLEE_STATE');
+    }
 }
 
 export interface MemoryEmulatorOptions {
diff --git a/yarn.lock b/yarn.lock
index bbc8cdbef05..a4dd41c9695 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -114,7 +114,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"@apify/timeout@npm:^0.3.0":
+"@apify/timeout@npm:^0.3.0, @apify/timeout@npm:^0.3.1":
   version: 0.3.1
   resolution: "@apify/timeout@npm:0.3.1"
   checksum: 10c0/3019f9ef14bad1e3675553e85032f87ef5a2552dcbb1d4b26305e00591647999000641a0660d30838ea1a6aaab558881bc9f5514e52614969bca9bdccaf96fa6
@@ -463,6 +463,7 @@ __metadata:
     "@crawlee/utils": "npm:3.7.3"
     ow: "npm:^0.28.1"
     tslib: "npm:^2.4.0"
+    type-fest: "npm:^4.0.0"
   languageName: unknown
   linkType: soft
 
@@ -600,14 +601,19 @@ __metadata:
   dependencies:
     "@apify/datastructures": "npm:^2.0.0"
    "@apify/log": "npm:^2.4.0"
+    "@apify/timeout": "npm:^0.3.1"
     "@crawlee/browser": "npm:3.7.3"
     "@crawlee/browser-pool": "npm:3.7.3"
+    "@crawlee/core": "npm:3.7.3"
     "@crawlee/types": "npm:3.7.3"
     "@crawlee/utils": "npm:3.7.3"
     cheerio: "npm:^1.0.0-rc.12"
     idcac-playwright: "npm:^0.1.2"
     jquery: "npm:^3.6.0"
+    lodash.isequal: "npm:^4.5.0"
+    ml-logistic-regression: "npm:^2.0.0"
     ow: "npm:^0.28.1"
+    string-comparison: "npm:^1.3.0"
     tslib: "npm:^2.4.0"
   peerDependencies:
     playwright: "*"
@@ -660,6 +666,7 @@ __metadata:
     "@types/htmlparser2": "npm:^3.10.3"
     "@types/inquirer": "npm:^8.2.1"
     "@types/is-ci": "npm:^3.0.1"
+    "@types/lodash.isequal": "npm:^4.5.8"
     "@types/lodash.merge": "npm:^4.6.7"
     "@types/mime-types": "npm:^2.1.1"
     "@types/node": "npm:^20.0.0"
@@ -2047,6 +2054,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@types/lodash.isequal@npm:^4.5.8":
+  version: 4.5.8
+  resolution: "@types/lodash.isequal@npm:4.5.8"
+  dependencies:
+    "@types/lodash": "npm:*"
+  checksum: 10c0/6db28cacf165d55421fbf2970ccfb1682a7b82b743bb7aba4398fa8ab98f1711fca2fe4afa1aa2b7b4afb3eff76c8aca13b22206f5efeb038d99e41300589bca
+  languageName: node
+  linkType: hard
+
 "@types/lodash.merge@npm:^4.6.7":
   version: 4.6.9
   resolution: "@types/lodash.merge@npm:4.6.9"
@@ -6719,6 +6735,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"is-any-array@npm:^2.0.0, is-any-array@npm:^2.0.1":
+  version: 2.0.1
+  resolution: "is-any-array@npm:2.0.1"
+  checksum: 10c0/f9807458a51e63ca1ac27fd6f3a3ace8200f077094e00d9b05b24cfbc9d5594d586d6ecf3416271f26939d5cb93fc52ca869cb5744e77318c3f53ec70b08d61f
+  languageName: node
+  linkType: hard
+
 "is-arguments@npm:^1.1.1":
   version: 1.1.1
   resolution: "is-arguments@npm:1.1.1"
@@ -8340,6 +8363,54 @@ __metadata:
   languageName: node
   linkType: hard
 
+"ml-array-max@npm:^1.2.4":
+  version: 1.2.4
+  resolution: "ml-array-max@npm:1.2.4"
+  dependencies:
+    is-any-array: "npm:^2.0.0"
+  checksum: 10c0/05eacc44ccc182f6d191bc7cd233c97b55ebd695e423f0f07e2358831af7d7a2f64e2698fee4e1edc9bbcd962b8df2141f52b5403219bc2432a77b2bab8e25df
+  languageName: node
+  linkType: hard
+
+"ml-array-min@npm:^1.2.3":
+  version: 1.2.3
+  resolution: "ml-array-min@npm:1.2.3"
+  dependencies:
+    is-any-array: "npm:^2.0.0"
+  checksum: 10c0/ba7aef2fd1bfe9f1937efa96242147c0ab30e5af4f22364b419a47f8c194b3c16b96b024fd6562eaddbf2d8648fcda3f54d3c2f394ec58beb756b9ed71c81593
+  languageName: node
+  linkType: hard
+
+"ml-array-rescale@npm:^1.3.7":
+  version: 1.3.7
+  resolution: "ml-array-rescale@npm:1.3.7"
+  dependencies:
+    is-any-array: "npm:^2.0.0"
+    ml-array-max: "npm:^1.2.4"
+    ml-array-min: "npm:^1.2.3"
+  checksum: 10c0/2b8fca33c38bee3c957e5e2f178d7f109d61acc2add924cc8a4a0eac30dfa2d0974642f25f2b4e46e82f9ac90de275268aca382bd644dac2960a83b349c7e706
+  languageName: node
+  linkType: hard
+
+"ml-logistic-regression@npm:^2.0.0":
+  version: 2.0.0
+  resolution: "ml-logistic-regression@npm:2.0.0"
+  dependencies:
+    ml-matrix: "npm:^6.5.0"
+  checksum: 10c0/8713dc63d98e08038fba537afb2e9f675065ec6b8b02dbb5dd33b7823bfcf627dac10cad270650e23afb4fb7c14547a856d89306d588f075924970290d96039a
+  languageName: node
+  linkType: hard
+
+"ml-matrix@npm:^6.5.0":
+  version: 6.11.0
+  resolution: "ml-matrix@npm:6.11.0"
+  dependencies:
+    is-any-array: "npm:^2.0.1"
+    ml-array-rescale: "npm:^1.3.7"
+  checksum: 10c0/bf64bc5037568d8b2fddc3b2d90cb927868ed5f7894208b3e88b178645411c19ce25351a02f08dbf071a7a8a6c81370410c2692f0f800341be38670c23853df5
+  languageName: node
+  linkType: hard
+
 "mlly@npm:^1.2.0, mlly@npm:^1.4.2":
   version: 1.5.0
   resolution: "mlly@npm:1.5.0"
@@ -10906,6 +10977,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"string-comparison@npm:^1.3.0":
+  version: 1.3.0
+  resolution: "string-comparison@npm:1.3.0"
+  checksum: 10c0/9118ea5d33cc3e9761b12b481e44a75a2e632564b33a511c24cc753105e844f2e0c5997ec5210d2a233a5669a635aaff65f74aa156a5205fba7cd0e81c53ceab
+  languageName: node
+  linkType: hard
+
 "string-width-cjs@npm:string-width@^4.2.0, string-width@npm:^1.0.2 || 2 || 3 || 4, string-width@npm:^4.1.0, string-width@npm:^4.2.0, string-width@npm:^4.2.3":
   version: 4.2.3
   resolution: "string-width@npm:4.2.3"