diff --git a/packages/core/src/crawlers/statistics.ts b/packages/core/src/crawlers/statistics.ts index eddc45a05b03..14af93518587 100644 --- a/packages/core/src/crawlers/statistics.ts +++ b/packages/core/src/crawlers/statistics.ts @@ -77,6 +77,11 @@ export class Statistics { */ readonly requestRetryHistogram: number[] = []; + /** + * Contains the associated Configuration instance + */ + private readonly config: Configuration; + private keyValueStore?: KeyValueStore = undefined; private persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`; private logIntervalMillis: number; @@ -111,6 +116,7 @@ export class Statistics { this.keyValueStore = keyValueStore; this.listener = this.persistState.bind(this); this.events = config.getEventManager(); + this.config = config; // initialize by "resetting" this.reset(); @@ -239,7 +245,7 @@ export class Statistics { * displaying the current state in predefined intervals */ async startCapturing() { - this.keyValueStore ??= await KeyValueStore.open(); + this.keyValueStore ??= await KeyValueStore.open(null, { config: this.config }); await this._maybeLoadStatistics(); diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index 8bfc94c41da1..83ff4e8f2601 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -301,7 +301,7 @@ export class Dataset { * @param [contentType] Only JSON and CSV are supported currently, defaults to JSON. */ async exportTo(key: string, options?: ExportOptions, contentType?: string): Promise { - const kvStore = await KeyValueStore.open(options?.toKVS ?? null); + const kvStore = await KeyValueStore.open(options?.toKVS ?? null, { config: this.config }); const items: Data[] = []; const fetchNextChunk = async (offset = 0): Promise => { diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index b1f1690c6248..750a13d9f276 100644 --- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -24,7 +24,7 @@ import vm from 'vm'; import { LruCache } from '@apify/datastructures'; import log_ from '@apify/log'; import type { Request } from '@crawlee/browser'; -import { validators, KeyValueStore, RequestState } from '@crawlee/browser'; +import { validators, KeyValueStore, RequestState, Configuration } from '@crawlee/browser'; import type { BatchAddRequestsResult } from '@crawlee/types'; import type { CheerioRoot, Dictionary } from '@crawlee/utils'; import * as cheerio from 'cheerio'; @@ -498,6 +498,12 @@ export interface SaveSnapshotOptions { * @default null */ keyValueStoreName?: string | null; + + /** + * Configuration of the crawler that will be used to save the snapshot. + * @default Configuration.getGlobalConfig() + */ + config?: Configuration; } /** @@ -513,6 +519,7 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} saveScreenshot: ow.optional.boolean, saveHtml: ow.optional.boolean, keyValueStoreName: ow.optional.string, + config: ow.optional.object, })); const { @@ -521,10 +528,11 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} saveScreenshot = true, saveHtml = true, keyValueStoreName, + config, } = options; try { - const store = await KeyValueStore.open(keyValueStoreName); + const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); if (saveScreenshot) { const screenshotName = `${key}.jpg`; @@ -756,7 +764,7 @@ export function registerUtilsToContext(context: PlaywrightCrawlingContext): void context.blockRequests = (options?: BlockRequestsOptions) => blockRequests(context.page, options); context.parseWithCheerio = () => parseWithCheerio(context.page); context.infiniteScroll = (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options); - context.saveSnapshot = (options?: SaveSnapshotOptions) => saveSnapshot(context.page, options); + context.saveSnapshot = (options?: SaveSnapshotOptions) => saveSnapshot(context.page, { ...options, config: context.crawler.config }); context.enqueueLinksByClickingElements = (options: Omit) => enqueueLinksByClickingElements({ ...options, page: context.page, diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts index f5c0d2fc8c2a..61b719b3f97a 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts @@ -24,7 +24,7 @@ import vm from 'vm'; import { LruCache } from '@apify/datastructures'; import log_ from '@apify/log'; import type { Request } from '@crawlee/browser'; -import { KeyValueStore, RequestState, validators } from '@crawlee/browser'; +import { KeyValueStore, RequestState, validators, Configuration } from '@crawlee/browser'; import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types'; import type { CheerioRoot } from '@crawlee/utils'; import * as cheerio from 'cheerio'; @@ -630,6 +630,12 @@ export interface SaveSnapshotOptions { * @default null */ keyValueStoreName?: string | null; + + /** + * Configuration of the crawler that will be used to save the snapshot. + * @default Configuration.getGlobalConfig() + */ + config?: Configuration; } /** @@ -645,6 +651,7 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} saveScreenshot: ow.optional.boolean, saveHtml: ow.optional.boolean, keyValueStoreName: ow.optional.string, + config: ow.optional.object, })); const { @@ -653,10 +660,11 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} saveScreenshot = true, saveHtml = true, keyValueStoreName, + config, } = options; try { - const store = await KeyValueStore.open(keyValueStoreName); + const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); if (saveScreenshot) { const screenshotName = `${key}.jpg`; @@ -962,7 +970,7 @@ export function registerUtilsToContext(context: PuppeteerCrawlingContext): void context.addInterceptRequestHandler = (handler: InterceptHandler) => addInterceptRequestHandler(context.page, handler); context.removeInterceptRequestHandler = (handler: InterceptHandler) => removeInterceptRequestHandler(context.page, handler); context.infiniteScroll = (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options); - context.saveSnapshot = (options?: SaveSnapshotOptions) => saveSnapshot(context.page, options); + context.saveSnapshot = (options?: SaveSnapshotOptions) => saveSnapshot(context.page, { ...options, config: context.crawler.config }); context.closeCookieModals = () => closeCookieModals(context.page); }