From 1c6b730cb0aeccc23cdfb6eb0c11e987f8aa4d9c Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Mon, 12 Feb 2024 20:26:18 +0300 Subject: [PATCH 01/28] feat: Implement ErrorSnapshotter for error context capture (crawlee#2280) This commit introduces the ErrorSnapshotter class to the crawlee package, providing functionality to capture screenshots and HTML snapshots when an error occurs during web crawling. --- .../src/internals/basic-crawler.ts | 4 +- packages/utils/src/index.ts | 1 + .../utils/src/internals/error_snapshotter.ts | 99 ++++++++++++++ packages/utils/src/internals/error_tracker.ts | 25 +++- .../test/non-error-objects-working.test.ts | 2 +- test/core/error_tracker.test.ts | 124 +++++++++--------- 6 files changed, 187 insertions(+), 68 deletions(-) create mode 100644 packages/utils/src/internals/error_snapshotter.ts diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index d966a670baf4..504c61da9654 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1363,7 +1363,7 @@ export class BasicCrawler { + const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; + const body = context?.body; + + const keyValueStore = await context?.getKeyValueStore(); + // If the key-value store is not available, or the body and page are not available, return empty filenames + if (!keyValueStore || (!body && !page)) { + return {}; + } + + const filename = this.generateFilename(error); + const screenshotFilename = page ? await this.captureScreenshot(page, keyValueStore, filename) : undefined; + + let htmlFilename: string | undefined; + if (typeof body === 'string') { + htmlFilename = await this.saveHTMLSnapshot(body, keyValueStore, filename); + } else if (page) { + const html = await page?.content() || ''; + htmlFilename = html ? await this.saveHTMLSnapshot(html, keyValueStore, filename) : ''; + } + + const basePath = `${ErrorSnapshotter.KEY_VALUE_STORE_PATH}/${keyValueStore.id}/records`; + + return { + screenshotFilename: screenshotFilename ? `${basePath}/${screenshotFilename}` : undefined, + htmlFilename: htmlFilename ? `${basePath}/${htmlFilename}` : undefined, + }; + } + + /** + * Capture a screenshot of the page (For Browser only), and return the filename with the extension. + */ + async captureScreenshot(page: PuppeteerPage | PlaywrightPage, keyValueStore: KeyValueStore, filename: string): Promise { + try { + const screenshotBuffer = await page.screenshot(); + + await keyValueStore.setValue(filename, screenshotBuffer, { contentType: 'image/png' }); + return `${filename}.png`; + } catch (e) { + return undefined; + } + } + + /** + * Save the HTML snapshot of the page, and return the filename with the extension. + */ + async saveHTMLSnapshot(html: string, keyValueStore: KeyValueStore, filename: string): Promise { + try { + await keyValueStore.setValue(filename, html, { contentType: 'text/html' }); + return `${filename}.html`; + } catch (e) { + return undefined; + } + } + + /** + * Generate a unique filename for each error snapshot. + */ + // Method to generate a unique filename for each error snapshot + generateFilename(error: ErrnoException): string { + // Create a hash of the error stack trace + const errorStackHash = crypto.createHash('sha1').update(error.stack || '').digest('hex').slice(0, 30); + // Extract the first 30 characters of the error message + const errorMessagePrefix = ( + error.message || ErrorSnapshotter.BASE_MESSAGE + ).substring(0, Math.min(ErrorSnapshotter.MAX_ERROR_CHARACTERS, error.message?.length || 0)); + + // Generate filename and remove disallowed characters + let filename = `${ErrorSnapshotter.SNAPSHOT_PREFIX}_${errorStackHash}_${errorMessagePrefix}`; + filename = filename.replace(/[^a-zA-Z0-9!-_.]/g, ''); + + // Ensure filename is not too long + if (filename.length > 250) { + filename = filename.slice(0, 250); // to allow space for the extension + } + + return filename; + } +} diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/utils/src/internals/error_tracker.ts index 259a4c33b5bd..19c519d0e8f7 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/utils/src/internals/error_tracker.ts @@ -1,9 +1,12 @@ import { inspect } from 'node:util'; +import type { CrawlingContext } from '@crawlee/basic'; +import { ErrorSnapshotter } from '@crawlee/utils'; + /** * Node.js Error interface */ - interface ErrnoException extends Error { +export interface ErrnoException extends Error { errno?: number | undefined; code?: string | number | undefined; path?: string | undefined; @@ -283,6 +286,8 @@ export class ErrorTracker { total: number; + errorSnapshotter: ErrorSnapshotter; + constructor(options: Partial = {}) { this.#options = { showErrorCode: true, @@ -294,11 +299,13 @@ export class ErrorTracker { ...options, }; + this.errorSnapshotter = new ErrorSnapshotter(); + this.result = Object.create(null); this.total = 0; } - add(error: ErrnoException) { + async add(error: ErrnoException, context?: CrawlingContext) { this.total++; let group = this.result; @@ -321,8 +328,13 @@ export class ErrorTracker { increaseCount(group as { count: number }); + // Capture a snapshot (screenshot and HTML) on the first occurrence of an error + if (group.count === 1 && context) { + await this.captureSnapshot(group, error, context); + } + if (typeof error.cause === 'object' && error.cause !== null) { - this.add(error.cause); + await this.add(error.cause); } } @@ -366,6 +378,13 @@ export class ErrorTracker { return result.sort((a, b) => b[0] - a[0]).slice(0, count); } + async captureSnapshot(storage: Record, error: ErrnoException, context: CrawlingContext) { + const { screenshotFilename, htmlFilename } = await this.errorSnapshotter.captureSnapshot(error, context); + + storage.firstErrorScreenshot = screenshotFilename; + storage.firstErrorHtmlSnapshot = htmlFilename; + } + reset() { // This actually safe, since we Object.create(null) so no prototype pollution can happen. // eslint-disable-next-line no-restricted-syntax, guard-for-in diff --git a/packages/utils/test/non-error-objects-working.test.ts b/packages/utils/test/non-error-objects-working.test.ts index 167f1b44b212..68c69b6f0294 100644 --- a/packages/utils/test/non-error-objects-working.test.ts +++ b/packages/utils/test/non-error-objects-working.test.ts @@ -4,6 +4,6 @@ describe('ErrorTracker', () => { test('processing a non-error error should not crash', () => { const errorTracker = new ErrorTracker(); // @ts-expect-error tracking non-error errors - expect(() => errorTracker.add('foo')).not.toThrow(); + expect(async () => await errorTracker.add('foo')).not.toThrow(); }); }); diff --git a/test/core/error_tracker.test.ts b/test/core/error_tracker.test.ts index d51dec7e8a00..e8c78745cc2c 100644 --- a/test/core/error_tracker.test.ts +++ b/test/core/error_tracker.test.ts @@ -83,7 +83,7 @@ const errorRandomStack = { cause: undefined as any, } as const; -test('works', () => { +test('works', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -97,7 +97,7 @@ test('works', () => { expect(tracker.result).toMatchObject({}); - tracker.add(g(e)); + await tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -112,7 +112,7 @@ test('works', () => { }); expect(tracker.getUniqueErrorCount()).toBe(1); - tracker.add(g(e)); + await tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -128,7 +128,7 @@ test('works', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('no code is null code', () => { +test('no code is null code', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -140,8 +140,8 @@ test('no code is null code', () => { const e = errorNoCode; - tracker.add(g(e)); - tracker.add(g(e)); + await tracker.add(g(e)); + await tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -157,7 +157,7 @@ test('no code is null code', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error code', () => { +test('can hide error code', async () => { const tracker = new ErrorTracker({ showErrorCode: false, showErrorMessage: true, @@ -169,8 +169,8 @@ test('can hide error code', () => { const e = errorRandomCode; - tracker.add(g(errorRandomCode)); - tracker.add(g(errorRandomCode)); + await tracker.add(g(errorRandomCode)); + await tracker.add(g(errorRandomCode)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -184,7 +184,7 @@ test('can hide error code', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error name', () => { +test('can hide error name', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -196,8 +196,8 @@ test('can hide error name', () => { const e = errorRandomName; - tracker.add(g(e)); - tracker.add(g(e)); + await tracker.add(g(e)); + await tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -211,7 +211,7 @@ test('can hide error name', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error message', () => { +test('can hide error message', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: false, @@ -223,8 +223,8 @@ test('can hide error message', () => { const e = errorRandomMessage; - tracker.add(g(errorRandomMessage)); - tracker.add(g(errorRandomMessage)); + await tracker.add(g(errorRandomMessage)); + await tracker.add(g(errorRandomMessage)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -238,7 +238,7 @@ test('can hide error message', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error stack', () => { +test('can hide error stack', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -248,8 +248,8 @@ test('can hide error stack', () => { showFullMessage: true, }); - tracker.add(g(errorRandomStack)); - tracker.add(g(errorRandomStack)); + await tracker.add(g(errorRandomStack)); + await tracker.add(g(errorRandomStack)); expect(tracker.result).toMatchObject({ 'ERR_INVALID_URL': { // code @@ -263,7 +263,7 @@ test('can hide error stack', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can display full stack', () => { +test('can display full stack', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -275,8 +275,8 @@ test('can display full stack', () => { const e = multilineError; - tracker.add(g(e)); - tracker.add(g(e)); + await tracker.add(g(e)); + await tracker.add(g(e)); expect(tracker.result).toMatchObject({ [s(e.stack)]: { // source @@ -292,7 +292,7 @@ test('can display full stack', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('stack looks for user files first', () => { +test('stack looks for user files first', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -304,8 +304,8 @@ test('stack looks for user files first', () => { const e = multilineError; - tracker.add(g(e)); - tracker.add(g(e)); + await tracker.add(g(e)); + await tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -321,7 +321,7 @@ test('stack looks for user files first', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can shorten the message to the first line', () => { +test('can shorten the message to the first line', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -333,7 +333,7 @@ test('can shorten the message to the first line', () => { const e = g(multilineError); - tracker.add(e); + await tracker.add(e); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -349,7 +349,7 @@ test('can shorten the message to the first line', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('supports error.cause', () => { +test('supports error.cause', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -362,7 +362,7 @@ test('supports error.cause', () => { const e = g(multilineError); e.cause = g(errorRandomMessage); - tracker.add(e); + await tracker.add(e); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -381,7 +381,7 @@ test('supports error.cause', () => { expect(tracker.getUniqueErrorCount()).toBe(2); }); -test('placeholder #1', () => { +test('placeholder #1', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -391,17 +391,17 @@ test('placeholder #1', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected boolean, got number', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected boolean, got string', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected boolean, got undefined', }); @@ -420,7 +420,7 @@ test('placeholder #1', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #2', () => { +test('placeholder #2', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -430,17 +430,17 @@ test('placeholder #2', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `string`', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `undefined`', }); @@ -459,7 +459,7 @@ test('placeholder #2', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #3', () => { +test('placeholder #3', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -469,17 +469,17 @@ test('placeholder #3', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 2 3', }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 4 3', }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 5 3', }); @@ -498,7 +498,7 @@ test('placeholder #3', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #4', () => { +test('placeholder #4', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -508,17 +508,17 @@ test('placeholder #4', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 2 3', }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 2 4', }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 2 5', }); @@ -537,7 +537,7 @@ test('placeholder #4', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #5', () => { +test('placeholder #5', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -547,17 +547,17 @@ test('placeholder #5', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: '1 2 3', }); - tracker.add({ + await tracker.add({ name: 'Error', message: '4 2 3', }); - tracker.add({ + await tracker.add({ name: 'Error', message: '5 2 3', }); @@ -576,7 +576,7 @@ test('placeholder #5', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #6', () => { +test('placeholder #6', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -586,17 +586,17 @@ test('placeholder #6', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'The weather is sunny today, but the grass is wet.', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'The weather is rainy today, but the grass is still dry.', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'The weather is wild today, however the grass is yellow.', }); @@ -615,7 +615,7 @@ test('placeholder #6', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #7', () => { +test('placeholder #7', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -625,12 +625,12 @@ test('placeholder #7', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); @@ -648,7 +648,7 @@ test('placeholder #7', () => { }); expect(tracker.getUniqueErrorCount()).toBe(1); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `falsy value`', }); @@ -667,7 +667,7 @@ test('placeholder #7', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #8', () => { +test('placeholder #8', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -677,12 +677,12 @@ test('placeholder #8', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Expected `string`, got `null`', }); @@ -704,7 +704,7 @@ test('placeholder #8', () => { expect(tracker.getUniqueErrorCount()).toBe(2); }); -test('placeholder #9', () => { +test('placeholder #9', async () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -714,12 +714,12 @@ test('placeholder #9', () => { showFullMessage: true, }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Unexpected `show` property in `options` object', }); - tracker.add({ + await tracker.add({ name: 'Error', message: 'Missing `display` in style', }); From 994965dd25b1b4dc775b01dee1457210f7bef964 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Mon, 12 Feb 2024 21:45:50 +0300 Subject: [PATCH 02/28] fix: update imports in error_snapshotter.ts and error_tracker.ts --- packages/utils/src/internals/error_snapshotter.ts | 7 ++++--- packages/utils/src/internals/error_tracker.ts | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/utils/src/internals/error_snapshotter.ts b/packages/utils/src/internals/error_snapshotter.ts index 610d029f3ca8..d4c5d85c5b0b 100644 --- a/packages/utils/src/internals/error_snapshotter.ts +++ b/packages/utils/src/internals/error_snapshotter.ts @@ -1,11 +1,12 @@ -import * as crypto from 'crypto'; +import crypto from 'node:crypto'; import type { CrawlingContext } from '@crawlee/basic'; -import type { ErrnoException } from '@crawlee/utils'; -import type { KeyValueStore } from 'packages/core/src/storages'; +import type { KeyValueStore } from '@crawlee/core'; import type { Page as PlaywrightPage } from 'playwright'; import type { Page as PuppeteerPage } from 'puppeteer'; +import type { ErrnoException } from './error_tracker'; + /** * ErrorSnapshotter class is used to capture a screenshot of the page and a snapshot of the HTML when an error occur during web crawling. */ diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/utils/src/internals/error_tracker.ts index 19c519d0e8f7..59a24e546ec2 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/utils/src/internals/error_tracker.ts @@ -1,7 +1,8 @@ import { inspect } from 'node:util'; import type { CrawlingContext } from '@crawlee/basic'; -import { ErrorSnapshotter } from '@crawlee/utils'; + +import { ErrorSnapshotter } from './error_snapshotter'; /** * Node.js Error interface From 5f50e30181dfd3e680e2d82c373bdd5e1e619d06 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Tue, 13 Feb 2024 02:03:20 +0300 Subject: [PATCH 03/28] refactor: error handling and snapshot capture --- packages/basic-crawler/src/internals/basic-crawler.ts | 11 +++++++++-- packages/utils/src/internals/error_snapshotter.ts | 2 +- packages/utils/src/internals/error_tracker.ts | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 504c61da9654..69b75a779711 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1363,7 +1363,7 @@ export class BasicCrawler { try { - const screenshotBuffer = await page.screenshot(); + const screenshotBuffer = await page.screenshot({ fullPage: true, type: 'png' }); await keyValueStore.setValue(filename, screenshotBuffer, { contentType: 'image/png' }); return `${filename}.png`; diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/utils/src/internals/error_tracker.ts index 59a24e546ec2..b232054199f3 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/utils/src/internals/error_tracker.ts @@ -331,7 +331,7 @@ export class ErrorTracker { // Capture a snapshot (screenshot and HTML) on the first occurrence of an error if (group.count === 1 && context) { - await this.captureSnapshot(group, error, context); + await this.captureSnapshot(group, error, context).catch(() => {}); } if (typeof error.cause === 'object' && error.cause !== null) { From 4061fed03872347856994eeb4a164306895bc8e1 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 21 Feb 2024 10:33:43 +0300 Subject: [PATCH 04/28] fix: update import statement in error_tracker.ts --- packages/utils/src/internals/error_tracker.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/utils/src/internals/error_tracker.ts index b232054199f3..5413e6abec4a 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/utils/src/internals/error_tracker.ts @@ -1,6 +1,6 @@ import { inspect } from 'node:util'; -import type { CrawlingContext } from '@crawlee/basic'; +import type { CrawlingContext } from '@crawlee/core'; import { ErrorSnapshotter } from './error_snapshotter'; @@ -331,7 +331,7 @@ export class ErrorTracker { // Capture a snapshot (screenshot and HTML) on the first occurrence of an error if (group.count === 1 && context) { - await this.captureSnapshot(group, error, context).catch(() => {}); + await this.captureSnapshot(group, error, context).catch(() => { }); } if (typeof error.cause === 'object' && error.cause !== null) { @@ -380,10 +380,10 @@ export class ErrorTracker { } async captureSnapshot(storage: Record, error: ErrnoException, context: CrawlingContext) { - const { screenshotFilename, htmlFilename } = await this.errorSnapshotter.captureSnapshot(error, context); + const { screenshotFilename, htmlFileName } = await this.errorSnapshotter.captureSnapshot(error, context); storage.firstErrorScreenshot = screenshotFilename; - storage.firstErrorHtmlSnapshot = htmlFilename; + storage.firstErrorHtml = htmlFileName; } reset() { From 512b3ff77129290fc6c9e882b298da6586c8c990 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 21 Feb 2024 10:35:36 +0300 Subject: [PATCH 05/28] refactor: updated ErrorSnapshotter class snapshot methods and handled local snapshots paths --- .../utils/src/internals/error_snapshotter.ts | 81 +++++++++++++------ 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/packages/utils/src/internals/error_snapshotter.ts b/packages/utils/src/internals/error_snapshotter.ts index ea5fb8922b9e..f897b930c413 100644 --- a/packages/utils/src/internals/error_snapshotter.ts +++ b/packages/utils/src/internals/error_snapshotter.ts @@ -1,12 +1,15 @@ import crypto from 'node:crypto'; -import type { CrawlingContext } from '@crawlee/basic'; -import type { KeyValueStore } from '@crawlee/core'; +import type { KeyValueStore, CrawlingContext } from '@crawlee/core'; +import type { PlaywrightCrawlingContext } from '@crawlee/playwright'; +import type { PuppeteerCrawlingContext } from '@crawlee/puppeteer'; import type { Page as PlaywrightPage } from 'playwright'; import type { Page as PuppeteerPage } from 'puppeteer'; import type { ErrnoException } from './error_tracker'; +const { PWD, CRAWLEE_STORAGE_DIR, APIFY_IS_AT_HOME } = process.env; + /** * ErrorSnapshotter class is used to capture a screenshot of the page and a snapshot of the HTML when an error occur during web crawling. */ @@ -14,12 +17,13 @@ export class ErrorSnapshotter { static readonly MAX_ERROR_CHARACTERS = 30; static readonly BASE_MESSAGE = 'An error occurred'; static readonly SNAPSHOT_PREFIX = 'SNAPSHOT'; - static readonly KEY_VALUE_STORE_PATH = 'https://api.apify.com/v2/key-value-stores'; + static readonly KEY_VALUE_PLATFORM_PATH = 'https://api.apify.com/v2/key-value-stores'; + static readonly KEY_VALUE_STORE_LOCAL_PATH = `file://${PWD}/${CRAWLEE_STORAGE_DIR}/key_value_stores` || `file://${PWD}/storage/key_value_stores`; /** * Capture a snapshot of the error context. */ - async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFilename?: string; htmlFilename?: string }> { + async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFilename?: string; htmlFileName?: string }> { const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; const body = context?.body; @@ -30,33 +34,62 @@ export class ErrorSnapshotter { } const filename = this.generateFilename(error); - const screenshotFilename = page ? await this.captureScreenshot(page, keyValueStore, filename) : undefined; - - let htmlFilename: string | undefined; - if (typeof body === 'string') { - htmlFilename = await this.saveHTMLSnapshot(body, keyValueStore, filename); - } else if (page) { - const html = await page?.content() || ''; - htmlFilename = html ? await this.saveHTMLSnapshot(html, keyValueStore, filename) : ''; + + let screenshotFilename: string | undefined; + let htmlFileName: string | undefined; + + if (page) { + const capturedFiles = await this.captureSnapShot( + context as + | PlaywrightCrawlingContext + | PuppeteerCrawlingContext, + filename, + ); + + if (capturedFiles) { + screenshotFilename = capturedFiles.screenshotFilename; + htmlFileName = capturedFiles.htmlFileName; + } + + // If the snapshot for browsers failed to capture the HTML, try to capture it from the page content + if (!htmlFileName) { + const html = await page?.content() || undefined; + htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, filename) : undefined; + } + } else if (typeof body === 'string') { // for non-browser contexts + htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, filename); } - const basePath = `${ErrorSnapshotter.KEY_VALUE_STORE_PATH}/${keyValueStore.id}/records`; + if (APIFY_IS_AT_HOME) { + const platformPath = `${ErrorSnapshotter.KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; + return { + screenshotFilename: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, + htmlFileName: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, + }; + } + const localPath = `${ErrorSnapshotter.KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { - screenshotFilename: screenshotFilename ? `${basePath}/${screenshotFilename}` : undefined, - htmlFilename: htmlFilename ? `${basePath}/${htmlFilename}` : undefined, + screenshotFilename: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, + htmlFileName: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, }; } /** - * Capture a screenshot of the page (For Browser only), and return the filename with the extension. + * Capture a screenshot and HTML of the page (For Browser only), and return the filename with the extension. */ - async captureScreenshot(page: PuppeteerPage | PlaywrightPage, keyValueStore: KeyValueStore, filename: string): Promise { + async captureSnapShot( + context: PlaywrightCrawlingContext | PuppeteerCrawlingContext, + filename: string): Promise<{ + screenshotFilename?: string; + htmlFileName?: string; + } | undefined> { try { - const screenshotBuffer = await page.screenshot({ fullPage: true, type: 'png' }); - - await keyValueStore.setValue(filename, screenshotBuffer, { contentType: 'image/png' }); - return `${filename}.png`; + await context.saveSnapshot({ key: filename }); + return { + screenshotFilename: `${filename}.jpg`, + htmlFileName: `${filename}.html`, + }; } catch (e) { return undefined; } @@ -84,15 +117,15 @@ export class ErrorSnapshotter { // Extract the first 30 characters of the error message const errorMessagePrefix = ( error.message || ErrorSnapshotter.BASE_MESSAGE - ).substring(0, Math.min(ErrorSnapshotter.MAX_ERROR_CHARACTERS, error.message?.length || 0)); + ).slice(0, ErrorSnapshotter.MAX_ERROR_CHARACTERS); // Generate filename and remove disallowed characters let filename = `${ErrorSnapshotter.SNAPSHOT_PREFIX}_${errorStackHash}_${errorMessagePrefix}`; - filename = filename.replace(/[^a-zA-Z0-9!-_.]/g, ''); + filename = filename.replace(/[^a-zA-Z0-9!-_.]/g, '-'); // Ensure filename is not too long if (filename.length > 250) { - filename = filename.slice(0, 250); // to allow space for the extension + filename = filename.slice(0, 250); // 250 to allow space for the extension } return filename; From 33019449a7c7579e4613bcb90d0557b56c68320d Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 21 Feb 2024 13:02:47 +0300 Subject: [PATCH 06/28] refactor: update file paths for error snapshots --- packages/utils/src/internals/error_snapshotter.ts | 10 +++++----- packages/utils/src/internals/error_tracker.ts | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/utils/src/internals/error_snapshotter.ts b/packages/utils/src/internals/error_snapshotter.ts index f897b930c413..9e8475898bbe 100644 --- a/packages/utils/src/internals/error_snapshotter.ts +++ b/packages/utils/src/internals/error_snapshotter.ts @@ -23,7 +23,7 @@ export class ErrorSnapshotter { /** * Capture a snapshot of the error context. */ - async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFilename?: string; htmlFileName?: string }> { + async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFileUrl?: string; htmlFileUrl?: string }> { const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; const body = context?.body; @@ -63,15 +63,15 @@ export class ErrorSnapshotter { if (APIFY_IS_AT_HOME) { const platformPath = `${ErrorSnapshotter.KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; return { - screenshotFilename: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, - htmlFileName: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, + screenshotFileUrl: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, + htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, }; } const localPath = `${ErrorSnapshotter.KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { - screenshotFilename: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, - htmlFileName: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, + screenshotFileUrl: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, + htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, }; } diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/utils/src/internals/error_tracker.ts index 5413e6abec4a..ef591421845c 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/utils/src/internals/error_tracker.ts @@ -380,10 +380,10 @@ export class ErrorTracker { } async captureSnapshot(storage: Record, error: ErrnoException, context: CrawlingContext) { - const { screenshotFilename, htmlFileName } = await this.errorSnapshotter.captureSnapshot(error, context); + const { screenshotFileUrl, htmlFileUrl } = await this.errorSnapshotter.captureSnapshot(error, context); - storage.firstErrorScreenshot = screenshotFilename; - storage.firstErrorHtml = htmlFileName; + storage.firstErrorScreenshotUrl = screenshotFileUrl; + storage.firstErrorHtmlUrl = htmlFileUrl; } reset() { From 00c7d1ad7d5dd6bb056707970a07a6a4251802cc Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 10:06:58 +0300 Subject: [PATCH 07/28] refactor: shorten the code in error_snapshotter.ts and update error files names --- .../src/internals/basic-crawler.ts | 3 +- .../utils/src/internals/error_snapshotter.ts | 41 +++++++++++-------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 69b75a779711..8fba69b4d648 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1392,7 +1392,8 @@ export class BasicCrawler { + const { KEY_VALUE_PLATFORM_PATH, KEY_VALUE_STORE_LOCAL_PATH } = ErrorSnapshotter; const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; const body = context?.body; @@ -61,14 +64,14 @@ export class ErrorSnapshotter { } if (APIFY_IS_AT_HOME) { - const platformPath = `${ErrorSnapshotter.KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; + const platformPath = `${KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; return { screenshotFileUrl: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, }; } - const localPath = `${ErrorSnapshotter.KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; + const localPath = `${KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { screenshotFileUrl: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, @@ -87,7 +90,8 @@ export class ErrorSnapshotter { try { await context.saveSnapshot({ key: filename }); return { - screenshotFilename: `${filename}.jpg`, + // The screenshot file extension is different for Apify and local environments + screenshotFilename: `${filename}${APIFY_IS_AT_HOME ? '.jpg' : '.jpeg'}`, htmlFileName: `${filename}.html`, }; } catch (e) { @@ -107,26 +111,27 @@ export class ErrorSnapshotter { } } + /** + * Remove non-word characters from the start and end of a string. + */ + sanitizeString(str: string): string { + return str.replace(/^\W+|\W+$/g, ''); + } + /** * Generate a unique filename for each error snapshot. */ - // Method to generate a unique filename for each error snapshot generateFilename(error: ErrnoException): string { + const { SNAPSHOT_PREFIX, BASE_MESSAGE, MAX_HASH_LENGTH, MAX_ERROR_CHARACTERS, MAX_FILENAME_LENGTH } = ErrorSnapshotter; + const { sanitizeString } = this; // Create a hash of the error stack trace - const errorStackHash = crypto.createHash('sha1').update(error.stack || '').digest('hex').slice(0, 30); - // Extract the first 30 characters of the error message - const errorMessagePrefix = ( - error.message || ErrorSnapshotter.BASE_MESSAGE - ).slice(0, ErrorSnapshotter.MAX_ERROR_CHARACTERS); + const errorStackHash = crypto.createHash('sha1').update(error.stack || error.message || '').digest('hex').slice(0, MAX_HASH_LENGTH); + const errorMessagePrefix = (error.message || BASE_MESSAGE).slice(0, MAX_ERROR_CHARACTERS).trim(); // Generate filename and remove disallowed characters - let filename = `${ErrorSnapshotter.SNAPSHOT_PREFIX}_${errorStackHash}_${errorMessagePrefix}`; - filename = filename.replace(/[^a-zA-Z0-9!-_.]/g, '-'); - - // Ensure filename is not too long - if (filename.length > 250) { - filename = filename.slice(0, 250); // 250 to allow space for the extension - } + const filename = `${SNAPSHOT_PREFIX}_${sanitizeString(errorStackHash)}_${sanitizeString(errorMessagePrefix)}` + .replace(/\W+/g, '-') // Replace non-word characters with a dash + .slice(0, MAX_FILENAME_LENGTH); return filename; } From 3205465bf7404331699a3982872a0332ad077984 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 10:13:06 +0300 Subject: [PATCH 08/28] chore: add tests for ErrorSnapshotter --- .../actor/.actor/actor.json | 7 +++ .../cheerio-error-snapshot/actor/.gitignore | 7 +++ .../cheerio-error-snapshot/actor/Dockerfile | 16 ++++++ test/e2e/cheerio-error-snapshot/actor/main.js | 55 +++++++++++++++++++ .../cheerio-error-snapshot/actor/package.json | 28 ++++++++++ test/e2e/cheerio-error-snapshot/test.mjs | 10 ++++ .../actor/.actor/actor.json | 7 +++ .../puppeteer-error-snapshot/actor/.gitignore | 7 +++ .../puppeteer-error-snapshot/actor/Dockerfile | 23 ++++++++ .../puppeteer-error-snapshot/actor/main.js | 54 ++++++++++++++++++ .../actor/package.json | 29 ++++++++++ test/e2e/puppeteer-error-snapshot/test.mjs | 10 ++++ 12 files changed, 253 insertions(+) create mode 100644 test/e2e/cheerio-error-snapshot/actor/.actor/actor.json create mode 100644 test/e2e/cheerio-error-snapshot/actor/.gitignore create mode 100644 test/e2e/cheerio-error-snapshot/actor/Dockerfile create mode 100644 test/e2e/cheerio-error-snapshot/actor/main.js create mode 100644 test/e2e/cheerio-error-snapshot/actor/package.json create mode 100644 test/e2e/cheerio-error-snapshot/test.mjs create mode 100644 test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json create mode 100644 test/e2e/puppeteer-error-snapshot/actor/.gitignore create mode 100644 test/e2e/puppeteer-error-snapshot/actor/Dockerfile create mode 100644 test/e2e/puppeteer-error-snapshot/actor/main.js create mode 100644 test/e2e/puppeteer-error-snapshot/actor/package.json create mode 100644 test/e2e/puppeteer-error-snapshot/test.mjs diff --git a/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json b/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json new file mode 100644 index 000000000000..051de11176db --- /dev/null +++ b/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json @@ -0,0 +1,7 @@ +{ + "actorSpecification": 1, + "name": "test-cheerio-throw-on-ssl-errors", + "version": "0.0", + "buildTag": "latest", + "env": null +} diff --git a/test/e2e/cheerio-error-snapshot/actor/.gitignore b/test/e2e/cheerio-error-snapshot/actor/.gitignore new file mode 100644 index 000000000000..ced7cbfc582d --- /dev/null +++ b/test/e2e/cheerio-error-snapshot/actor/.gitignore @@ -0,0 +1,7 @@ +.idea +.DS_Store +node_modules +package-lock.json +apify_storage +crawlee_storage +storage diff --git a/test/e2e/cheerio-error-snapshot/actor/Dockerfile b/test/e2e/cheerio-error-snapshot/actor/Dockerfile new file mode 100644 index 000000000000..4874669fcad0 --- /dev/null +++ b/test/e2e/cheerio-error-snapshot/actor/Dockerfile @@ -0,0 +1,16 @@ +FROM apify/actor-node:18-beta + +COPY packages ./packages +COPY package*.json ./ + +RUN npm --quiet set progress=false \ + && npm install --only=prod --no-optional --no-audit \ + && npm update --no-audit \ + && echo "Installed NPM packages:" \ + && (npm list --only=prod --no-optional --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version + +COPY . ./ diff --git a/test/e2e/cheerio-error-snapshot/actor/main.js b/test/e2e/cheerio-error-snapshot/actor/main.js new file mode 100644 index 000000000000..7836911a1e03 --- /dev/null +++ b/test/e2e/cheerio-error-snapshot/actor/main.js @@ -0,0 +1,55 @@ +import { CheerioCrawler } from '@crawlee/cheerio'; +import { sleep } from '@crawlee/utils'; +import { Actor } from 'apify'; + +const mainOptions = { + exit: Actor.isAtHome(), + storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, +}; + +const LABELS = { + TIMEOUT: 'TIMEOUT', + TYPE_ERROR: 'TYPE_ERROR', + ERROR_OPENING_PAGE: 'ERROR_OPENING_PAGE', + POST_NAVIGATION_ERROR: 'POST_NAVIGATION_ERROR', +}; + +// Pre Navigation errors snapshots will not be saved as we don't get the response in the context +await Actor.main(async () => { + const crawler = new CheerioCrawler({ + requestHandlerTimeoutSecs: 2, + maxRequestRetries: 0, + async requestHandler({ $, request, log }) { + const { userData: { label } } = request; + + if (label === LABELS.TIMEOUT) { + log.error('Timeout error'); + await sleep(20_000); + } if (label === LABELS.TYPE_ERROR) { + log.error('TypeError: $(...).error is not a function'); + $().error(); + } else if (label === LABELS.ERROR_OPENING_PAGE) { + log.error('Error opening page'); + throw new Error('An error occurred while opening the page'); + } + }, + postNavigationHooks: [async ({ request, log }) => { + const { userData: { label } } = request; + + // Post navigation errors snapshots are not saved as we don't get the body in the context + if (label === LABELS.POST_NAVIGATION_ERROR) { + log.error('Post navigation error'); + throw new Error(' Unable to navigate to the requested post'); + } + }], + }); + + await crawler.run(Object.values(LABELS).map((label) => ({ url: 'https://example.com', userData: { label }, uniqueKey: label }))); + + // await crawler.run([ + // { url: 'https://example.com', userData: { label: 'TIMEOUT' }, uniqueKey: 'TIMEOUT' }, + // { url: 'https://example.com', userData: { label: 'TYPE_ERROR' }, uniqueKey: 'TYPE_ERROR' }, + // { url: 'https://example.com', userData: { label: 'ERROR_OPENING_PAGE' }, uniqueKey: 'ERROR_OPENING_PAGE' }, + // { url: 'https://example.com', userData: { label: 'POST_NAVIGATION_ERROR' }, uniqueKey: 'POST_NAVIGATION_ERROR' }, + // ]); +}, mainOptions); diff --git a/test/e2e/cheerio-error-snapshot/actor/package.json b/test/e2e/cheerio-error-snapshot/actor/package.json new file mode 100644 index 000000000000..3a0a07ab904a --- /dev/null +++ b/test/e2e/cheerio-error-snapshot/actor/package.json @@ -0,0 +1,28 @@ +{ + "name": "test-cheerio-throw-on-ssl-errors", + "version": "0.0.1", + "description": "Cheerio Crawler Test - Should throw on SSL Errors", + "dependencies": { + "apify": "next", + "@apify/storage-local": "^2.1.3", + "@crawlee/basic": "file:./packages/basic-crawler", + "@crawlee/browser-pool": "file:./packages/browser-pool", + "@crawlee/http": "file:./packages/http-crawler", + "@crawlee/cheerio": "file:./packages/cheerio-crawler", + "@crawlee/core": "file:./packages/core", + "@crawlee/memory-storage": "file:./packages/memory-storage", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils" + }, + "overrides": { + "apify": { + "@crawlee/core": "file:./packages/core", + "@crawlee/utils": "file:./packages/utils" + } + }, + "scripts": { + "start": "node main.js" + }, + "type": "module", + "license": "ISC" +} diff --git a/test/e2e/cheerio-error-snapshot/test.mjs b/test/e2e/cheerio-error-snapshot/test.mjs new file mode 100644 index 000000000000..7a0f1d9bb1e0 --- /dev/null +++ b/test/e2e/cheerio-error-snapshot/test.mjs @@ -0,0 +1,10 @@ +import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; + +const testActorDirname = getActorTestDir(import.meta.url); +await initialize(testActorDirname); + +const { stats, defaultKeyValueStoreItems } = await runActor(testActorDirname); + +await expect(stats.requestsFailed === 4, 'All requests failed'); +// Count of error HTML files stored in the Key-Value store +await expect(defaultKeyValueStoreItems.length === 3, 'Number of HTML error snapshots in KV store'); diff --git a/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json b/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json new file mode 100644 index 000000000000..224de98d2f9e --- /dev/null +++ b/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json @@ -0,0 +1,7 @@ +{ + "actorSpecification": 1, + "name": "test-puppeteer-throw-on-ssl-errors", + "version": "0.0", + "buildTag": "latest", + "env": null +} diff --git a/test/e2e/puppeteer-error-snapshot/actor/.gitignore b/test/e2e/puppeteer-error-snapshot/actor/.gitignore new file mode 100644 index 000000000000..ced7cbfc582d --- /dev/null +++ b/test/e2e/puppeteer-error-snapshot/actor/.gitignore @@ -0,0 +1,7 @@ +.idea +.DS_Store +node_modules +package-lock.json +apify_storage +crawlee_storage +storage diff --git a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile new file mode 100644 index 000000000000..840bbbde8dff --- /dev/null +++ b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile @@ -0,0 +1,23 @@ +FROM node:18 AS builder + +COPY /packages ./packages +COPY /package*.json ./ +RUN npm --quiet set progress=false \ + && npm install --only=prod --no-optional --no-audit \ + && npm update + +FROM apify/actor-node-puppeteer-chrome:18-beta + +RUN rm -r node_modules +COPY --from=builder /node_modules ./node_modules +COPY --from=builder /packages ./packages +COPY --from=builder /package*.json ./ +COPY /.actor ./.actor +COPY /main.js ./ + +RUN echo "Installed NPM packages:" \ + && (npm list --only=prod --no-optional --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version diff --git a/test/e2e/puppeteer-error-snapshot/actor/main.js b/test/e2e/puppeteer-error-snapshot/actor/main.js new file mode 100644 index 000000000000..b51d08609239 --- /dev/null +++ b/test/e2e/puppeteer-error-snapshot/actor/main.js @@ -0,0 +1,54 @@ +import { PuppeteerCrawler } from '@crawlee/puppeteer'; +import { sleep } from '@crawlee/utils'; +import { Actor } from 'apify'; + +const mainOptions = { + exit: Actor.isAtHome(), + storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, +}; + +const LABELS = { + TIMEOUT: 'TIMEOUT', + TYPE_ERROR: 'TYPE_ERROR', + ERROR_OPENING_PAGE: 'ERROR_OPENING_PAGE', + POST_NAVIGATION_ERROR: 'POST_NAVIGATION_ERROR', +}; + +// Pre Navigation errors snapshots will not be saved as we don't get the response in the context +await Actor.main(async () => { + const crawler = new PuppeteerCrawler({ + requestHandlerTimeoutSecs: 15, + maxRequestRetries: 0, + async requestHandler({ request, log, page }) { + const { userData: { label } } = request; + + if (label === LABELS.TIMEOUT) { + log.error('Timeout error'); + await sleep(30_000); + } if (label === LABELS.TYPE_ERROR) { + log.error('TypeError: page.error is not a function'); + page.error(); + } else if (label === LABELS.ERROR_OPENING_PAGE) { + log.error('Error opening page'); + throw new Error('An error occurred while opening the page'); + } + }, + postNavigationHooks: [async ({ request, log }) => { + const { userData: { label } } = request; + + if (label === LABELS.POST_NAVIGATION_ERROR) { + log.error('Post navigation error'); + throw new Error(' Unable to navigate to the requested post'); + } + }], + }); + + // await crawler.run(Object.values(LABELS).map((label) => ({ url: 'https://example.com', userData: { label }, uniqueKey: label }))); + + await crawler.run([ + { url: 'https://example.com', userData: { label: 'TIMEOUT' }, uniqueKey: 'TIMEOUT' }, + { url: 'https://example.com', userData: { label: 'TYPE_ERROR' }, uniqueKey: 'TYPE_ERROR' }, + { url: 'https://example.com', userData: { label: 'ERROR_OPENING_PAGE' }, uniqueKey: 'ERROR_OPENING_PAGE' }, + { url: 'https://example.com', userData: { label: 'POST_NAVIGATION_ERROR' }, uniqueKey: 'POST_NAVIGATION_ERROR' }, + ]); +}, mainOptions); diff --git a/test/e2e/puppeteer-error-snapshot/actor/package.json b/test/e2e/puppeteer-error-snapshot/actor/package.json new file mode 100644 index 000000000000..65b5d8134ab1 --- /dev/null +++ b/test/e2e/puppeteer-error-snapshot/actor/package.json @@ -0,0 +1,29 @@ +{ + "name": "test-puppeteer-throw-on-ssl-errors", + "version": "0.0.1", + "description": "Puppeteer Test - Should throw on SSL Errors", + "dependencies": { + "apify": "next", + "@apify/storage-local": "^2.1.3", + "@crawlee/basic": "file:./packages/basic-crawler", + "@crawlee/browser": "file:./packages/browser-crawler", + "@crawlee/browser-pool": "file:./packages/browser-pool", + "@crawlee/core": "file:./packages/core", + "@crawlee/memory-storage": "file:./packages/memory-storage", + "@crawlee/puppeteer": "file:./packages/puppeteer-crawler", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils", + "puppeteer": "*" + }, + "overrides": { + "apify": { + "@crawlee/core": "file:./packages/core", + "@crawlee/utils": "file:./packages/utils" + } + }, + "scripts": { + "start": "node main.js" + }, + "type": "module", + "license": "ISC" +} diff --git a/test/e2e/puppeteer-error-snapshot/test.mjs b/test/e2e/puppeteer-error-snapshot/test.mjs new file mode 100644 index 000000000000..54ec4de9b031 --- /dev/null +++ b/test/e2e/puppeteer-error-snapshot/test.mjs @@ -0,0 +1,10 @@ +import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; + +const testActorDirname = getActorTestDir(import.meta.url); +await initialize(testActorDirname); + +const { stats, defaultKeyValueStoreItems } = await runActor(testActorDirname); + +await expect(stats.requestsFailed === 4, 'All requests failed'); +// Count of error HTML files and screenshot files stored in the Key-Value store +await expect(defaultKeyValueStoreItems.length === 8, 'Number of HTML error snapshots in KV store'); From 778a2bc0c83ae819d954dcc68199027382ba80c9 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 10:18:45 +0300 Subject: [PATCH 09/28] fix: conditionally generate screenshot and html file names based on APIFY_IS_AT_HOME env variable --- .../src/internals/utils/playwright-utils.ts | 6 ++++-- .../src/internals/utils/puppeteer_utils.ts | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index 174a42fd3985..6d8cea5a8376 100644 --- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -534,14 +534,16 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} try { const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); + const { APIFY_IS_AT_HOME } = process.env; + if (saveScreenshot) { - const screenshotName = `${key}.jpg`; + const screenshotName = APIFY_IS_AT_HOME ? `${key}.jpg` : key; const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg', animations: 'disabled' }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } if (saveHtml) { - const htmlName = `${key}.html`; + const htmlName = APIFY_IS_AT_HOME ? `${key}.html` : key; const html = await page.content(); await store.setValue(htmlName, html, { contentType: 'text/html' }); } diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts index 154761391e65..2d7cceb9164f 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts @@ -666,14 +666,16 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} try { const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); + const { APIFY_IS_AT_HOME } = process.env; + if (saveScreenshot) { - const screenshotName = `${key}.jpg`; + const screenshotName = APIFY_IS_AT_HOME ? `${key}.jpg` : key; const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg' }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } if (saveHtml) { - const htmlName = `${key}.html`; + const htmlName = APIFY_IS_AT_HOME ? `${key}.html` : key; const html = await page.content(); await store.setValue(htmlName, html, { contentType: 'text/html' }); } From efa5437b9368f9e9de30ab83927fc721cd9dfe7d Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 11:46:40 +0300 Subject: [PATCH 10/28] refactor: move error_snapshotter.ts and error_tracker.ts to core package --- .../src/internals => core/src/crawlers}/error_snapshotter.ts | 5 +++-- .../src/internals => core/src/crawlers}/error_tracker.ts | 3 +-- packages/core/src/crawlers/index.ts | 2 ++ packages/core/src/crawlers/statistics.ts | 2 +- packages/utils/src/index.ts | 2 -- packages/utils/test/non-error-objects-working.test.ts | 2 +- test/core/error_tracker.test.ts | 4 +--- 7 files changed, 9 insertions(+), 11 deletions(-) rename packages/{utils/src/internals => core/src/crawlers}/error_snapshotter.ts (97%) rename packages/{utils/src/internals => core/src/crawlers}/error_tracker.ts (99%) diff --git a/packages/utils/src/internals/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts similarity index 97% rename from packages/utils/src/internals/error_snapshotter.ts rename to packages/core/src/crawlers/error_snapshotter.ts index b5897ed8d815..10a445311f63 100644 --- a/packages/utils/src/internals/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -1,12 +1,13 @@ import crypto from 'node:crypto'; -import type { KeyValueStore, CrawlingContext } from '@crawlee/core'; import type { PlaywrightCrawlingContext } from '@crawlee/playwright'; import type { PuppeteerCrawlingContext } from '@crawlee/puppeteer'; import type { Page as PlaywrightPage } from 'playwright'; import type { Page as PuppeteerPage } from 'puppeteer'; import type { ErrnoException } from './error_tracker'; +import type { CrawlingContext } from '../crawlers/crawler_commons'; +import type { KeyValueStore } from '../storages'; const { PWD, CRAWLEE_STORAGE_DIR, APIFY_IS_AT_HOME } = process.env; @@ -43,7 +44,7 @@ export class ErrorSnapshotter { if (page) { const capturedFiles = await this.captureSnapShot( - context as + context as unknown as | PlaywrightCrawlingContext | PuppeteerCrawlingContext, filename, diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/core/src/crawlers/error_tracker.ts similarity index 99% rename from packages/utils/src/internals/error_tracker.ts rename to packages/core/src/crawlers/error_tracker.ts index ef591421845c..94cf5b98189b 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/core/src/crawlers/error_tracker.ts @@ -1,8 +1,7 @@ import { inspect } from 'node:util'; -import type { CrawlingContext } from '@crawlee/core'; - import { ErrorSnapshotter } from './error_snapshotter'; +import type { CrawlingContext } from '../crawlers/crawler_commons'; /** * Node.js Error interface diff --git a/packages/core/src/crawlers/index.ts b/packages/core/src/crawlers/index.ts index 8f206d4e6043..77a83511e413 100644 --- a/packages/core/src/crawlers/index.ts +++ b/packages/core/src/crawlers/index.ts @@ -2,3 +2,5 @@ export * from './crawler_commons'; export * from './crawler_extension'; export * from './crawler_utils'; export * from './statistics'; +export * from './error_tracker'; +export * from './error_snapshotter'; diff --git a/packages/core/src/crawlers/statistics.ts b/packages/core/src/crawlers/statistics.ts index 556c268d96a5..c617a0164c43 100644 --- a/packages/core/src/crawlers/statistics.ts +++ b/packages/core/src/crawlers/statistics.ts @@ -1,6 +1,6 @@ -import { ErrorTracker } from '@crawlee/utils'; import ow from 'ow'; +import { ErrorTracker } from './error_tracker'; import { Configuration } from '../configuration'; import type { EventManager } from '../events/event_manager'; import { EventType } from '../events/event_manager'; diff --git a/packages/utils/src/index.ts b/packages/utils/src/index.ts index fa021cd30c81..9ba820c5ab78 100644 --- a/packages/utils/src/index.ts +++ b/packages/utils/src/index.ts @@ -7,8 +7,6 @@ export * from './internals/memory-info'; export * from './internals/debug'; export * as social from './internals/social'; export * from './internals/typedefs'; -export * from './internals/error_tracker'; -export * from './internals/error_snapshotter'; export * from './internals/open_graph_parser'; export * from './internals/gotScraping'; export * from './internals/robots'; diff --git a/packages/utils/test/non-error-objects-working.test.ts b/packages/utils/test/non-error-objects-working.test.ts index 68c69b6f0294..8b3dcd83c957 100644 --- a/packages/utils/test/non-error-objects-working.test.ts +++ b/packages/utils/test/non-error-objects-working.test.ts @@ -1,4 +1,4 @@ -import { ErrorTracker } from '../src/internals/error_tracker'; +import { ErrorTracker } from '../../core/src/crawlers/error_tracker'; describe('ErrorTracker', () => { test('processing a non-error error should not crash', () => { diff --git a/test/core/error_tracker.test.ts b/test/core/error_tracker.test.ts index e8c78745cc2c..6f49bb30274b 100644 --- a/test/core/error_tracker.test.ts +++ b/test/core/error_tracker.test.ts @@ -1,7 +1,5 @@ /* eslint-disable no-multi-spaces */ -import exp from 'node:constants'; - -import { ErrorTracker } from '../../packages/utils/src/internals/error_tracker'; +import { ErrorTracker } from '../../packages/core/src/crawlers/error_tracker'; const random = () => Math.random().toString(36).slice(2); From 865454ea23ac52c56f0085a59a31ecced76b4b3b Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 11:47:38 +0300 Subject: [PATCH 11/28] refactor: update actor and package names for error snapshot tests --- test/e2e/cheerio-error-snapshot/actor/.actor/actor.json | 2 +- test/e2e/cheerio-error-snapshot/actor/main.js | 9 +-------- test/e2e/cheerio-error-snapshot/actor/package.json | 4 ++-- .../e2e/puppeteer-error-snapshot/actor/.actor/actor.json | 2 +- test/e2e/puppeteer-error-snapshot/actor/main.js | 9 +-------- test/e2e/puppeteer-error-snapshot/actor/package.json | 4 ++-- 6 files changed, 8 insertions(+), 22 deletions(-) diff --git a/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json b/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json index 051de11176db..13d855466bf0 100644 --- a/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json +++ b/test/e2e/cheerio-error-snapshot/actor/.actor/actor.json @@ -1,6 +1,6 @@ { "actorSpecification": 1, - "name": "test-cheerio-throw-on-ssl-errors", + "name": "test-cheerio-error-snapshot", "version": "0.0", "buildTag": "latest", "env": null diff --git a/test/e2e/cheerio-error-snapshot/actor/main.js b/test/e2e/cheerio-error-snapshot/actor/main.js index 7836911a1e03..de40770870c2 100644 --- a/test/e2e/cheerio-error-snapshot/actor/main.js +++ b/test/e2e/cheerio-error-snapshot/actor/main.js @@ -39,17 +39,10 @@ await Actor.main(async () => { // Post navigation errors snapshots are not saved as we don't get the body in the context if (label === LABELS.POST_NAVIGATION_ERROR) { log.error('Post navigation error'); - throw new Error(' Unable to navigate to the requested post'); + throw new Error('Unable to navigate to the requested post'); } }], }); await crawler.run(Object.values(LABELS).map((label) => ({ url: 'https://example.com', userData: { label }, uniqueKey: label }))); - - // await crawler.run([ - // { url: 'https://example.com', userData: { label: 'TIMEOUT' }, uniqueKey: 'TIMEOUT' }, - // { url: 'https://example.com', userData: { label: 'TYPE_ERROR' }, uniqueKey: 'TYPE_ERROR' }, - // { url: 'https://example.com', userData: { label: 'ERROR_OPENING_PAGE' }, uniqueKey: 'ERROR_OPENING_PAGE' }, - // { url: 'https://example.com', userData: { label: 'POST_NAVIGATION_ERROR' }, uniqueKey: 'POST_NAVIGATION_ERROR' }, - // ]); }, mainOptions); diff --git a/test/e2e/cheerio-error-snapshot/actor/package.json b/test/e2e/cheerio-error-snapshot/actor/package.json index 3a0a07ab904a..988e6e0806c8 100644 --- a/test/e2e/cheerio-error-snapshot/actor/package.json +++ b/test/e2e/cheerio-error-snapshot/actor/package.json @@ -1,7 +1,7 @@ { - "name": "test-cheerio-throw-on-ssl-errors", + "name": "test-cheerio-error-snapshot", "version": "0.0.1", - "description": "Cheerio Crawler Test - Should throw on SSL Errors", + "description": "Cheerio Crawler Test - Should save errors snapshots", "dependencies": { "apify": "next", "@apify/storage-local": "^2.1.3", diff --git a/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json b/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json index 224de98d2f9e..827dc94c4e26 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json +++ b/test/e2e/puppeteer-error-snapshot/actor/.actor/actor.json @@ -1,6 +1,6 @@ { "actorSpecification": 1, - "name": "test-puppeteer-throw-on-ssl-errors", + "name": "test-puppeteer-error-snapshot", "version": "0.0", "buildTag": "latest", "env": null diff --git a/test/e2e/puppeteer-error-snapshot/actor/main.js b/test/e2e/puppeteer-error-snapshot/actor/main.js index b51d08609239..ed4e7543ec14 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/main.js +++ b/test/e2e/puppeteer-error-snapshot/actor/main.js @@ -43,12 +43,5 @@ await Actor.main(async () => { }], }); - // await crawler.run(Object.values(LABELS).map((label) => ({ url: 'https://example.com', userData: { label }, uniqueKey: label }))); - - await crawler.run([ - { url: 'https://example.com', userData: { label: 'TIMEOUT' }, uniqueKey: 'TIMEOUT' }, - { url: 'https://example.com', userData: { label: 'TYPE_ERROR' }, uniqueKey: 'TYPE_ERROR' }, - { url: 'https://example.com', userData: { label: 'ERROR_OPENING_PAGE' }, uniqueKey: 'ERROR_OPENING_PAGE' }, - { url: 'https://example.com', userData: { label: 'POST_NAVIGATION_ERROR' }, uniqueKey: 'POST_NAVIGATION_ERROR' }, - ]); + await crawler.run(Object.values(LABELS).map((label) => ({ url: 'https://example.com', userData: { label }, uniqueKey: label }))); }, mainOptions); diff --git a/test/e2e/puppeteer-error-snapshot/actor/package.json b/test/e2e/puppeteer-error-snapshot/actor/package.json index 65b5d8134ab1..ce3638b8fd90 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/package.json +++ b/test/e2e/puppeteer-error-snapshot/actor/package.json @@ -1,7 +1,7 @@ { - "name": "test-puppeteer-throw-on-ssl-errors", + "name": "test-puppeteer-error-snapshot", "version": "0.0.1", - "description": "Puppeteer Test - Should throw on SSL Errors", + "description": "Puppeteer Test - Should save errors snapshots", "dependencies": { "apify": "next", "@apify/storage-local": "^2.1.3", From 6982e0331d2e90e3a941d42fc98ef1aed3410be9 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 12:09:49 +0300 Subject: [PATCH 12/28] refactor: add a new function `addAsync`in error_tracker and revert the changes on `add` function --- .../src/internals/basic-crawler.ts | 6 +- packages/core/src/crawlers/error_tracker.ts | 36 +++++++- test/core/error_tracker.test.ts | 88 +++++++++---------- 3 files changed, 81 insertions(+), 49 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 8fba69b4d648..a4e405551b08 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1363,7 +1363,7 @@ export class BasicCrawler { +test('works', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -95,7 +95,7 @@ test('works', async () => { expect(tracker.result).toMatchObject({}); - await tracker.add(g(e)); + tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -110,7 +110,7 @@ test('works', async () => { }); expect(tracker.getUniqueErrorCount()).toBe(1); - await tracker.add(g(e)); + tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -138,8 +138,8 @@ test('no code is null code', async () => { const e = errorNoCode; - await tracker.add(g(e)); - await tracker.add(g(e)); + tracker.add(g(e)); + tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -167,8 +167,8 @@ test('can hide error code', async () => { const e = errorRandomCode; - await tracker.add(g(errorRandomCode)); - await tracker.add(g(errorRandomCode)); + tracker.add(g(errorRandomCode)); + tracker.add(g(errorRandomCode)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -194,8 +194,8 @@ test('can hide error name', async () => { const e = errorRandomName; - await tracker.add(g(e)); - await tracker.add(g(e)); + tracker.add(g(e)); + tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -221,8 +221,8 @@ test('can hide error message', async () => { const e = errorRandomMessage; - await tracker.add(g(errorRandomMessage)); - await tracker.add(g(errorRandomMessage)); + tracker.add(g(errorRandomMessage)); + tracker.add(g(errorRandomMessage)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -246,8 +246,8 @@ test('can hide error stack', async () => { showFullMessage: true, }); - await tracker.add(g(errorRandomStack)); - await tracker.add(g(errorRandomStack)); + tracker.add(g(errorRandomStack)); + tracker.add(g(errorRandomStack)); expect(tracker.result).toMatchObject({ 'ERR_INVALID_URL': { // code @@ -273,8 +273,8 @@ test('can display full stack', async () => { const e = multilineError; - await tracker.add(g(e)); - await tracker.add(g(e)); + tracker.add(g(e)); + tracker.add(g(e)); expect(tracker.result).toMatchObject({ [s(e.stack)]: { // source @@ -302,8 +302,8 @@ test('stack looks for user files first', async () => { const e = multilineError; - await tracker.add(g(e)); - await tracker.add(g(e)); + tracker.add(g(e)); + tracker.add(g(e)); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -331,7 +331,7 @@ test('can shorten the message to the first line', async () => { const e = g(multilineError); - await tracker.add(e); + tracker.add(e); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -360,7 +360,7 @@ test('supports error.cause', async () => { const e = g(multilineError); e.cause = g(errorRandomMessage); - await tracker.add(e); + tracker.add(e); expect(tracker.result).toMatchObject({ 'myscript.js:10:3': { // source @@ -389,17 +389,17 @@ test('placeholder #1', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected boolean, got number', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected boolean, got string', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected boolean, got undefined', }); @@ -428,17 +428,17 @@ test('placeholder #2', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `string`', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `undefined`', }); @@ -467,17 +467,17 @@ test('placeholder #3', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 2 3', }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 4 3', }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 5 3', }); @@ -506,17 +506,17 @@ test('placeholder #4', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 2 3', }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 2 4', }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 2 5', }); @@ -545,17 +545,17 @@ test('placeholder #5', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: '1 2 3', }); - await tracker.add({ + tracker.add({ name: 'Error', message: '4 2 3', }); - await tracker.add({ + tracker.add({ name: 'Error', message: '5 2 3', }); @@ -584,17 +584,17 @@ test('placeholder #6', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'The weather is sunny today, but the grass is wet.', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'The weather is rainy today, but the grass is still dry.', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'The weather is wild today, however the grass is yellow.', }); @@ -623,12 +623,12 @@ test('placeholder #7', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); @@ -646,7 +646,7 @@ test('placeholder #7', async () => { }); expect(tracker.getUniqueErrorCount()).toBe(1); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `falsy value`', }); @@ -675,12 +675,12 @@ test('placeholder #8', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `boolean`, got `number`', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Expected `string`, got `null`', }); @@ -712,12 +712,12 @@ test('placeholder #9', async () => { showFullMessage: true, }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Unexpected `show` property in `options` object', }); - await tracker.add({ + tracker.add({ name: 'Error', message: 'Missing `display` in style', }); From c79e3b1e7dedacaecff0c1d236b09079248dedd1 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 18:31:43 +0300 Subject: [PATCH 13/28] fix: add partial types in typedefs.ts --- packages/core/src/crawlers/error_snapshotter.ts | 6 +----- packages/core/src/typedefs.ts | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index 10a445311f63..26b7da81f3d6 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -1,13 +1,9 @@ import crypto from 'node:crypto'; -import type { PlaywrightCrawlingContext } from '@crawlee/playwright'; -import type { PuppeteerCrawlingContext } from '@crawlee/puppeteer'; -import type { Page as PlaywrightPage } from 'playwright'; -import type { Page as PuppeteerPage } from 'puppeteer'; - import type { ErrnoException } from './error_tracker'; import type { CrawlingContext } from '../crawlers/crawler_commons'; import type { KeyValueStore } from '../storages'; +import type { PlaywrightCrawlingContext, PuppeteerCrawlingContext, PlaywrightPage, PuppeteerPage } from '../typedefs'; const { PWD, CRAWLEE_STORAGE_DIR, APIFY_IS_AT_HOME } = process.env; diff --git a/packages/core/src/typedefs.ts b/packages/core/src/typedefs.ts index 66fc714ecd73..2aa731b42012 100644 --- a/packages/core/src/typedefs.ts +++ b/packages/core/src/typedefs.ts @@ -17,3 +17,17 @@ export function keys(obj: T) { } export declare type AllowedHttpMethods = 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH'; + +// Define the following types as we cannot import the complete types from the respective packages +export interface PlaywrightCrawlingContext { + saveSnapshot: (options: { key: string }) => Promise; +} +export interface PuppeteerCrawlingContext { + saveSnapshot: (options: { key: string }) => Promise; +} +export interface PlaywrightPage { + content: () => Promise; +} +export interface PuppeteerPage { + content: () => Promise; +} From 9eec3ee873dc031c58f26398477466d9f55d36cd Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 22 Feb 2024 18:32:11 +0300 Subject: [PATCH 14/28] fix: update puppeteer and playwright utils test --- test/core/playwright_utils.test.ts | 7 ++++--- test/core/puppeteer_utils.test.ts | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/test/core/playwright_utils.test.ts b/test/core/playwright_utils.test.ts index ca29ff875d6b..03b02f7ef1df 100644 --- a/test/core/playwright_utils.test.ts +++ b/test/core/playwright_utils.test.ts @@ -301,6 +301,7 @@ describe('playwrightUtils', () => { test('saveSnapshot() works', async () => { const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); const browser = await chromium.launch({ headless: true }); + const { APIFY_IS_AT_HOME: isAtHome } = process.env; try { const page = await browser.newPage(); @@ -314,8 +315,8 @@ describe('playwrightUtils', () => { openKVSSpy.mockResolvedValue(object as any); await playwrightUtils.saveSnapshot(page, { key: 'TEST', keyValueStoreName: 'TEST-STORE', screenshotQuality: 60 }); - expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); - expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); + expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.jpg' : 'TEST', screenshot, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.html' : 'TEST', contentHTML, { contentType: 'text/html' }); object.setValue.mockReset(); // Test saving only image @@ -323,7 +324,7 @@ describe('playwrightUtils', () => { // Default quality is 50 const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); - expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith(isAtHome ? 'SNAPSHOT.jpg' : 'SNAPSHOT', screenshot2, { contentType: 'image/jpeg' }); } finally { await browser.close(); } diff --git a/test/core/puppeteer_utils.test.ts b/test/core/puppeteer_utils.test.ts index 4103ba99b36d..0ee59f6b1e19 100644 --- a/test/core/puppeteer_utils.test.ts +++ b/test/core/puppeteer_utils.test.ts @@ -412,6 +412,7 @@ describe('puppeteerUtils', () => { it('saveSnapshot() works', async () => { const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); const browser = await method(launchContext); + const { APIFY_IS_AT_HOME: isAtHome } = process.env; try { const page = await browser.newPage(); @@ -425,8 +426,8 @@ describe('puppeteerUtils', () => { openKVSSpy.mockResolvedValue(object as any); await puppeteerUtils.saveSnapshot(page, { key: 'TEST', keyValueStoreName: 'TEST-STORE', screenshotQuality: 60 }); - expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); - expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); + expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.jpg' : 'TEST', screenshot, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.html' : 'TEST', contentHTML, { contentType: 'text/html' }); object.setValue.mockReset(); // Test saving only image @@ -434,7 +435,7 @@ describe('puppeteerUtils', () => { // Default quality is 50 const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); - expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith(isAtHome ? 'SNAPSHOT.jpg' : 'SNAPSHOT', screenshot2, { contentType: 'image/jpeg' }); } finally { await browser.close(); } From e08a3cfb40ae82924582027834c4d1e2b02c6087 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Fri, 23 Feb 2024 17:41:40 +0300 Subject: [PATCH 15/28] fix: lint issue --- packages/utils/test/non-error-objects-working.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/utils/test/non-error-objects-working.test.ts b/packages/utils/test/non-error-objects-working.test.ts index 8b3dcd83c957..c7adfbfbb511 100644 --- a/packages/utils/test/non-error-objects-working.test.ts +++ b/packages/utils/test/non-error-objects-working.test.ts @@ -4,6 +4,6 @@ describe('ErrorTracker', () => { test('processing a non-error error should not crash', () => { const errorTracker = new ErrorTracker(); // @ts-expect-error tracking non-error errors - expect(async () => await errorTracker.add('foo')).not.toThrow(); + expect(() => errorTracker.add('foo')).not.toThrow(); }); }); From e56e1cb34ef6f2e95bcdf3af15b524e385aaba5a Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Tue, 12 Mar 2024 01:31:30 +0300 Subject: [PATCH 16/28] refactor: error_tracker add methods handling and error_snapshotter tests --- packages/core/src/crawlers/error_tracker.ts | 32 ++++++----------- test/e2e/cheerio-error-snapshot/test.mjs | 15 ++++++-- test/e2e/puppeteer-error-snapshot/test.mjs | 24 +++++++++++-- test/e2e/run.mjs | 8 +++-- test/e2e/tools.mjs | 39 ++++++++++++++++++--- 5 files changed, 84 insertions(+), 34 deletions(-) diff --git a/packages/core/src/crawlers/error_tracker.ts b/packages/core/src/crawlers/error_tracker.ts index e8cfdd73d6af..62b43bdb99ed 100644 --- a/packages/core/src/crawlers/error_tracker.ts +++ b/packages/core/src/crawlers/error_tracker.ts @@ -305,9 +305,7 @@ export class ErrorTracker { this.total = 0; } - add(error: ErrnoException) { - this.total++; - + private updateGroup(error: ErrnoException) { let group = this.result; if (this.#options.showStackTrace) { @@ -328,6 +326,14 @@ export class ErrorTracker { increaseCount(group as { count: number }); + return group; + } + + add(error: ErrnoException) { + this.total++; + + this.updateGroup(error); + if (typeof error.cause === 'object' && error.cause !== null) { this.add(error.cause); } @@ -340,25 +346,7 @@ export class ErrorTracker { async addAsync(error: ErrnoException, context?: CrawlingContext) { this.total++; - let group = this.result; - - if (this.#options.showStackTrace) { - group = getStackTraceGroup(error, group, this.#options.showFullStack); - } - - if (this.#options.showErrorCode) { - group = getErrorCodeGroup(error, group); - } - - if (this.#options.showErrorName) { - group = getErrorNameGroup(error, group); - } - - if (this.#options.showErrorMessage) { - group = getErrorMessageGroup(error, group, this.#options.showFullMessage); - } - - increaseCount(group as { count: number }); + const group = this.updateGroup(error); // Capture a snapshot (screenshot and HTML) on the first occurrence of an error if (group.count === 1 && context) { diff --git a/test/e2e/cheerio-error-snapshot/test.mjs b/test/e2e/cheerio-error-snapshot/test.mjs index 7a0f1d9bb1e0..912f6a7bf24d 100644 --- a/test/e2e/cheerio-error-snapshot/test.mjs +++ b/test/e2e/cheerio-error-snapshot/test.mjs @@ -1,10 +1,21 @@ -import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; +import { initialize, getActorTestDir, runActor, expect, hasNestedKey } from '../tools.mjs'; const testActorDirname = getActorTestDir(import.meta.url); await initialize(testActorDirname); const { stats, defaultKeyValueStoreItems } = await runActor(testActorDirname); +// All requests should fail to test the error snapshots await expect(stats.requestsFailed === 4, 'All requests failed'); + +let totalErrorHtmlFiles = 0; +for (const error of Object.values(stats.errors)) { + if (hasNestedKey(error, 'firstErrorHtmlUrl')) { + totalErrorHtmlFiles++; + } +} + +// Count of error HTML files stored in the stats to make sure they are saved +await expect(totalErrorHtmlFiles === 3, 'Number of HTML error files in stats should be 3'); // Count of error HTML files stored in the Key-Value store -await expect(defaultKeyValueStoreItems.length === 3, 'Number of HTML error snapshots in KV store'); +await expect(defaultKeyValueStoreItems.length === 3, 'Number of HTML error files in KV store should be 3'); diff --git a/test/e2e/puppeteer-error-snapshot/test.mjs b/test/e2e/puppeteer-error-snapshot/test.mjs index 54ec4de9b031..404d64e916be 100644 --- a/test/e2e/puppeteer-error-snapshot/test.mjs +++ b/test/e2e/puppeteer-error-snapshot/test.mjs @@ -1,10 +1,30 @@ -import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; +import { initialize, getActorTestDir, runActor, expect, hasNestedKey } from '../tools.mjs'; const testActorDirname = getActorTestDir(import.meta.url); await initialize(testActorDirname); const { stats, defaultKeyValueStoreItems } = await runActor(testActorDirname); +// All requests should fail to test the error snapshots await expect(stats.requestsFailed === 4, 'All requests failed'); + +let totalErrorHtmlFiles = 0; +let totalErrorScreenshotFiles = 0; +for (const error of Object.values(stats.errors)) { + if (hasNestedKey(error, 'firstErrorHtmlUrl')) { + totalErrorHtmlFiles++; + } +} + +for (const error of Object.values(stats.errors)) { + if (hasNestedKey(error, 'firstErrorScreenshotUrl')) { + totalErrorScreenshotFiles++; + } +} + +// Count of error HTML files stored in the stats to make sure they are saved +await expect(totalErrorHtmlFiles === 4, 'Number of HTML error files in stats should be 4'); +// Count of error Screenshot files stored in the stats to make sure they are saved +await expect(totalErrorScreenshotFiles === 4, 'Number of screenshots error files in stats should be 4'); // Count of error HTML files and screenshot files stored in the Key-Value store -await expect(defaultKeyValueStoreItems.length === 8, 'Number of HTML error snapshots in KV store'); +await expect(defaultKeyValueStoreItems.length === 8, 'Number of HTML and screenshot error snapshots in KV store should be 8'); diff --git a/test/e2e/run.mjs b/test/e2e/run.mjs index 8a9b5dc1c69f..ae107458d842 100644 --- a/test/e2e/run.mjs +++ b/test/e2e/run.mjs @@ -1,9 +1,11 @@ -import { dirname } from 'node:path'; -import { fileURLToPath } from 'node:url'; +/* eslint-disable no-loop-func */ +import { execSync } from 'node:child_process'; import { once } from 'node:events'; import { readdir } from 'node:fs/promises'; +import { dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; import { isMainThread, Worker, workerData } from 'node:worker_threads'; -import { execSync } from 'node:child_process'; + import { colors, getApifyToken, clearPackages, clearStorage, SKIPPED_TEST_CLOSE_CODE } from './tools.mjs'; const basePath = dirname(fileURLToPath(import.meta.url)); diff --git a/test/e2e/tools.mjs b/test/e2e/tools.mjs index ae18eda80da1..5dec93e9c8ee 100644 --- a/test/e2e/tools.mjs +++ b/test/e2e/tools.mjs @@ -1,16 +1,22 @@ -import { dirname, join } from 'node:path'; -import { fileURLToPath } from 'node:url'; +import { execSync as execSyncOriginal } from 'node:child_process'; import { existsSync } from 'node:fs'; import { readdir, readFile } from 'node:fs/promises'; import { homedir } from 'node:os'; +import { dirname, join } from 'node:path'; import { setTimeout } from 'node:timers/promises'; -import { execSync as execSyncOriginal } from 'node:child_process'; -import { got } from 'got'; -import fs from 'fs-extra'; +import { fileURLToPath } from 'node:url'; + import { Actor } from 'apify'; +import fs from 'fs-extra'; +import { got } from 'got'; + // eslint-disable-next-line import/no-relative-packages import { URL_NO_COMMAS_REGEX } from '../../packages/utils/dist/index.mjs'; +/** + * @param {string} command + * @param {import('node:child_process').ExecSyncOptions} options + */ function execSync(command, options) { return execSyncOriginal(command, { ...options, encoding: 'utf-8' }); } @@ -469,3 +475,26 @@ function isItemHidden(item) { } return true; } + +/** + * @param {any} obj the object to search + * @param {string} keyName the key to search for + * @returns {boolean} + */ +export function hasNestedKey(obj, keyName) { + if (typeof obj !== 'object' || obj === null) { + return false; + } + + for (const key of Object.keys(obj)) { + if (key === keyName) { + return true; + } + + if (typeof obj[key] === 'object' && obj[key] !== null && hasNestedKey(obj[key], keyName)) { + return true; + } + } + + return false; +} From d2b2c3191d87b24197ac1c462ee33f2eb7b57e1f Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Tue, 14 May 2024 23:49:48 +0300 Subject: [PATCH 17/28] fix: readd error-tracker.ts file to packages/utils to avoid breaking changes --- packages/utils/src/index.ts | 1 + packages/utils/src/internals/error-tracker.ts | 376 ++++++++++++++++++ 2 files changed, 377 insertions(+) create mode 100644 packages/utils/src/internals/error-tracker.ts diff --git a/packages/utils/src/index.ts b/packages/utils/src/index.ts index 9ba820c5ab78..1bd8d5901abe 100644 --- a/packages/utils/src/index.ts +++ b/packages/utils/src/index.ts @@ -7,6 +7,7 @@ export * from './internals/memory-info'; export * from './internals/debug'; export * as social from './internals/social'; export * from './internals/typedefs'; +export * from './internals/error-tracker'; export * from './internals/open_graph_parser'; export * from './internals/gotScraping'; export * from './internals/robots'; diff --git a/packages/utils/src/internals/error-tracker.ts b/packages/utils/src/internals/error-tracker.ts new file mode 100644 index 000000000000..259a4c33b5bd --- /dev/null +++ b/packages/utils/src/internals/error-tracker.ts @@ -0,0 +1,376 @@ +import { inspect } from 'node:util'; + +/** + * Node.js Error interface + */ + interface ErrnoException extends Error { + errno?: number | undefined; + code?: string | number | undefined; + path?: string | undefined; + syscall?: string | undefined; + cause?: any; +} + +export interface ErrorTrackerOptions { + showErrorCode: boolean; + showErrorName: boolean; + showStackTrace: boolean; + showFullStack: boolean; + showErrorMessage: boolean; + showFullMessage: boolean; +} + +const extractPathFromStackTraceLine = (line: string) => { + const lastStartingRoundBracketIndex = line.lastIndexOf('('); + + if (lastStartingRoundBracketIndex !== -1) { + const closingRoundBracketIndex = line.indexOf(')', lastStartingRoundBracketIndex); + + if (closingRoundBracketIndex !== -1) { + return line.slice(lastStartingRoundBracketIndex + 1, closingRoundBracketIndex); + } + } + + return line; +}; + +// https://v8.dev/docs/stack-trace-api#appendix%3A-stack-trace-format +const getPathFromStackTrace = (stack: string[]) => { + for (const line of stack) { + const path = extractPathFromStackTraceLine(line); + + if ( + path.startsWith('node:') + || path.includes('/node_modules/') + || path.includes('\\node_modules\\') + ) { + continue; + } + + return path; + } + + return extractPathFromStackTraceLine(stack[0]); +}; + +const getStackTraceGroup = (error: ErrnoException, storage: Record, showFullStack: boolean) => { + const stack = error.stack?.split('\n').map((line) => line.trim()); + + let sliceAt = -1; + + if (stack) { + for (let i = 0; i < stack.length; i++) { + if (stack[i].startsWith('at ') || stack[i].startsWith('eval at ')) { + sliceAt = i; + break; + } + } + } + + let normalizedStackTrace = null; + if (sliceAt !== -1) { + normalizedStackTrace = showFullStack ? stack!.slice(sliceAt).map((x) => x.trim()).join('\n') : getPathFromStackTrace(stack!.slice(sliceAt)); + } + + if (!normalizedStackTrace) { + normalizedStackTrace = 'missing stack trace'; + } + + if (!(normalizedStackTrace in storage)) { + storage[normalizedStackTrace] = Object.create(null); + } + + return storage[normalizedStackTrace] as Record; +}; + +const getErrorCodeGroup = (error: ErrnoException, storage: Record) => { + let { code } = error; + + if (code === undefined) { + code = 'missing error code'; + } + + if (!(code in storage)) { + storage[code] = Object.create(null); + } + + return storage[String(code)] as Record; +}; + +const getErrorNameGroup = (error: ErrnoException, storage: Record) => { + const { name } = error; + + if (!(name in storage)) { + storage[name] = Object.create(null); + } + + return storage[name] as Record; +}; + +const findBiggestWordIntersection = (a: string[], b: string[]) => { + let maxStreak = 0; + let bStreakIndex = -1; + let aStreakIndex = -1; + for (let aIndex = 0; aIndex < a.length; aIndex++) { + let bIndex = -1; + + do { + let aWalkIndex = aIndex; + + bIndex = b.indexOf(a[aIndex], bIndex + 1); + + let bWalkIndex = bIndex; + + let streak = 0; + while (aWalkIndex < a.length && bWalkIndex < b.length && b[bWalkIndex++] === a[aWalkIndex++]) { + streak++; + } + + if (streak > maxStreak) { + maxStreak = streak; + aStreakIndex = aIndex; + bStreakIndex = bIndex; + } + } while (bIndex !== -1); + } + + return { + maxStreak, + aStreakIndex, + bStreakIndex, + }; +}; + +const arrayCount = (array: unknown[], target: unknown) => { + let result = 0; + + for (const item of array) { + if (item === target) { + result++; + } + } + + return result; +}; + +const calculatePlaceholder = (a: string[], b: string[]) => { + const { maxStreak, aStreakIndex, bStreakIndex } = findBiggestWordIntersection(a, b); + + if (maxStreak === 0) { + return ['_']; + } + + const leftA = a.slice(0, aStreakIndex); + const leftB = b.slice(0, bStreakIndex); + const rightA = a.slice(aStreakIndex + maxStreak); + const rightB = b.slice(bStreakIndex + maxStreak); + + const output: string[] = []; + + if (leftA.length !== 0 || leftB.length !== 0) { + output.push(...calculatePlaceholder(leftA, leftB)); + } + + output.push(...a.slice(aStreakIndex, aStreakIndex + maxStreak)); + + if (rightA.length !== 0 || rightB.length !== 0) { + output.push(...calculatePlaceholder(rightA, rightB)); + } + + return output; +}; + +const normalizedCalculatePlaceholder = (a: string[], b: string[]) => { + const output = calculatePlaceholder(a, b); + + // We can't be too general + if ((arrayCount(output, '_') / output.length) >= 0.5) { + return ['_']; + } + + return output; +}; + +// Merge A (missing placeholders) into B (can contain placeholders but does not have to) +const mergeMessages = (a: string, b: string, storage: Record) => { + const placeholder = normalizedCalculatePlaceholder( + a.split(' '), + b.split(' '), + ).join(' '); + + if (placeholder === '_') { + return undefined; + } + + interface HasCount { + count: number; + } + + const count = (storage[a] as HasCount).count + (storage[b] as HasCount).count; + + delete storage[a]; + delete storage[b]; + + storage[placeholder] = Object.assign(Object.create(null), { + count, + }); + + return placeholder; +}; + +const getErrorMessageGroup = (error: ErrnoException, storage: Record, showFullMessage: boolean) => { + let { message } = error; + + if (!message) { + try { + message = typeof error === 'string' ? error : `Unknown error message. Received non-error object: ${JSON.stringify(error)}`; + } catch { + message = `Unknown error message. Received non-error object, and could not stringify it: ${inspect(error, { depth: 0 })}`; + } + } + + if (!showFullMessage) { + const newLineIndex = message.indexOf('\n'); + message = message.slice(0, newLineIndex === -1 ? undefined : newLineIndex); + } + + if (!(message in storage)) { + storage[message] = Object.assign(Object.create(null), { + count: 0, + }); + + // This actually safe, since we Object.create(null) so no prototype pollution can happen. + // eslint-disable-next-line no-restricted-syntax, guard-for-in + for (const existingMessage in storage) { + const newMessage = mergeMessages(message, existingMessage, storage); + if (newMessage) { + message = newMessage; + break; + } + } + } + + return storage[message] as Record; +}; + +const increaseCount = (group: { count?: number }) => { + if (!('count' in group)) { + // In case users don't want to display error message + group.count = 0; + } + + group.count!++; +}; + +/** + * This class tracks errors and computes a summary of information like: + * - where the errors happened + * - what the error names are + * - what the error codes are + * - what is the general error message + * + * This is extremely useful when there are dynamic error messages, such as argument validation. + * + * Since the structure of the `tracker.result` object differs when using different options, + * it's typed as `Record`. The most deep object has a `count` property, which is a number. + * + * It's possible to get the total amount of errors via the `tracker.total` property. + */ +export class ErrorTracker { + #options: ErrorTrackerOptions; + + result: Record; + + total: number; + + constructor(options: Partial = {}) { + this.#options = { + showErrorCode: true, + showErrorName: true, + showStackTrace: true, + showFullStack: false, + showErrorMessage: true, + showFullMessage: false, + ...options, + }; + + this.result = Object.create(null); + this.total = 0; + } + + add(error: ErrnoException) { + this.total++; + + let group = this.result; + + if (this.#options.showStackTrace) { + group = getStackTraceGroup(error, group, this.#options.showFullStack); + } + + if (this.#options.showErrorCode) { + group = getErrorCodeGroup(error, group); + } + + if (this.#options.showErrorName) { + group = getErrorNameGroup(error, group); + } + + if (this.#options.showErrorMessage) { + group = getErrorMessageGroup(error, group, this.#options.showFullMessage); + } + + increaseCount(group as { count: number }); + + if (typeof error.cause === 'object' && error.cause !== null) { + this.add(error.cause); + } + } + + getUniqueErrorCount() { + let count = 0; + + const goDeeper = (group: Record): void => { + if ('count' in group) { + count++; + return; + } + + // eslint-disable-next-line guard-for-in, no-restricted-syntax + for (const key in group) { + goDeeper(group[key] as Record); + } + }; + + goDeeper(this.result); + + return count; + } + + getMostPopularErrors(count: number) { + const result: [number, string[]][] = []; + + const goDeeper = (group: Record, path: string[]): void => { + if ('count' in group) { + result.push([(group as any).count, path]); + return; + } + + // eslint-disable-next-line guard-for-in, no-restricted-syntax + for (const key in group) { + goDeeper(group[key] as Record, [...path, key]); + } + }; + + goDeeper(this.result, []); + + return result.sort((a, b) => b[0] - a[0]).slice(0, count); + } + + reset() { + // This actually safe, since we Object.create(null) so no prototype pollution can happen. + // eslint-disable-next-line no-restricted-syntax, guard-for-in + for (const key in this.result) { + delete this.result[key]; + } + } +} From a509de608072d3a0f103f4df3c72d07ec65d4c44 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Tue, 14 May 2024 23:51:26 +0300 Subject: [PATCH 18/28] refactor: remove the extension logic when saving screenshot --- .../src/internals/utils/playwright-utils.ts | 6 ++---- .../src/internals/utils/puppeteer_utils.ts | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index 6d8cea5a8376..174a42fd3985 100644 --- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -534,16 +534,14 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} try { const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); - const { APIFY_IS_AT_HOME } = process.env; - if (saveScreenshot) { - const screenshotName = APIFY_IS_AT_HOME ? `${key}.jpg` : key; + const screenshotName = `${key}.jpg`; const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg', animations: 'disabled' }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } if (saveHtml) { - const htmlName = APIFY_IS_AT_HOME ? `${key}.html` : key; + const htmlName = `${key}.html`; const html = await page.content(); await store.setValue(htmlName, html, { contentType: 'text/html' }); } diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts index 2d7cceb9164f..154761391e65 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts @@ -666,16 +666,14 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} try { const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); - const { APIFY_IS_AT_HOME } = process.env; - if (saveScreenshot) { - const screenshotName = APIFY_IS_AT_HOME ? `${key}.jpg` : key; + const screenshotName = `${key}.jpg`; const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg' }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } if (saveHtml) { - const htmlName = APIFY_IS_AT_HOME ? `${key}.html` : key; + const htmlName = `${key}.html`; const html = await page.content(); await store.setValue(htmlName, html, { contentType: 'text/html' }); } From 967def24d749d723795f383a015f63ea4e3505c5 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Tue, 14 May 2024 23:53:44 +0300 Subject: [PATCH 19/28] refactor: Update error_tracker to remove the new addAsync function and revert changes to add function --- .../src/internals/basic-crawler.ts | 4 +- .../core/src/crawlers/error_snapshotter.ts | 88 ++++++++++--------- packages/core/src/crawlers/error_tracker.ts | 30 ++----- 3 files changed, 53 insertions(+), 69 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index a4e405551b08..aa1df0a01002 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1363,7 +1363,7 @@ export class BasicCrawler { - const { KEY_VALUE_PLATFORM_PATH, KEY_VALUE_STORE_LOCAL_PATH } = ErrorSnapshotter; - const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; - const body = context?.body; - - const keyValueStore = await context?.getKeyValueStore(); - // If the key-value store is not available, or the body and page are not available, return empty filenames - if (!keyValueStore || (!body && !page)) { - return {}; - } - - const filename = this.generateFilename(error); - - let screenshotFilename: string | undefined; - let htmlFileName: string | undefined; - - if (page) { - const capturedFiles = await this.captureSnapShot( - context as unknown as - | PlaywrightCrawlingContext - | PuppeteerCrawlingContext, - filename, - ); + try { + const { KEY_VALUE_PLATFORM_PATH, KEY_VALUE_STORE_LOCAL_PATH } = ErrorSnapshotter; + const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; + const body = context?.body; + + const keyValueStore = await context?.getKeyValueStore(); + // If the key-value store is not available, or the body and page are not available, return empty filenames + if (!keyValueStore || (!body && !page)) { + return {}; + } - if (capturedFiles) { - screenshotFilename = capturedFiles.screenshotFilename; - htmlFileName = capturedFiles.htmlFileName; + const filename = this.generateFilename(error); + + let screenshotFilename: string | undefined; + let htmlFileName: string | undefined; + + if (page) { + const capturedFiles = await this.captureSnapShot( + context as unknown as + | PlaywrightCrawlingContext + | PuppeteerCrawlingContext, + filename, + ); + + if (capturedFiles) { + screenshotFilename = capturedFiles.screenshotFilename; + htmlFileName = capturedFiles.htmlFileName; + } + + // If the snapshot for browsers failed to capture the HTML, try to capture it from the page content + if (!htmlFileName) { + const html = await page?.content() || undefined; + htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, filename) : undefined; + } + } else if (typeof body === 'string') { // for non-browser contexts + htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, filename); } - // If the snapshot for browsers failed to capture the HTML, try to capture it from the page content - if (!htmlFileName) { - const html = await page?.content() || undefined; - htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, filename) : undefined; + if (APIFY_IS_AT_HOME) { + const platformPath = `${KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; + return { + screenshotFileUrl: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, + htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, + }; } - } else if (typeof body === 'string') { // for non-browser contexts - htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, filename); - } - if (APIFY_IS_AT_HOME) { - const platformPath = `${KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; + const localPath = `${KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { - screenshotFileUrl: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, - htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, + screenshotFileUrl: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, + htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, }; + } catch (e) { + return {}; } - - const localPath = `${KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; - return { - screenshotFileUrl: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, - htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, - }; } /** diff --git a/packages/core/src/crawlers/error_tracker.ts b/packages/core/src/crawlers/error_tracker.ts index 62b43bdb99ed..0f5769468dae 100644 --- a/packages/core/src/crawlers/error_tracker.ts +++ b/packages/core/src/crawlers/error_tracker.ts @@ -305,7 +305,9 @@ export class ErrorTracker { this.total = 0; } - private updateGroup(error: ErrnoException) { + add(error: ErrnoException, context?: CrawlingContext) { + this.total++; + let group = this.result; if (this.#options.showStackTrace) { @@ -326,35 +328,13 @@ export class ErrorTracker { increaseCount(group as { count: number }); - return group; - } - - add(error: ErrnoException) { - this.total++; - - this.updateGroup(error); - - if (typeof error.cause === 'object' && error.cause !== null) { - this.add(error.cause); - } - } - - /** - * This method is async, because it captures a snapshot of the error context. - * We added this new method to avoid breaking changes. - */ - async addAsync(error: ErrnoException, context?: CrawlingContext) { - this.total++; - - const group = this.updateGroup(error); - // Capture a snapshot (screenshot and HTML) on the first occurrence of an error if (group.count === 1 && context) { - await this.captureSnapshot(group, error, context).catch(() => { }); + void this.captureSnapshot(group, error, context).catch(() => { }); } if (typeof error.cause === 'object' && error.cause !== null) { - await this.addAsync(error.cause); + this.add(error.cause); } } From 224d512ceffe1e998dee6b10a44e36598ae2a422 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 15 May 2024 02:20:14 +0300 Subject: [PATCH 20/28] chore: update node version in Dockerfile for error_tracker tests --- test/core/error_tracker.test.ts | 2 +- test/e2e/cheerio-error-snapshot/actor/Dockerfile | 2 +- test/e2e/puppeteer-error-snapshot/actor/Dockerfile | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/core/error_tracker.test.ts b/test/core/error_tracker.test.ts index f0b0763ef252..c77fac20b61d 100644 --- a/test/core/error_tracker.test.ts +++ b/test/core/error_tracker.test.ts @@ -126,7 +126,7 @@ test('works', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('no code is null code', async () => { +test('no code is null code', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, diff --git a/test/e2e/cheerio-error-snapshot/actor/Dockerfile b/test/e2e/cheerio-error-snapshot/actor/Dockerfile index 4874669fcad0..36afd80b9648 100644 --- a/test/e2e/cheerio-error-snapshot/actor/Dockerfile +++ b/test/e2e/cheerio-error-snapshot/actor/Dockerfile @@ -1,4 +1,4 @@ -FROM apify/actor-node:18-beta +FROM apify/actor-node:20-beta COPY packages ./packages COPY package*.json ./ diff --git a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile index 840bbbde8dff..1e7718842183 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile +++ b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile @@ -1,4 +1,4 @@ -FROM node:18 AS builder +FROM node:20 AS builder COPY /packages ./packages COPY /package*.json ./ @@ -6,7 +6,7 @@ RUN npm --quiet set progress=false \ && npm install --only=prod --no-optional --no-audit \ && npm update -FROM apify/actor-node-puppeteer-chrome:18-beta +FROM apify/actor-node-puppeteer-chrome:20-beta RUN rm -r node_modules COPY --from=builder /node_modules ./node_modules From deed57bce660f473b6e6991f20e4c0fc03fed102 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 15 May 2024 02:21:42 +0300 Subject: [PATCH 21/28] refactor: Update ErrorSnapshotter class --- .../core/src/crawlers/error_snapshotter.ts | 63 +++++++++++-------- packages/core/src/typedefs.ts | 3 + 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index d66758f93663..ade51deb3f6b 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -1,11 +1,12 @@ import crypto from 'node:crypto'; +import { resolve } from 'node:path'; + +import { pathExistsSync } from 'fs-extra'; import type { ErrnoException } from './error_tracker'; import type { CrawlingContext } from '../crawlers/crawler_commons'; import type { KeyValueStore } from '../storages'; -import type { PlaywrightCrawlingContext, PuppeteerCrawlingContext, PlaywrightPage, PuppeteerPage } from '../typedefs'; - -const { PWD, CRAWLEE_STORAGE_DIR, APIFY_IS_AT_HOME } = process.env; +import type { PlaywrightCrawlingContext, PuppeteerCrawlingContext, PlaywrightPage, PuppeteerPage, SnapshotResult } from '../typedefs'; /** * ErrorSnapshotter class is used to capture a screenshot of the page and a snapshot of the HTML when an error occur during web crawling. @@ -16,15 +17,31 @@ export class ErrorSnapshotter { static readonly MAX_FILENAME_LENGTH = 250; static readonly BASE_MESSAGE = 'An error occurred'; static readonly SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT'; - static readonly KEY_VALUE_PLATFORM_PATH = 'https://api.apify.com/v2/key-value-stores'; - static readonly KEY_VALUE_STORE_LOCAL_PATH = `file://${PWD}/${CRAWLEE_STORAGE_DIR || 'storage'}/key_value_stores`; + + private KEY_VALUE_STORE_LOCAL_PATH: string; + private KEY_VALUE_PLATFORM_PATH :string; + + constructor() { + // v3.0.0 used `crawlee_storage` as the default, we changed this in v3.0.1 to just `storage`, + // this function handles it without making BC breaks - it respects existing `crawlee_storage` + // directories, and uses the `storage` only if it's not there. + const defaultStorageDir = () => { + if (pathExistsSync(resolve('./crawlee_storage'))) { + return 'crawlee_storage'; + } + + return 'storage'; + }; + + this.KEY_VALUE_PLATFORM_PATH = 'https://api.apify.com/v2/key-value-stores'; + this.KEY_VALUE_STORE_LOCAL_PATH = `file://${process.env.PWD}/${defaultStorageDir()}/key_value_stores`; + } /** * Capture a snapshot of the error context. */ async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFileUrl?: string; htmlFileUrl?: string }> { try { - const { KEY_VALUE_PLATFORM_PATH, KEY_VALUE_STORE_LOCAL_PATH } = ErrorSnapshotter; const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; const body = context?.body; @@ -41,9 +58,7 @@ export class ErrorSnapshotter { if (page) { const capturedFiles = await this.captureSnapShot( - context as unknown as - | PlaywrightCrawlingContext - | PuppeteerCrawlingContext, + context as unknown as PlaywrightCrawlingContext | PuppeteerCrawlingContext, filename, ); @@ -61,15 +76,15 @@ export class ErrorSnapshotter { htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, filename); } - if (APIFY_IS_AT_HOME) { - const platformPath = `${KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; + if (process.env.APIFY_IS_AT_HOME) { + const platformPath = `${this.KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; return { screenshotFileUrl: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, }; } - const localPath = `${KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; + const localPath = `${this.KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { screenshotFileUrl: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, @@ -82,17 +97,12 @@ export class ErrorSnapshotter { /** * Capture a screenshot and HTML of the page (For Browser only), and return the filename with the extension. */ - async captureSnapShot( - context: PlaywrightCrawlingContext | PuppeteerCrawlingContext, - filename: string): Promise<{ - screenshotFilename?: string; - htmlFileName?: string; - } | undefined> { + async captureSnapShot(context: PlaywrightCrawlingContext | PuppeteerCrawlingContext, filename: string): Promise { try { await context.saveSnapshot({ key: filename }); return { // The screenshot file extension is different for Apify and local environments - screenshotFilename: `${filename}${APIFY_IS_AT_HOME ? '.jpg' : '.jpeg'}`, + screenshotFilename: `${filename}${process.env.APIFY_IS_AT_HOME ? '.jpg' : '.jpeg'}`, htmlFileName: `${filename}.html`, }; } catch (e) { @@ -112,23 +122,22 @@ export class ErrorSnapshotter { } } - /** - * Remove non-word characters from the start and end of a string. - */ - sanitizeString(str: string): string { - return str.replace(/^\W+|\W+$/g, ''); - } - /** * Generate a unique filename for each error snapshot. */ generateFilename(error: ErrnoException): string { const { SNAPSHOT_PREFIX, BASE_MESSAGE, MAX_HASH_LENGTH, MAX_ERROR_CHARACTERS, MAX_FILENAME_LENGTH } = ErrorSnapshotter; - const { sanitizeString } = this; // Create a hash of the error stack trace const errorStackHash = crypto.createHash('sha1').update(error.stack || error.message || '').digest('hex').slice(0, MAX_HASH_LENGTH); const errorMessagePrefix = (error.message || BASE_MESSAGE).slice(0, MAX_ERROR_CHARACTERS).trim(); + /** + * Remove non-word characters from the start and end of a string. + */ + const sanitizeString = (str: string): string => { + return str.replace(/^\W+|\W+$/g, ''); + }; + // Generate filename and remove disallowed characters const filename = `${SNAPSHOT_PREFIX}_${sanitizeString(errorStackHash)}_${sanitizeString(errorMessagePrefix)}` .replace(/\W+/g, '-') // Replace non-word characters with a dash diff --git a/packages/core/src/typedefs.ts b/packages/core/src/typedefs.ts index 2aa731b42012..9d9d5390d65a 100644 --- a/packages/core/src/typedefs.ts +++ b/packages/core/src/typedefs.ts @@ -31,3 +31,6 @@ export interface PlaywrightPage { export interface PuppeteerPage { content: () => Promise; } + +export interface SnapshotOptions { context: PlaywrightCrawlingContext | PuppeteerCrawlingContext; filename: string } +export type SnapshotResult = { screenshotFilename?: string; htmlFileName?: string } | undefined; From 084cb0ccaeed4f2a690269af1d708719e4540729 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 15 May 2024 22:41:59 +0300 Subject: [PATCH 22/28] refactor: Update ErrorSnapshotter class --- .../core/src/crawlers/error_snapshotter.ts | 76 ++-- packages/core/src/crawlers/error_tracker.ts | 8 +- packages/core/src/typedefs.ts | 13 +- packages/utils/src/index.ts | 1 - packages/utils/src/internals/error-tracker.ts | 376 ------------------ test/core/error_tracker.test.ts | 34 +- .../puppeteer-error-snapshot/actor/Dockerfile | 2 +- 7 files changed, 57 insertions(+), 453 deletions(-) delete mode 100644 packages/utils/src/internals/error-tracker.ts diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index ade51deb3f6b..acf095064010 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -1,12 +1,9 @@ import crypto from 'node:crypto'; -import { resolve } from 'node:path'; - -import { pathExistsSync } from 'fs-extra'; import type { ErrnoException } from './error_tracker'; import type { CrawlingContext } from '../crawlers/crawler_commons'; import type { KeyValueStore } from '../storages'; -import type { PlaywrightCrawlingContext, PuppeteerCrawlingContext, PlaywrightPage, PuppeteerPage, SnapshotResult } from '../typedefs'; +import type { BrowserCrawlingContext, BrowserPage, SnapshotResult } from '../typedefs'; /** * ErrorSnapshotter class is used to capture a screenshot of the page and a snapshot of the HTML when an error occur during web crawling. @@ -22,19 +19,8 @@ export class ErrorSnapshotter { private KEY_VALUE_PLATFORM_PATH :string; constructor() { - // v3.0.0 used `crawlee_storage` as the default, we changed this in v3.0.1 to just `storage`, - // this function handles it without making BC breaks - it respects existing `crawlee_storage` - // directories, and uses the `storage` only if it's not there. - const defaultStorageDir = () => { - if (pathExistsSync(resolve('./crawlee_storage'))) { - return 'crawlee_storage'; - } - - return 'storage'; - }; - this.KEY_VALUE_PLATFORM_PATH = 'https://api.apify.com/v2/key-value-stores'; - this.KEY_VALUE_STORE_LOCAL_PATH = `file://${process.env.PWD}/${defaultStorageDir()}/key_value_stores`; + this.KEY_VALUE_STORE_LOCAL_PATH = `file://${process.env.PWD}/storage/key_value_stores`; } /** @@ -42,7 +28,7 @@ export class ErrorSnapshotter { */ async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFileUrl?: string; htmlFileUrl?: string }> { try { - const page = context?.page as PuppeteerPage | PlaywrightPage | undefined; + const page = context?.page as BrowserPage | undefined; const body = context?.body; const keyValueStore = await context?.getKeyValueStore(); @@ -51,79 +37,81 @@ export class ErrorSnapshotter { return {}; } - const filename = this.generateFilename(error); + const fileName = this.generateFilename(error); - let screenshotFilename: string | undefined; + let screenshotFileName: string | undefined; let htmlFileName: string | undefined; if (page) { - const capturedFiles = await this.captureSnapShot( - context as unknown as PlaywrightCrawlingContext | PuppeteerCrawlingContext, - filename, + const capturedFiles = await this.contextCaptureSnapshot( + context as unknown as BrowserCrawlingContext, + fileName, ); if (capturedFiles) { - screenshotFilename = capturedFiles.screenshotFilename; + screenshotFileName = capturedFiles.screenshotFileName; htmlFileName = capturedFiles.htmlFileName; } // If the snapshot for browsers failed to capture the HTML, try to capture it from the page content if (!htmlFileName) { - const html = await page?.content() || undefined; - htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, filename) : undefined; + const html = await page.content(); + htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, fileName) : undefined; } } else if (typeof body === 'string') { // for non-browser contexts - htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, filename); + htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, fileName); } if (process.env.APIFY_IS_AT_HOME) { const platformPath = `${this.KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; return { - screenshotFileUrl: screenshotFilename ? `${platformPath}/${screenshotFilename}` : undefined, + screenshotFileUrl: screenshotFileName ? `${platformPath}/${screenshotFileName}` : undefined, htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, }; } const localPath = `${this.KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { - screenshotFileUrl: screenshotFilename ? `${localPath}/${screenshotFilename}` : undefined, + screenshotFileUrl: screenshotFileName ? `${localPath}/${screenshotFileName}` : undefined, htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, }; - } catch (e) { + } catch { return {}; } } /** - * Capture a screenshot and HTML of the page (For Browser only), and return the filename with the extension. + * Captures a snapshot of the current page using the context.saveSnapshot function. + * This function is applicable for browser contexts only. + * Returns an object containing the filenames of the screenshot and HTML file. */ - async captureSnapShot(context: PlaywrightCrawlingContext | PuppeteerCrawlingContext, filename: string): Promise { + async contextCaptureSnapshot(context: BrowserCrawlingContext, fileName: string): Promise { try { - await context.saveSnapshot({ key: filename }); + await context.saveSnapshot({ key: fileName }); return { // The screenshot file extension is different for Apify and local environments - screenshotFilename: `${filename}${process.env.APIFY_IS_AT_HOME ? '.jpg' : '.jpeg'}`, - htmlFileName: `${filename}.html`, + screenshotFileName: `${fileName}.jpg`, + htmlFileName: `${fileName}.html`, }; - } catch (e) { + } catch { return undefined; } } /** - * Save the HTML snapshot of the page, and return the filename with the extension. + * Save the HTML snapshot of the page, and return the fileName with the extension. */ - async saveHTMLSnapshot(html: string, keyValueStore: KeyValueStore, filename: string): Promise { + async saveHTMLSnapshot(html: string, keyValueStore: KeyValueStore, fileName: string): Promise { try { - await keyValueStore.setValue(filename, html, { contentType: 'text/html' }); - return `${filename}.html`; - } catch (e) { + await keyValueStore.setValue(fileName, html, { contentType: 'text/html' }); + return `${fileName}.html`; + } catch { return undefined; } } /** - * Generate a unique filename for each error snapshot. + * Generate a unique fileName for each error snapshot. */ generateFilename(error: ErrnoException): string { const { SNAPSHOT_PREFIX, BASE_MESSAGE, MAX_HASH_LENGTH, MAX_ERROR_CHARACTERS, MAX_FILENAME_LENGTH } = ErrorSnapshotter; @@ -138,11 +126,11 @@ export class ErrorSnapshotter { return str.replace(/^\W+|\W+$/g, ''); }; - // Generate filename and remove disallowed characters - const filename = `${SNAPSHOT_PREFIX}_${sanitizeString(errorStackHash)}_${sanitizeString(errorMessagePrefix)}` + // Generate fileName and remove disallowed characters + const fileName = `${SNAPSHOT_PREFIX}_${sanitizeString(errorStackHash)}_${sanitizeString(errorMessagePrefix)}` .replace(/\W+/g, '-') // Replace non-word characters with a dash .slice(0, MAX_FILENAME_LENGTH); - return filename; + return fileName; } } diff --git a/packages/core/src/crawlers/error_tracker.ts b/packages/core/src/crawlers/error_tracker.ts index 0f5769468dae..84137ea750e5 100644 --- a/packages/core/src/crawlers/error_tracker.ts +++ b/packages/core/src/crawlers/error_tracker.ts @@ -7,10 +7,10 @@ import type { CrawlingContext } from '../crawlers/crawler_commons'; * Node.js Error interface */ export interface ErrnoException extends Error { - errno?: number | undefined; - code?: string | number | undefined; - path?: string | undefined; - syscall?: string | undefined; + errno?: number; + code?: string | number; + path?: string; + syscall?: string; cause?: any; } diff --git a/packages/core/src/typedefs.ts b/packages/core/src/typedefs.ts index 9d9d5390d65a..5f7063dbb7fe 100644 --- a/packages/core/src/typedefs.ts +++ b/packages/core/src/typedefs.ts @@ -19,18 +19,11 @@ export function keys(obj: T) { export declare type AllowedHttpMethods = 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH'; // Define the following types as we cannot import the complete types from the respective packages -export interface PlaywrightCrawlingContext { +export interface BrowserCrawlingContext { saveSnapshot: (options: { key: string }) => Promise; } -export interface PuppeteerCrawlingContext { - saveSnapshot: (options: { key: string }) => Promise; -} -export interface PlaywrightPage { - content: () => Promise; -} -export interface PuppeteerPage { +export interface BrowserPage { content: () => Promise; } -export interface SnapshotOptions { context: PlaywrightCrawlingContext | PuppeteerCrawlingContext; filename: string } -export type SnapshotResult = { screenshotFilename?: string; htmlFileName?: string } | undefined; +export type SnapshotResult = { screenshotFileName?: string; htmlFileName?: string } | undefined; diff --git a/packages/utils/src/index.ts b/packages/utils/src/index.ts index 1bd8d5901abe..9ba820c5ab78 100644 --- a/packages/utils/src/index.ts +++ b/packages/utils/src/index.ts @@ -7,7 +7,6 @@ export * from './internals/memory-info'; export * from './internals/debug'; export * as social from './internals/social'; export * from './internals/typedefs'; -export * from './internals/error-tracker'; export * from './internals/open_graph_parser'; export * from './internals/gotScraping'; export * from './internals/robots'; diff --git a/packages/utils/src/internals/error-tracker.ts b/packages/utils/src/internals/error-tracker.ts deleted file mode 100644 index 259a4c33b5bd..000000000000 --- a/packages/utils/src/internals/error-tracker.ts +++ /dev/null @@ -1,376 +0,0 @@ -import { inspect } from 'node:util'; - -/** - * Node.js Error interface - */ - interface ErrnoException extends Error { - errno?: number | undefined; - code?: string | number | undefined; - path?: string | undefined; - syscall?: string | undefined; - cause?: any; -} - -export interface ErrorTrackerOptions { - showErrorCode: boolean; - showErrorName: boolean; - showStackTrace: boolean; - showFullStack: boolean; - showErrorMessage: boolean; - showFullMessage: boolean; -} - -const extractPathFromStackTraceLine = (line: string) => { - const lastStartingRoundBracketIndex = line.lastIndexOf('('); - - if (lastStartingRoundBracketIndex !== -1) { - const closingRoundBracketIndex = line.indexOf(')', lastStartingRoundBracketIndex); - - if (closingRoundBracketIndex !== -1) { - return line.slice(lastStartingRoundBracketIndex + 1, closingRoundBracketIndex); - } - } - - return line; -}; - -// https://v8.dev/docs/stack-trace-api#appendix%3A-stack-trace-format -const getPathFromStackTrace = (stack: string[]) => { - for (const line of stack) { - const path = extractPathFromStackTraceLine(line); - - if ( - path.startsWith('node:') - || path.includes('/node_modules/') - || path.includes('\\node_modules\\') - ) { - continue; - } - - return path; - } - - return extractPathFromStackTraceLine(stack[0]); -}; - -const getStackTraceGroup = (error: ErrnoException, storage: Record, showFullStack: boolean) => { - const stack = error.stack?.split('\n').map((line) => line.trim()); - - let sliceAt = -1; - - if (stack) { - for (let i = 0; i < stack.length; i++) { - if (stack[i].startsWith('at ') || stack[i].startsWith('eval at ')) { - sliceAt = i; - break; - } - } - } - - let normalizedStackTrace = null; - if (sliceAt !== -1) { - normalizedStackTrace = showFullStack ? stack!.slice(sliceAt).map((x) => x.trim()).join('\n') : getPathFromStackTrace(stack!.slice(sliceAt)); - } - - if (!normalizedStackTrace) { - normalizedStackTrace = 'missing stack trace'; - } - - if (!(normalizedStackTrace in storage)) { - storage[normalizedStackTrace] = Object.create(null); - } - - return storage[normalizedStackTrace] as Record; -}; - -const getErrorCodeGroup = (error: ErrnoException, storage: Record) => { - let { code } = error; - - if (code === undefined) { - code = 'missing error code'; - } - - if (!(code in storage)) { - storage[code] = Object.create(null); - } - - return storage[String(code)] as Record; -}; - -const getErrorNameGroup = (error: ErrnoException, storage: Record) => { - const { name } = error; - - if (!(name in storage)) { - storage[name] = Object.create(null); - } - - return storage[name] as Record; -}; - -const findBiggestWordIntersection = (a: string[], b: string[]) => { - let maxStreak = 0; - let bStreakIndex = -1; - let aStreakIndex = -1; - for (let aIndex = 0; aIndex < a.length; aIndex++) { - let bIndex = -1; - - do { - let aWalkIndex = aIndex; - - bIndex = b.indexOf(a[aIndex], bIndex + 1); - - let bWalkIndex = bIndex; - - let streak = 0; - while (aWalkIndex < a.length && bWalkIndex < b.length && b[bWalkIndex++] === a[aWalkIndex++]) { - streak++; - } - - if (streak > maxStreak) { - maxStreak = streak; - aStreakIndex = aIndex; - bStreakIndex = bIndex; - } - } while (bIndex !== -1); - } - - return { - maxStreak, - aStreakIndex, - bStreakIndex, - }; -}; - -const arrayCount = (array: unknown[], target: unknown) => { - let result = 0; - - for (const item of array) { - if (item === target) { - result++; - } - } - - return result; -}; - -const calculatePlaceholder = (a: string[], b: string[]) => { - const { maxStreak, aStreakIndex, bStreakIndex } = findBiggestWordIntersection(a, b); - - if (maxStreak === 0) { - return ['_']; - } - - const leftA = a.slice(0, aStreakIndex); - const leftB = b.slice(0, bStreakIndex); - const rightA = a.slice(aStreakIndex + maxStreak); - const rightB = b.slice(bStreakIndex + maxStreak); - - const output: string[] = []; - - if (leftA.length !== 0 || leftB.length !== 0) { - output.push(...calculatePlaceholder(leftA, leftB)); - } - - output.push(...a.slice(aStreakIndex, aStreakIndex + maxStreak)); - - if (rightA.length !== 0 || rightB.length !== 0) { - output.push(...calculatePlaceholder(rightA, rightB)); - } - - return output; -}; - -const normalizedCalculatePlaceholder = (a: string[], b: string[]) => { - const output = calculatePlaceholder(a, b); - - // We can't be too general - if ((arrayCount(output, '_') / output.length) >= 0.5) { - return ['_']; - } - - return output; -}; - -// Merge A (missing placeholders) into B (can contain placeholders but does not have to) -const mergeMessages = (a: string, b: string, storage: Record) => { - const placeholder = normalizedCalculatePlaceholder( - a.split(' '), - b.split(' '), - ).join(' '); - - if (placeholder === '_') { - return undefined; - } - - interface HasCount { - count: number; - } - - const count = (storage[a] as HasCount).count + (storage[b] as HasCount).count; - - delete storage[a]; - delete storage[b]; - - storage[placeholder] = Object.assign(Object.create(null), { - count, - }); - - return placeholder; -}; - -const getErrorMessageGroup = (error: ErrnoException, storage: Record, showFullMessage: boolean) => { - let { message } = error; - - if (!message) { - try { - message = typeof error === 'string' ? error : `Unknown error message. Received non-error object: ${JSON.stringify(error)}`; - } catch { - message = `Unknown error message. Received non-error object, and could not stringify it: ${inspect(error, { depth: 0 })}`; - } - } - - if (!showFullMessage) { - const newLineIndex = message.indexOf('\n'); - message = message.slice(0, newLineIndex === -1 ? undefined : newLineIndex); - } - - if (!(message in storage)) { - storage[message] = Object.assign(Object.create(null), { - count: 0, - }); - - // This actually safe, since we Object.create(null) so no prototype pollution can happen. - // eslint-disable-next-line no-restricted-syntax, guard-for-in - for (const existingMessage in storage) { - const newMessage = mergeMessages(message, existingMessage, storage); - if (newMessage) { - message = newMessage; - break; - } - } - } - - return storage[message] as Record; -}; - -const increaseCount = (group: { count?: number }) => { - if (!('count' in group)) { - // In case users don't want to display error message - group.count = 0; - } - - group.count!++; -}; - -/** - * This class tracks errors and computes a summary of information like: - * - where the errors happened - * - what the error names are - * - what the error codes are - * - what is the general error message - * - * This is extremely useful when there are dynamic error messages, such as argument validation. - * - * Since the structure of the `tracker.result` object differs when using different options, - * it's typed as `Record`. The most deep object has a `count` property, which is a number. - * - * It's possible to get the total amount of errors via the `tracker.total` property. - */ -export class ErrorTracker { - #options: ErrorTrackerOptions; - - result: Record; - - total: number; - - constructor(options: Partial = {}) { - this.#options = { - showErrorCode: true, - showErrorName: true, - showStackTrace: true, - showFullStack: false, - showErrorMessage: true, - showFullMessage: false, - ...options, - }; - - this.result = Object.create(null); - this.total = 0; - } - - add(error: ErrnoException) { - this.total++; - - let group = this.result; - - if (this.#options.showStackTrace) { - group = getStackTraceGroup(error, group, this.#options.showFullStack); - } - - if (this.#options.showErrorCode) { - group = getErrorCodeGroup(error, group); - } - - if (this.#options.showErrorName) { - group = getErrorNameGroup(error, group); - } - - if (this.#options.showErrorMessage) { - group = getErrorMessageGroup(error, group, this.#options.showFullMessage); - } - - increaseCount(group as { count: number }); - - if (typeof error.cause === 'object' && error.cause !== null) { - this.add(error.cause); - } - } - - getUniqueErrorCount() { - let count = 0; - - const goDeeper = (group: Record): void => { - if ('count' in group) { - count++; - return; - } - - // eslint-disable-next-line guard-for-in, no-restricted-syntax - for (const key in group) { - goDeeper(group[key] as Record); - } - }; - - goDeeper(this.result); - - return count; - } - - getMostPopularErrors(count: number) { - const result: [number, string[]][] = []; - - const goDeeper = (group: Record, path: string[]): void => { - if ('count' in group) { - result.push([(group as any).count, path]); - return; - } - - // eslint-disable-next-line guard-for-in, no-restricted-syntax - for (const key in group) { - goDeeper(group[key] as Record, [...path, key]); - } - }; - - goDeeper(this.result, []); - - return result.sort((a, b) => b[0] - a[0]).slice(0, count); - } - - reset() { - // This actually safe, since we Object.create(null) so no prototype pollution can happen. - // eslint-disable-next-line no-restricted-syntax, guard-for-in - for (const key in this.result) { - delete this.result[key]; - } - } -} diff --git a/test/core/error_tracker.test.ts b/test/core/error_tracker.test.ts index c77fac20b61d..f3ae1a9afd5e 100644 --- a/test/core/error_tracker.test.ts +++ b/test/core/error_tracker.test.ts @@ -155,7 +155,7 @@ test('no code is null code', () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error code', async () => { +test('can hide error code', () => { const tracker = new ErrorTracker({ showErrorCode: false, showErrorMessage: true, @@ -182,7 +182,7 @@ test('can hide error code', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error name', async () => { +test('can hide error name', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -209,7 +209,7 @@ test('can hide error name', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error message', async () => { +test('can hide error message', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: false, @@ -236,7 +236,7 @@ test('can hide error message', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can hide error stack', async () => { +test('can hide error stack', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -261,7 +261,7 @@ test('can hide error stack', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can display full stack', async () => { +test('can display full stack', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -290,7 +290,7 @@ test('can display full stack', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('stack looks for user files first', async () => { +test('stack looks for user files first', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -319,7 +319,7 @@ test('stack looks for user files first', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('can shorten the message to the first line', async () => { +test('can shorten the message to the first line', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -347,7 +347,7 @@ test('can shorten the message to the first line', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('supports error.cause', async () => { +test('supports error.cause', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -379,7 +379,7 @@ test('supports error.cause', async () => { expect(tracker.getUniqueErrorCount()).toBe(2); }); -test('placeholder #1', async () => { +test('placeholder #1', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -418,7 +418,7 @@ test('placeholder #1', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #2', async () => { +test('placeholder #2', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -457,7 +457,7 @@ test('placeholder #2', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #3', async () => { +test('placeholder #3', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -496,7 +496,7 @@ test('placeholder #3', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #4', async () => { +test('placeholder #4', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -535,7 +535,7 @@ test('placeholder #4', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #5', async () => { +test('placeholder #5', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -574,7 +574,7 @@ test('placeholder #5', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #6', async () => { +test('placeholder #6', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -613,7 +613,7 @@ test('placeholder #6', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #7', async () => { +test('placeholder #7', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -665,7 +665,7 @@ test('placeholder #7', async () => { expect(tracker.getUniqueErrorCount()).toBe(1); }); -test('placeholder #8', async () => { +test('placeholder #8', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, @@ -702,7 +702,7 @@ test('placeholder #8', async () => { expect(tracker.getUniqueErrorCount()).toBe(2); }); -test('placeholder #9', async () => { +test('placeholder #9', () => { const tracker = new ErrorTracker({ showErrorCode: true, showErrorMessage: true, diff --git a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile index 1e7718842183..4668d1cf6a57 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile +++ b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 AS builder +FROM apify/actor-node-puppeteer-chrome:20-beta AS builder COPY /packages ./packages COPY /package*.json ./ From f239415aa90dccac900d3027a06c7e2faa325215 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Wed, 15 May 2024 22:48:52 +0300 Subject: [PATCH 23/28] revert: Dockerfile node version --- test/e2e/puppeteer-error-snapshot/actor/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile index 4668d1cf6a57..1e7718842183 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/Dockerfile +++ b/test/e2e/puppeteer-error-snapshot/actor/Dockerfile @@ -1,4 +1,4 @@ -FROM apify/actor-node-puppeteer-chrome:20-beta AS builder +FROM node:20 AS builder COPY /packages ./packages COPY /package*.json ./ From 912c2a315ea970fbcaff92e6a954e3043ad2c5ed Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 16 May 2024 07:00:10 +0300 Subject: [PATCH 24/28] refactor: Update ErrorSnapshotter class and removed hardcoded paths --- .../core/src/crawlers/error_snapshotter.ts | 21 ++----------------- .../puppeteer-error-snapshot/actor/main.js | 2 +- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index acf095064010..09c2edaed1bc 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -15,14 +15,6 @@ export class ErrorSnapshotter { static readonly BASE_MESSAGE = 'An error occurred'; static readonly SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT'; - private KEY_VALUE_STORE_LOCAL_PATH: string; - private KEY_VALUE_PLATFORM_PATH :string; - - constructor() { - this.KEY_VALUE_PLATFORM_PATH = 'https://api.apify.com/v2/key-value-stores'; - this.KEY_VALUE_STORE_LOCAL_PATH = `file://${process.env.PWD}/storage/key_value_stores`; - } - /** * Capture a snapshot of the error context. */ @@ -62,18 +54,9 @@ export class ErrorSnapshotter { htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, fileName); } - if (process.env.APIFY_IS_AT_HOME) { - const platformPath = `${this.KEY_VALUE_PLATFORM_PATH}/${keyValueStore.id}/records`; - return { - screenshotFileUrl: screenshotFileName ? `${platformPath}/${screenshotFileName}` : undefined, - htmlFileUrl: htmlFileName ? `${platformPath}/${htmlFileName}` : undefined, - }; - } - - const localPath = `${this.KEY_VALUE_STORE_LOCAL_PATH}/${keyValueStore.name || 'default'}`; return { - screenshotFileUrl: screenshotFileName ? `${localPath}/${screenshotFileName}` : undefined, - htmlFileUrl: htmlFileName ? `${localPath}/${htmlFileName}` : undefined, + screenshotFileUrl: screenshotFileName, + htmlFileUrl: htmlFileName, }; } catch { return {}; diff --git a/test/e2e/puppeteer-error-snapshot/actor/main.js b/test/e2e/puppeteer-error-snapshot/actor/main.js index ed4e7543ec14..70ca82f8f890 100644 --- a/test/e2e/puppeteer-error-snapshot/actor/main.js +++ b/test/e2e/puppeteer-error-snapshot/actor/main.js @@ -38,7 +38,7 @@ await Actor.main(async () => { if (label === LABELS.POST_NAVIGATION_ERROR) { log.error('Post navigation error'); - throw new Error(' Unable to navigate to the requested post'); + throw new Error('Unable to navigate to the requested post'); } }], }); From 70c104df01f80b63ee49ab78bd4e5ebd9dfc7f07 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Thu, 16 May 2024 07:04:40 +0300 Subject: [PATCH 25/28] refactor: Update ErrorSnapshotter class to use more descriptive property names for snapshot files --- packages/core/src/crawlers/error_snapshotter.ts | 6 +++--- packages/core/src/crawlers/error_tracker.ts | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index 09c2edaed1bc..8ff66a6b2ee9 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -18,7 +18,7 @@ export class ErrorSnapshotter { /** * Capture a snapshot of the error context. */ - async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFileUrl?: string; htmlFileUrl?: string }> { + async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<{ screenshotFileName?: string; htmlFileName?: string }> { try { const page = context?.page as BrowserPage | undefined; const body = context?.body; @@ -55,8 +55,8 @@ export class ErrorSnapshotter { } return { - screenshotFileUrl: screenshotFileName, - htmlFileUrl: htmlFileName, + screenshotFileName, + htmlFileName, }; } catch { return {}; diff --git a/packages/core/src/crawlers/error_tracker.ts b/packages/core/src/crawlers/error_tracker.ts index 84137ea750e5..1daa3af64993 100644 --- a/packages/core/src/crawlers/error_tracker.ts +++ b/packages/core/src/crawlers/error_tracker.ts @@ -379,10 +379,10 @@ export class ErrorTracker { } async captureSnapshot(storage: Record, error: ErrnoException, context: CrawlingContext) { - const { screenshotFileUrl, htmlFileUrl } = await this.errorSnapshotter.captureSnapshot(error, context); + const { screenshotFileName, htmlFileName } = await this.errorSnapshotter.captureSnapshot(error, context); - storage.firstErrorScreenshotUrl = screenshotFileUrl; - storage.firstErrorHtmlUrl = htmlFileUrl; + storage.firstErrorScreenshot = screenshotFileName; + storage.firstErrorHtml = htmlFileName; } reset() { From bdcd0dcab2c87a6eb8ab8af6fd5ae442f5310fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Thu, 16 May 2024 09:55:25 +0200 Subject: [PATCH 26/28] fix tests --- packages/core/src/crawlers/error_snapshotter.ts | 1 - test/core/playwright_utils.test.ts | 10 ++++------ test/core/puppeteer_utils.test.ts | 7 +++---- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index 8ff66a6b2ee9..c59d72d2e11a 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -72,7 +72,6 @@ export class ErrorSnapshotter { try { await context.saveSnapshot({ key: fileName }); return { - // The screenshot file extension is different for Apify and local environments screenshotFileName: `${fileName}.jpg`, htmlFileName: `${fileName}.html`, }; diff --git a/test/core/playwright_utils.test.ts b/test/core/playwright_utils.test.ts index cda683af7224..5d4dbdea51d6 100644 --- a/test/core/playwright_utils.test.ts +++ b/test/core/playwright_utils.test.ts @@ -294,23 +294,21 @@ describe('playwrightUtils', () => { const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); const browser = await chromium.launch({ headless: true }); // TODO: get rid of this somehow, crawlee API cannot depend on apify env vars directly - const isAtHome = process.env.APIFY_IS_AT_HOME; try { const page = await browser.newPage(); const contentHTML = '
Div number: 1
'; await page.setContent(contentHTML); - const screenshot = await page.screenshot( - { fullPage: true, type: 'jpeg', quality: 60 }); + const screenshot = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 60 }); // Test saving both image and html const object = { setValue: vitest.fn() }; openKVSSpy.mockResolvedValue(object as any); await playwrightUtils.saveSnapshot(page, { key: 'TEST', keyValueStoreName: 'TEST-STORE', screenshotQuality: 60 }); - expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.jpg' : 'TEST', screenshot, { contentType: 'image/jpeg' }); - expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.html' : 'TEST', contentHTML, { contentType: 'text/html' }); + expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); object.setValue.mockReset(); // Test saving only image @@ -318,7 +316,7 @@ describe('playwrightUtils', () => { // Default quality is 50 const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); - expect(object.setValue).toBeCalledWith(isAtHome ? 'SNAPSHOT.jpg' : 'SNAPSHOT', screenshot2, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); } finally { await browser.close(); } diff --git a/test/core/puppeteer_utils.test.ts b/test/core/puppeteer_utils.test.ts index 5f572c719547..6458760281e9 100644 --- a/test/core/puppeteer_utils.test.ts +++ b/test/core/puppeteer_utils.test.ts @@ -411,7 +411,6 @@ describe('puppeteerUtils', () => { it('saveSnapshot() works', async () => { const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); const browser = await launchPuppeteer(launchContext); - const isAtHome = process.env.APIFY_IS_AT_HOME; try { const page = await browser.newPage(); @@ -425,8 +424,8 @@ describe('puppeteerUtils', () => { openKVSSpy.mockResolvedValue(object as any); await puppeteerUtils.saveSnapshot(page, { key: 'TEST', keyValueStoreName: 'TEST-STORE', screenshotQuality: 60 }); - expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.jpg' : 'TEST', screenshot, { contentType: 'image/jpeg' }); - expect(object.setValue).toBeCalledWith(isAtHome ? 'TEST.html' : 'TEST', contentHTML, { contentType: 'text/html' }); + expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); object.setValue.mockReset(); // Test saving only image @@ -434,7 +433,7 @@ describe('puppeteerUtils', () => { // Default quality is 50 const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); - expect(object.setValue).toBeCalledWith(isAtHome ? 'SNAPSHOT.jpg' : 'SNAPSHOT', screenshot2, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); } finally { await browser.close(); } From 3cf793c3d825487001be1bf6ad570c6347001a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Thu, 16 May 2024 10:17:19 +0200 Subject: [PATCH 27/28] revert to addAsync method --- .../src/internals/basic-crawler.ts | 4 +-- packages/core/src/crawlers/error_tracker.ts | 30 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index acb67db603c8..ff9fb550cf37 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1362,7 +1362,7 @@ export class BasicCrawler { }); + await this.captureSnapshot(group, error, context).catch(() => { }); } if (typeof error.cause === 'object' && error.cause !== null) { - this.add(error.cause); + await this.addAsync(error.cause); } } From 4a59c07d8098b54fc2653ce1d4b5ee083747ecce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Thu, 16 May 2024 10:34:30 +0200 Subject: [PATCH 28/28] make it configurable and opt-in --- .../core/src/crawlers/error_snapshotter.ts | 30 +++++++++++++++++-- packages/core/src/crawlers/error_tracker.ts | 12 ++++++-- packages/core/src/crawlers/statistics.ts | 14 +++++++-- packages/core/src/typedefs.ts | 10 ------- test/core/playwright_utils.test.ts | 1 - 5 files changed, 49 insertions(+), 18 deletions(-) diff --git a/packages/core/src/crawlers/error_snapshotter.ts b/packages/core/src/crawlers/error_snapshotter.ts index c59d72d2e11a..dc773da13720 100644 --- a/packages/core/src/crawlers/error_snapshotter.ts +++ b/packages/core/src/crawlers/error_snapshotter.ts @@ -3,10 +3,34 @@ import crypto from 'node:crypto'; import type { ErrnoException } from './error_tracker'; import type { CrawlingContext } from '../crawlers/crawler_commons'; import type { KeyValueStore } from '../storages'; -import type { BrowserCrawlingContext, BrowserPage, SnapshotResult } from '../typedefs'; + +// Define the following types as we cannot import the complete types from the respective packages +interface BrowserCrawlingContext { + saveSnapshot: (options: { key: string }) => Promise; +} + +interface BrowserPage { + content: () => Promise; +} + +export interface SnapshotResult { + screenshotFileName?: string; + htmlFileName?: string; +} /** - * ErrorSnapshotter class is used to capture a screenshot of the page and a snapshot of the HTML when an error occur during web crawling. + * ErrorSnapshotter class is used to capture a screenshot of the page and a snapshot of the HTML when an error occurs during web crawling. + * + * This functionality is opt-in, and can be enabled via the crawler options: + * + * ```ts + * const crawler = new BasicCrawler({ + * // ... + * statisticsOptions: { + * saveErrorSnapshots: true, + * }, + * }); + * ``` */ export class ErrorSnapshotter { static readonly MAX_ERROR_CHARACTERS = 30; @@ -68,7 +92,7 @@ export class ErrorSnapshotter { * This function is applicable for browser contexts only. * Returns an object containing the filenames of the screenshot and HTML file. */ - async contextCaptureSnapshot(context: BrowserCrawlingContext, fileName: string): Promise { + async contextCaptureSnapshot(context: BrowserCrawlingContext, fileName: string): Promise { try { await context.saveSnapshot({ key: fileName }); return { diff --git a/packages/core/src/crawlers/error_tracker.ts b/packages/core/src/crawlers/error_tracker.ts index 5b88f983c880..e592ca84678c 100644 --- a/packages/core/src/crawlers/error_tracker.ts +++ b/packages/core/src/crawlers/error_tracker.ts @@ -21,6 +21,7 @@ export interface ErrorTrackerOptions { showFullStack: boolean; showErrorMessage: boolean; showFullMessage: boolean; + saveErrorSnapshots: boolean; } const extractPathFromStackTraceLine = (line: string) => { @@ -286,7 +287,7 @@ export class ErrorTracker { total: number; - errorSnapshotter: ErrorSnapshotter; + errorSnapshotter?: ErrorSnapshotter; constructor(options: Partial = {}) { this.#options = { @@ -296,10 +297,13 @@ export class ErrorTracker { showFullStack: false, showErrorMessage: true, showFullMessage: false, + saveErrorSnapshots: false, ...options, }; - this.errorSnapshotter = new ErrorSnapshotter(); + if (this.#options.saveErrorSnapshots) { + this.errorSnapshotter = new ErrorSnapshotter(); + } this.result = Object.create(null); this.total = 0; @@ -399,6 +403,10 @@ export class ErrorTracker { } async captureSnapshot(storage: Record, error: ErrnoException, context: CrawlingContext) { + if (!this.errorSnapshotter) { + return; + } + const { screenshotFileName, htmlFileName } = await this.errorSnapshotter.captureSnapshot(error, context); storage.firstErrorScreenshot = screenshotFileName; diff --git a/packages/core/src/crawlers/statistics.ts b/packages/core/src/crawlers/statistics.ts index 91c97ccd41b1..2f346ca6a202 100644 --- a/packages/core/src/crawlers/statistics.ts +++ b/packages/core/src/crawlers/statistics.ts @@ -66,12 +66,12 @@ export class Statistics { /** * An error tracker for final retry errors. */ - errorTracker = new ErrorTracker(errorTrackerConfig); + errorTracker: ErrorTracker; /** * An error tracker for retry errors prior to the final retry. */ - errorTrackerRetry = new ErrorTracker(errorTrackerConfig); + errorTrackerRetry: ErrorTracker; /** * Statistic instance id. @@ -115,6 +115,7 @@ export class Statistics { keyValueStore: ow.optional.object, config: ow.optional.object, persistenceOptions: ow.optional.object, + saveErrorSnapshots: ow.optional.boolean, })); const { @@ -125,8 +126,11 @@ export class Statistics { persistenceOptions = { enable: true, }, + saveErrorSnapshots = false, } = options; + this.errorTracker = new ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots }); + this.errorTrackerRetry = new ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots }); this.logIntervalMillis = logIntervalSecs * 1000; this.logMessage = logMessage; this.keyValueStore = keyValueStore; @@ -444,6 +448,12 @@ export interface StatisticsOptions { * Control how and when to persist the statistics. */ persistenceOptions?: PersistenceOptions; + + /** + * Save HTML snapshot (and a screenshot if possible) when an error occurs. + * @default false + */ + saveErrorSnapshots?: boolean; } /** diff --git a/packages/core/src/typedefs.ts b/packages/core/src/typedefs.ts index 5f7063dbb7fe..66fc714ecd73 100644 --- a/packages/core/src/typedefs.ts +++ b/packages/core/src/typedefs.ts @@ -17,13 +17,3 @@ export function keys(obj: T) { } export declare type AllowedHttpMethods = 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH'; - -// Define the following types as we cannot import the complete types from the respective packages -export interface BrowserCrawlingContext { - saveSnapshot: (options: { key: string }) => Promise; -} -export interface BrowserPage { - content: () => Promise; -} - -export type SnapshotResult = { screenshotFileName?: string; htmlFileName?: string } | undefined; diff --git a/test/core/playwright_utils.test.ts b/test/core/playwright_utils.test.ts index 5d4dbdea51d6..cd0bdb49e28a 100644 --- a/test/core/playwright_utils.test.ts +++ b/test/core/playwright_utils.test.ts @@ -293,7 +293,6 @@ describe('playwrightUtils', () => { test('saveSnapshot() works', async () => { const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); const browser = await chromium.launch({ headless: true }); - // TODO: get rid of this somehow, crawlee API cannot depend on apify env vars directly try { const page = await browser.newPage();