Skip to content

Commit

Permalink
feat: implement ErrorSnapshotter for error context capture (#2332)
Browse files Browse the repository at this point in the history
This commit introduces the ErrorSnapshotter class to the crawlee
package, providing functionality to capture screenshots and HTML
snapshots when an error occurs during web crawling.

This functionality is opt-in, and can be enabled via the crawler
options:

```ts
const crawler = new BasicCrawler({
  // ...
  statisticsOptions: {
    saveErrorSnapshots: true,
  },
});
```

Closes #2280

---------

Co-authored-by: Martin Adámek <banan23@gmail.com>
  • Loading branch information
HamzaAlwan and B4nan committed May 16, 2024
1 parent 0a3c518 commit e861dfd
Show file tree
Hide file tree
Showing 22 changed files with 533 additions and 26 deletions.
12 changes: 10 additions & 2 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1362,7 +1362,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
const shouldRetryRequest = this._canRequestBeRetried(request, error);

if (shouldRetryRequest) {
this.stats.errorTrackerRetry.add(error);
await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);

if (error instanceof SessionError) {
await this._rotateSession(crawlingContext);
Expand All @@ -1388,7 +1388,15 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}
}

this.stats.errorTracker.add(error);
// If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
// Therefore, we pass the crawlingContext to the errorTracker.addAsync method, enabling snapshot capture.
// This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.
const { noRetry, maxRetries } = request;
if (noRetry || !maxRetries) {
await this.stats.errorTracker.addAsync(error, crawlingContext);
} else {
this.stats.errorTracker.add(error);
}

// If we get here, the request is either not retryable
// or failed more than retryCount times and will not be retried anymore.
Expand Down
142 changes: 142 additions & 0 deletions packages/core/src/crawlers/error_snapshotter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import crypto from 'node:crypto';

import type { ErrnoException } from './error_tracker';
import type { CrawlingContext } from '../crawlers/crawler_commons';
import type { KeyValueStore } from '../storages';

// Define the following types as we cannot import the complete types from the respective packages
/**
 * Minimal structural view of a browser crawling context: only the member the snapshotter calls.
 */
interface BrowserCrawlingContext {
// Called by `ErrorSnapshotter.contextCaptureSnapshot` with the generated file name as `key`.
saveSnapshot: (options: { key: string }) => Promise<void>;
}

/**
 * Minimal structural view of a browser page: only the member the snapshotter calls.
 */
interface BrowserPage {
// Resolves to the page HTML; used as a fallback source when `saveSnapshot` did not produce an HTML file.
content: () => Promise<string>;
}

/**
 * Names of the files produced for one error snapshot.
 * A field is left undefined when the corresponding file was not captured.
 */
export interface SnapshotResult {
screenshotFileName?: string;
htmlFileName?: string;
}

/**
 * ErrorSnapshotter captures a screenshot of the page and a snapshot of its HTML when an error occurs during web crawling.
 *
 * This functionality is opt-in, and can be enabled via the crawler options:
 *
 * ```ts
 * const crawler = new BasicCrawler({
 *     // ...
 *     statisticsOptions: {
 *         saveErrorSnapshots: true,
 *     },
 * });
 * ```
 */
export class ErrorSnapshotter {
    /** Maximum number of characters taken from the error message for the file name. */
    static readonly MAX_ERROR_CHARACTERS = 30;
    /** Maximum number of hex characters taken from the stack hash for the file name. */
    static readonly MAX_HASH_LENGTH = 30;
    /** Upper bound on the length of a generated file name. */
    static readonly MAX_FILENAME_LENGTH = 250;
    /** Message used in the file name when the error carries no usable message. */
    static readonly BASE_MESSAGE = 'An error occurred';
    /** Prefix shared by every generated file name. */
    static readonly SNAPSHOT_PREFIX = 'ERROR_SNAPSHOT';

    /**
     * Capture a snapshot of the error context.
     *
     * When the context exposes a `page`, `contextCaptureSnapshot` is tried first and the page content
     * is used as a fallback for the HTML. Otherwise a string `body` is stored as the HTML snapshot.
     * This method is best-effort: any failure results in an empty object rather than a rejection.
     *
     * @param error The error that triggered the snapshot; only used to derive the file name.
     * @param context The crawling context of the failed request.
     * @returns The names of the captured files; fields are undefined for files that were not captured.
     */
    async captureSnapshot(error: ErrnoException, context: CrawlingContext): Promise<SnapshotResult> {
        try {
            const page = context?.page as BrowserPage | undefined;
            const body = context?.body;

            // Nothing to snapshot, so bail out before opening the key-value store.
            if (!body && !page) {
                return {};
            }

            const keyValueStore = await context?.getKeyValueStore();
            if (!keyValueStore) {
                return {};
            }

            const fileName = this.generateFilename(error);

            let screenshotFileName: string | undefined;
            let htmlFileName: string | undefined;

            if (page) {
                const capturedFiles = await this.contextCaptureSnapshot(
                    context as unknown as BrowserCrawlingContext,
                    fileName,
                );

                if (capturedFiles) {
                    screenshotFileName = capturedFiles.screenshotFileName;
                    htmlFileName = capturedFiles.htmlFileName;
                }

                // If the snapshot for browsers failed to capture the HTML, try to capture it from the page content
                if (!htmlFileName) {
                    const html = await page.content();
                    htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, fileName) : undefined;
                }
            } else if (typeof body === 'string') { // for non-browser contexts
                htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, fileName);
            }

            return {
                screenshotFileName,
                htmlFileName,
            };
        } catch {
            // Snapshots are diagnostic extras; a failure here must never affect the crawl itself.
            return {};
        }
    }

    /**
     * Captures a snapshot of the current page using the `context.saveSnapshot` function.
     * This function is applicable for browser contexts only.
     *
     * @param context Browser crawling context exposing `saveSnapshot`.
     * @param fileName Key passed to `saveSnapshot`.
     * @returns `${fileName}.jpg` and `${fileName}.html` on success, or undefined when `saveSnapshot` throws.
     */
    async contextCaptureSnapshot(context: BrowserCrawlingContext, fileName: string): Promise<SnapshotResult | undefined> {
        try {
            await context.saveSnapshot({ key: fileName });
            // NOTE(review): assumes `saveSnapshot` stores the files under `<key>.jpg` and `<key>.html` - confirm.
            return {
                screenshotFileName: `${fileName}.jpg`,
                htmlFileName: `${fileName}.html`,
            };
        } catch {
            return undefined;
        }
    }

    /**
     * Save the HTML snapshot of the page, and return the fileName with the extension.
     *
     * @param html HTML to store.
     * @param keyValueStore Store that receives the value with the `text/html` content type.
     * @param fileName Key under which the HTML is stored.
     * @returns `${fileName}.html` on success, or undefined when storing fails.
     */
    async saveHTMLSnapshot(html: string, keyValueStore: KeyValueStore, fileName: string): Promise<string | undefined> {
        try {
            // NOTE(review): the value is stored under `fileName` while the returned name carries `.html` -
            // confirm the store derives the extension from the content type.
            await keyValueStore.setValue(fileName, html, { contentType: 'text/html' });
            return `${fileName}.html`;
        } catch {
            return undefined;
        }
    }

    /**
     * Generate a fileName for an error snapshot, of the form `<prefix>_<stack hash>_<message excerpt>`.
     *
     * The name only contains word characters and dashes. `BASE_MESSAGE` is used as the excerpt whenever
     * the error message is missing or contains nothing usable (e.g. only whitespace or punctuation).
     *
     * @param error The error to derive the name from. Non-Error values are tolerated.
     * @returns The file name without an extension, at most `MAX_FILENAME_LENGTH` characters long.
     */
    generateFilename(error: ErrnoException): string {
        const { SNAPSHOT_PREFIX, BASE_MESSAGE, MAX_HASH_LENGTH, MAX_ERROR_CHARACTERS, MAX_FILENAME_LENGTH } = ErrorSnapshotter;

        // Thrown values are not guaranteed to be Error instances, so read the fields defensively.
        const stack = typeof error?.stack === 'string' ? error.stack : '';
        const message = typeof error?.message === 'string' ? error.message : '';

        // Create a hash of the error stack trace (hex output, so it needs no sanitizing)
        const errorStackHash = crypto.createHash('sha1').update(stack || message).digest('hex').slice(0, MAX_HASH_LENGTH);

        /**
         * Remove non-word characters from the start and end of a string.
         */
        const sanitizeString = (str: string): string => {
            return str.replace(/^\W+|\W+$/g, '');
        };

        const toMessagePrefix = (text: string): string => sanitizeString(text.slice(0, MAX_ERROR_CHARACTERS).trim());

        // A message made only of non-word characters sanitizes to an empty string, which would leave
        // a dangling separator at the end of the name; fall back to the base message in that case.
        const errorMessagePrefix = toMessagePrefix(message) || toMessagePrefix(BASE_MESSAGE);

        // Generate fileName and remove disallowed characters
        const fileName = `${SNAPSHOT_PREFIX}_${errorStackHash}_${errorMessagePrefix}`
            .replace(/\W+/g, '-') // Replace non-word characters with a dash
            .slice(0, MAX_FILENAME_LENGTH);

        return fileName;
    }
}
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import { inspect } from 'node:util';

import { ErrorSnapshotter } from './error_snapshotter';
import type { CrawlingContext } from '../crawlers/crawler_commons';

/**
* Node.js Error interface
*/
interface ErrnoException extends Error {
errno?: number | undefined;
code?: string | number | undefined;
path?: string | undefined;
syscall?: string | undefined;
export interface ErrnoException extends Error {
errno?: number;
code?: string | number;
path?: string;
syscall?: string;
cause?: any;
}

Expand All @@ -18,6 +21,7 @@ export interface ErrorTrackerOptions {
showFullStack: boolean;
showErrorMessage: boolean;
showFullMessage: boolean;
saveErrorSnapshots: boolean;
}

const extractPathFromStackTraceLine = (line: string) => {
Expand Down Expand Up @@ -283,6 +287,8 @@ export class ErrorTracker {

total: number;

errorSnapshotter?: ErrorSnapshotter;

constructor(options: Partial<ErrorTrackerOptions> = {}) {
this.#options = {
showErrorCode: true,
Expand All @@ -291,16 +297,19 @@ export class ErrorTracker {
showFullStack: false,
showErrorMessage: true,
showFullMessage: false,
saveErrorSnapshots: false,
...options,
};

if (this.#options.saveErrorSnapshots) {
this.errorSnapshotter = new ErrorSnapshotter();
}

this.result = Object.create(null);
this.total = 0;
}

add(error: ErrnoException) {
this.total++;

private updateGroup(error: ErrnoException) {
let group = this.result;

if (this.#options.showStackTrace) {
Expand All @@ -321,11 +330,38 @@ export class ErrorTracker {

increaseCount(group as { count: number });

return group;
}

/**
 * Records one occurrence of `error`: bumps the running total, updates the matching group,
 * and then records the error's `cause` the same way when the cause is a non-null object.
 */
add(error: ErrnoException) {
    this.total += 1;
    this.updateGroup(error);

    const { cause } = error;
    if (cause !== null && typeof cause === 'object') {
        this.add(cause);
    }
}

/**
 * Asynchronous variant of `add` that can additionally capture a snapshot of the error context.
 * It exists as a separate method so that the synchronous `add` keeps its signature.
 *
 * A snapshot is attempted only when `context` is given and the updated group reports a count of 1.
 * Snapshot failures are ignored. The error's `cause` is recorded without a context.
 */
async addAsync(error: ErrnoException, context?: CrawlingContext) {
    this.total += 1;

    const group = this.updateGroup(error);
    const isFirstOccurrence = group.count === 1;

    // Capture a snapshot (screenshot and HTML) on the first occurrence of an error
    if (isFirstOccurrence && context) {
        const pendingSnapshot = this.captureSnapshot(group, error, context);
        await pendingSnapshot.catch(() => { });
    }

    const { cause } = error;
    if (cause !== null && typeof cause === 'object') {
        await this.addAsync(cause);
    }
}

getUniqueErrorCount() {
let count = 0;

Expand Down Expand Up @@ -366,6 +402,17 @@ export class ErrorTracker {
return result.sort((a, b) => b[0] - a[0]).slice(0, count);
}

/**
 * Captures a snapshot for `error` through the configured `errorSnapshotter` and writes the resulting
 * file names to `storage.firstErrorScreenshot` and `storage.firstErrorHtml`.
 * Does nothing when no snapshotter is configured.
 */
async captureSnapshot(storage: Record<string, unknown>, error: ErrnoException, context: CrawlingContext) {
    const snapshotter = this.errorSnapshotter;

    if (snapshotter) {
        const files = await snapshotter.captureSnapshot(error, context);

        storage.firstErrorScreenshot = files.screenshotFileName;
        storage.firstErrorHtml = files.htmlFileName;
    }
}

reset() {
// This is actually safe, since we use Object.create(null), so no prototype pollution can happen.
// eslint-disable-next-line no-restricted-syntax, guard-for-in
Expand Down
2 changes: 2 additions & 0 deletions packages/core/src/crawlers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ export * from './crawler_commons';
export * from './crawler_extension';
export * from './crawler_utils';
export * from './statistics';
export * from './error_tracker';
export * from './error_snapshotter';
16 changes: 13 additions & 3 deletions packages/core/src/crawlers/statistics.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { ErrorTracker } from '@crawlee/utils';
import ow from 'ow';

import { ErrorTracker } from './error_tracker';
import { Configuration } from '../configuration';
import type { EventManager } from '../events/event_manager';
import { EventType } from '../events/event_manager';
Expand Down Expand Up @@ -66,12 +66,12 @@ export class Statistics {
/**
* An error tracker for final retry errors.
*/
errorTracker = new ErrorTracker(errorTrackerConfig);
errorTracker: ErrorTracker;

/**
* An error tracker for retry errors prior to the final retry.
*/
errorTrackerRetry = new ErrorTracker(errorTrackerConfig);
errorTrackerRetry: ErrorTracker;

/**
* Statistic instance id.
Expand Down Expand Up @@ -115,6 +115,7 @@ export class Statistics {
keyValueStore: ow.optional.object,
config: ow.optional.object,
persistenceOptions: ow.optional.object,
saveErrorSnapshots: ow.optional.boolean,
}));

const {
Expand All @@ -125,8 +126,11 @@ export class Statistics {
persistenceOptions = {
enable: true,
},
saveErrorSnapshots = false,
} = options;

this.errorTracker = new ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots });
this.errorTrackerRetry = new ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots });
this.logIntervalMillis = logIntervalSecs * 1000;
this.logMessage = logMessage;
this.keyValueStore = keyValueStore;
Expand Down Expand Up @@ -444,6 +448,12 @@ export interface StatisticsOptions {
* Control how and when to persist the statistics.
*/
persistenceOptions?: PersistenceOptions;

/**
* Save HTML snapshot (and a screenshot if possible) when an error occurs.
* @default false
*/
saveErrorSnapshots?: boolean;
}

/**
Expand Down
1 change: 0 additions & 1 deletion packages/utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ export * from './internals/memory-info';
export * from './internals/debug';
export * as social from './internals/social';
export * from './internals/typedefs';
export * from './internals/error_tracker';
export * from './internals/open_graph_parser';
export * from './internals/gotScraping';
export * from './internals/robots';
Expand Down
2 changes: 1 addition & 1 deletion packages/utils/test/non-error-objects-working.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { ErrorTracker } from '../src/internals/error_tracker';
import { ErrorTracker } from '../../core/src/crawlers/error_tracker';

describe('ErrorTracker', () => {
test('processing a non-error error should not crash', () => {
Expand Down
4 changes: 1 addition & 3 deletions test/core/error_tracker.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
/* eslint-disable no-multi-spaces */
import exp from 'node:constants';

import { ErrorTracker } from '../../packages/utils/src/internals/error_tracker';
import { ErrorTracker } from '../../packages/core/src/crawlers/error_tracker';

const random = () => Math.random().toString(36).slice(2);

Expand Down
7 changes: 7 additions & 0 deletions test/e2e/cheerio-error-snapshot/actor/.actor/actor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"actorSpecification": 1,
"name": "test-cheerio-error-snapshot",
"version": "0.0",
"buildTag": "latest",
"env": null
}
7 changes: 7 additions & 0 deletions test/e2e/cheerio-error-snapshot/actor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage

0 comments on commit e861dfd

Please sign in to comment.