feat(jsdom-crawler): add runScripts option (#1668)
szmarczak committed Nov 14, 2022
1 parent 0bc147e commit 8ef90bc
Showing 13 changed files with 206 additions and 61 deletions.
5 changes: 3 additions & 2 deletions docs/examples/jsdom_crawler.mdx
@@ -7,8 +7,9 @@ import CodeBlock from '@theme/CodeBlock';
 import ApiLink from '@site/src/components/ApiLink';
 import JSDOMCrawlerSource from '!!raw-loader!./jsdom_crawler.ts';
 
-This example demonstrates how to use <ApiLink to="jsdom-crawler/class/JSDOMCrawler">`JSDOMCrawler`</ApiLink> to crawl a list of URLs from an external file, load each URL using a plain HTTP request, parse the HTML using the [jsdom](https://www.npmjs.com/package/jsdom) DOM implementation and extract some data from it: the page title and all `h1` tags.
+This example demonstrates how to use <ApiLink to="jsdom-crawler/class/JSDOMCrawler">`JSDOMCrawler`</ApiLink> to interact with a website using the [jsdom](https://www.npmjs.com/package/jsdom) DOM implementation.
+Here the script will open a calculator app from the [React examples](https://reactjs.org/community/examples.html), click `1` `+` `1` `=` and extract the result.
 
-<CodeBlock className="language-js">
+<CodeBlock className="language-ts">
 {JSDOMCrawlerSource}
 </CodeBlock>
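For readers skimming the diff below, the rewritten example boils down to the following cleaned-up rendering of the new docs/examples/jsdom_crawler.ts; the button indices and the `.component-display` selector are taken directly from that file:

```ts
import { JSDOMCrawler } from '@crawlee/jsdom';

const crawler = new JSDOMCrawler({
    // Let jsdom download and execute the calculator's scripts.
    runScripts: true,
    requestHandler: async ({ window }) => {
        const { document } = window;
        // Click the calculator buttons: 1 + 1 =
        document.querySelectorAll('button')[12].click(); // 1
        document.querySelectorAll('button')[15].click(); // +
        document.querySelectorAll('button')[12].click(); // 1
        document.querySelectorAll('button')[18].click(); // =

        // Read the rendered result from the display.
        const result = document.querySelectorAll('.component-display')[0].childNodes[0] as Element;
        console.log(result.innerHTML); // 2
    },
});

await crawler.run(['https://ahfarmer.github.io/calculator/']);
```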
66 changes: 11 additions & 55 deletions docs/examples/jsdom_crawler.ts
@@ -1,64 +1,20 @@
-import { Dataset, JSDOMCrawler, log, LogLevel } from 'crawlee';
+import { JSDOMCrawler } from '@crawlee/jsdom';
 
-// Crawlers come with various utilities, e.g. for logging.
-// Here we use debug level of logging to improve the debugging experience.
-// This functionality is optional!
-log.setLevel(LogLevel.DEBUG);
-
-// Create an instance of the CheerioCrawler class - a crawler
-// that automatically loads the URLs and parses their HTML using the cheerio library.
 const crawler = new JSDOMCrawler({
-    // The crawler downloads and processes the web pages in parallel, with a concurrency
-    // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
-    // Here we define some hard limits for the concurrency.
-    minConcurrency: 10,
-    maxConcurrency: 50,
-
-    // On error, retry each page at most once.
-    maxRequestRetries: 1,
-
-    // Increase the timeout for processing of each page.
-    requestHandlerTimeoutSecs: 30,
-
-    // Limit to 10 requests per one crawl
-    maxRequestsPerCrawl: 10,
+    runScripts: true,
+    requestHandler: async ({ window }) => {
+        const { document } = window;
+        document.querySelectorAll('button')[12].click(); // 1
+        document.querySelectorAll('button')[15].click(); // +
+        document.querySelectorAll('button')[12].click(); // 1
+        document.querySelectorAll('button')[18].click(); // =
 
-    // This function will be called for each URL to crawl.
-    // It accepts a single parameter, which is an object with options as:
-    // https://crawlee.dev/api/cheerio-crawler/interface/CheerioCrawlerOptions#requestHandler
-    // We use for demonstration only 2 of them:
-    // - request: an instance of the Request class with information such as the URL that is being crawled and HTTP method
-    // - $: the cheerio object containing parsed HTML
-    async requestHandler({ request, window }) {
-        log.debug(`Processing ${request.url}...`);
+        const result = document.querySelectorAll('.component-display')[0].childNodes[0] as Element;
 
-        // Extract data from the page
-        const title = window.document.title;
-        const h1texts: { text: string }[] = [];
-        document.querySelectorAll('h1').forEach((element) => {
-            h1texts.push({
-                text: element.textContent!,
-            });
-        });
-
-        // Store the results to the dataset. In local configuration,
-        // the data will be stored as JSON files in ./storage/datasets/default
-        await Dataset.pushData({
-            url: request.url,
-            title,
-            h1texts,
-        });
-    },
-
-    // This function is called if the page processing failed more than maxRequestRetries + 1 times.
-    failedRequestHandler({ request }) {
-        log.debug(`Request ${request.url} failed twice.`);
+        console.log(result.innerHTML); // 2
     },
 });
 
-// Run the crawler and wait for it to finish.
 await crawler.run([
-    'https://crawlee.dev',
+    'https://ahfarmer.github.io/calculator/',
 ]);
-
-log.debug('Crawler finished.');
3 changes: 2 additions & 1 deletion package-lock.json
2 changes: 2 additions & 0 deletions packages/browser-pool/src/abstract-classes/browser-plugin.ts
@@ -13,6 +13,8 @@ import type { UnwrapPromise } from '../utils';
  * - without using a fingerprint,
  * - without specifying a user agent.
  * Last updated on 2022-05-05.
+ *
+ * After you update it here, please update it also in jsdom-crawler.ts
  */
 export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36';
 
3 changes: 2 additions & 1 deletion packages/jsdom-crawler/package.json
@@ -57,6 +57,7 @@
         "@crawlee/http": "^3.1.1",
         "@crawlee/types": "^3.1.1",
         "@types/jsdom": "^20.0.0",
-        "jsdom": "^20.0.0"
+        "jsdom": "^20.0.0",
+        "ow": "^0.28.2"
     }
 }
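The new `ow` dependency backs runtime validation of the added option (see `optionsShape` in the crawler source further down). A minimal sketch of that validation pattern, under the assumption that the shape is checked with `ow` at construction time; this is illustrative, not the crawler's actual code:

```ts
import ow from 'ow';

// Hypothetical shape mirroring the crawler's `optionsShape` entry for the new option.
const optionsShape = {
    runScripts: ow.optional.boolean,
};

// Passes: `runScripts` is a boolean (or omitted entirely).
ow({ runScripts: true }, ow.object.partialShape(optionsShape));

// Throws: `runScripts` is not a boolean.
try {
    ow({ runScripts: 'yes' }, ow.object.partialShape(optionsShape));
} catch (error) {
    console.log((error as Error).message);
}
```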
55 changes: 53 additions & 2 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -1,3 +1,4 @@
+import ow from 'ow';
 import type {
     HttpCrawlerOptions,
     InternalHttpCrawlingContext,
@@ -6,12 +7,13 @@ import type {
     RequestHandler,
     EnqueueLinksOptions,
     RequestQueue,
+    Configuration,
 } from '@crawlee/http';
 import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering } from '@crawlee/http';
 import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
 import { concatStreamToBuffer } from '@apify/utilities';
 import type { DOMWindow } from 'jsdom';
-import { JSDOM } from 'jsdom';
+import { JSDOM, ResourceLoader } from 'jsdom';
 import type { IncomingMessage } from 'http';
 
 export type JSDOMErrorHandler<
@@ -22,7 +24,12 @@ export type JSDOMErrorHandler<
 export interface JSDOMCrawlerOptions<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-> extends HttpCrawlerOptions<JSDOMCrawlingContext<UserData, JSONData>> {}
+> extends HttpCrawlerOptions<JSDOMCrawlingContext<UserData, JSONData>> {
+    /**
+     * Download and run scripts.
+     */
+    runScripts?: boolean;
+}
 
 export type JSDOMHook<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
@@ -54,6 +61,12 @@ export type JSDOMRequestHandler<
  * to display the content, you might need to use {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead,
  * because it loads the pages using full-featured headless Chrome browser.
  *
+ * Alternatively, you can use {@apilink JSDOMCrawlerOptions.runScripts} to run website scripts in Node.
+ * JSDOM does not implement all web standards, so some websites may break.
+ *
+ * **Limitation**:
+ * This crawler does not support proxies and cookies yet (each page open starts with an empty cookie store), and the user agent is always set to `Chrome`.
+ *
  * `JSDOMCrawler` downloads each URL using a plain HTTP request,
  * parses the HTML content using [JSDOM](https://www.npmjs.com/package/jsdom)
  * and then invokes the user-provided {@apilink JSDOMCrawlerOptions.requestHandler} to extract page data
@@ -110,15 +123,53 @@
  * ```
  * @category Crawlers
  */
+const resources = new ResourceLoader({
+    // Copy from /packages/browser-pool/src/abstract-classes/browser-plugin.ts:17
+    // in order not to include the entire package here
+    userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
+});
+
 export class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
+    protected static override optionsShape = {
+        ...HttpCrawler.optionsShape,
+        runScripts: ow.optional.boolean,
+    };
+
+    protected runScripts: boolean;
+
+    constructor(options: JSDOMCrawlerOptions = {}, config?: Configuration) {
+        const {
+            runScripts = false,
+            ...httpOptions
+        } = options;
+
+        super(httpOptions, config);
+
+        this.runScripts = runScripts;
+    }
+
+    protected override async _cleanupContext(context: JSDOMCrawlingContext) {
+        context.window?.close();
+    }
+
     protected override async _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: JSDOMCrawlingContext) {
         const body = await concatStreamToBuffer(response);
 
         const { window } = new JSDOM(body, {
             url: response.url,
             contentType: isXml ? 'text/xml' : 'text/html',
+            runScripts: this.runScripts ? 'dangerously' : undefined,
+            resources,
         });
+
+        if (this.runScripts) {
+            await new Promise<void>((resolve) => {
+                window.addEventListener('load', () => {
+                    resolve();
+                });
+            });
+        }
 
         return {
             window,
             get body() {
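To summarize the change above: when `runScripts` is enabled, the crawler asks jsdom to execute page scripts "dangerously", fetches sub-resources through a shared `ResourceLoader` configured with a Chrome user agent, and waits for the window `load` event before handing the DOM to the request handler. A rough standalone sketch of that jsdom pattern, not the crawler's exact code:

```ts
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader } from 'jsdom';

// Shared loader so external scripts and styles are fetched with a browser-like user agent.
const resources = new ResourceLoader({
    userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
});

async function parseWithScripts(html: string, url: string): Promise<DOMWindow> {
    const dom = new JSDOM(html, {
        url,
        contentType: 'text/html',
        runScripts: 'dangerously', // execute inline and external scripts
        resources,
    });

    // Give the page's scripts a chance to run before the DOM is used.
    await new Promise<void>((resolve) => {
        dom.window.addEventListener('load', () => resolve());
    });

    return dom.window;
}
```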
11 changes: 11 additions & 0 deletions test/e2e/jsdom-react-ts/actor/.gitignore
@@ -0,0 +1,11 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage
main.d.ts
main.d.ts.map
main.js
main.js.map
28 changes: 28 additions & 0 deletions test/e2e/jsdom-react-ts/actor/Dockerfile
@@ -0,0 +1,28 @@
# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:16-beta AS builder

# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
&& npm run build

# create final image
FROM apify/actor-node:16-beta
# copy only necessary files
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# install only prod deps
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& npm update \
&& echo "Installed NPM packages:" \
&& (npm list --only=prod --no-optional --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version

# run compiled code
CMD npm run start:prod
6 changes: 6 additions & 0 deletions test/e2e/jsdom-react-ts/actor/apify.json
@@ -0,0 +1,6 @@
{
"name": "test-jsdom-react-ts",
"version": "0.0",
"buildTag": "latest",
"env": null
}
31 changes: 31 additions & 0 deletions test/e2e/jsdom-react-ts/actor/main.ts
@@ -0,0 +1,31 @@
import { Actor } from 'apify';
import { JSDOMCrawler, Dataset } from '@crawlee/jsdom';
import { ApifyStorageLocal } from '@apify/storage-local';

if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
await Actor.init({ storage: new ApifyStorageLocal() });
} else {
await Actor.init();
}

const crawler = new JSDOMCrawler({
runScripts: true,
requestHandler: async ({ window }) => {
const { document } = window;
document.querySelectorAll('button')[12].click(); // 1
document.querySelectorAll('button')[15].click(); // +
document.querySelectorAll('button')[12].click(); // 1
document.querySelectorAll('button')[18].click(); // =

// 2
const { innerHTML } = document.querySelectorAll('.component-display')[0].childNodes[0] as Element;

await Dataset.pushData({ result: innerHTML });
},
});

await crawler.run([
'https://ahfarmer.github.io/calculator/',
]);

await Actor.exit({ exit: Actor.isAtHome() });
35 changes: 35 additions & 0 deletions test/e2e/jsdom-react-ts/actor/package.json
@@ -0,0 +1,35 @@
{
"name": "test-jsdom-react-ts",
"version": "0.0.1",
"description": "JSDOM Crawler Test - React - TypeScript",
"dependencies": {
"apify": "next",
"@apify/storage-local": "^2.1.0",
"@crawlee/basic": "file:./packages/basic-crawler",
"@crawlee/browser-pool": "file:./packages/browser-pool",
"@crawlee/http": "file:./packages/http-crawler",
"@crawlee/jsdom": "file:./packages/jsdom-crawler",
"@crawlee/core": "file:./packages/core",
"@crawlee/memory-storage": "file:./packages/memory-storage",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
},
"overrides": {
"apify": {
"@crawlee/core": "file:./packages/core",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
}
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"typescript": "4.7.4"
},
"scripts": {
"start": "tsc && node main.js",
"start:prod": "node main.js",
"build": "tsc"
},
"type": "module",
"license": "ISC"
}
11 changes: 11 additions & 0 deletions test/e2e/jsdom-react-ts/actor/tsconfig.json
@@ -0,0 +1,11 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "ES2022",
"target": "ES2022",
"lib": ["DOM"]
},
"include": [
"./**/*.ts"
]
}
11 changes: 11 additions & 0 deletions test/e2e/jsdom-react-ts/test.mjs
@@ -0,0 +1,11 @@
import { initialize, getActorTestDir, runActor, expect, validateDataset } from '../tools.mjs';

const testActorDirname = getActorTestDir(import.meta.url);
await initialize(testActorDirname);

const { stats, datasetItems } = await runActor(testActorDirname);

await expect(stats.requestsFinished === 1, 'All requests finished');
await expect(datasetItems.length === 1, 'Number of dataset items');
await expect(validateDataset(datasetItems, ['result']), 'Dataset items validation');
await expect(datasetItems[0].result, 'Dataset items');
