feat(jsdom-crawler): add runScripts option (#1668)
szmarczak committed Nov 14, 2022
1 parent 0bc147e commit 8ef90bc
Showing 13 changed files with 206 additions and 61 deletions.
5 changes: 3 additions & 2 deletions docs/examples/jsdom_crawler.mdx
@@ -7,8 +7,9 @@ import CodeBlock from '@theme/CodeBlock';
 import ApiLink from '@site/src/components/ApiLink';
 import JSDOMCrawlerSource from '!!raw-loader!./jsdom_crawler.ts';
 
-This example demonstrates how to use <ApiLink to="jsdom-crawler/class/JSDOMCrawler">`JSDOMCrawler`</ApiLink> to crawl a list of URLs from an external file, load each URL using a plain HTTP request, parse the HTML using the [jsdom](https://www.npmjs.com/package/jsdom) DOM implementation and extract some data from it: the page title and all `h1` tags.
+This example demonstrates how to use <ApiLink to="jsdom-crawler/class/JSDOMCrawler">`JSDOMCrawler`</ApiLink> to interact with a website using the [jsdom](https://www.npmjs.com/package/jsdom) DOM implementation.
+Here the script will open a calculator app from the [React examples](https://reactjs.org/community/examples.html), click `1` `+` `1` `=` and extract the result.
 
-<CodeBlock className="language-js">
+<CodeBlock className="language-ts">
 {JSDOMCrawlerSource}
 </CodeBlock>
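For readers skimming the diff below, the rewritten example boils down to the following cleaned-up rendering of the new docs/examples/jsdom_crawler.ts; the button indices and the `.component-display` selector are taken directly from that file:

```ts
import { JSDOMCrawler } from '@crawlee/jsdom';

const crawler = new JSDOMCrawler({
    // Let jsdom download and execute the calculator's scripts.
    runScripts: true,
    requestHandler: async ({ window }) => {
        const { document } = window;
        // Click the calculator buttons: 1 + 1 =
        document.querySelectorAll('button')[12].click(); // 1
        document.querySelectorAll('button')[15].click(); // +
        document.querySelectorAll('button')[12].click(); // 1
        document.querySelectorAll('button')[18].click(); // =

        // Read the rendered result from the display.
        const result = document.querySelectorAll('.component-display')[0].childNodes[0] as Element;
        console.log(result.innerHTML); // 2
    },
});

await crawler.run(['https://ahfarmer.github.io/calculator/']);
```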
66 changes: 11 additions & 55 deletions docs/examples/jsdom_crawler.ts
@@ -1,64 +1,20 @@
-import { Dataset, JSDOMCrawler, log, LogLevel } from 'crawlee';
+import { JSDOMCrawler } from '@crawlee/jsdom';
 
-// Crawlers come with various utilities, e.g. for logging.
-// Here we use debug level of logging to improve the debugging experience.
-// This functionality is optional!
-log.setLevel(LogLevel.DEBUG);
-
-// Create an instance of the CheerioCrawler class - a crawler
-// that automatically loads the URLs and parses their HTML using the cheerio library.
 const crawler = new JSDOMCrawler({
-    // The crawler downloads and processes the web pages in parallel, with a concurrency
-    // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
-    // Here we define some hard limits for the concurrency.
-    minConcurrency: 10,
-    maxConcurrency: 50,
-
-    // On error, retry each page at most once.
-    maxRequestRetries: 1,
-
-    // Increase the timeout for processing of each page.
-    requestHandlerTimeoutSecs: 30,
-
-    // Limit to 10 requests per one crawl
-    maxRequestsPerCrawl: 10,
+    runScripts: true,
+    requestHandler: async ({ window }) => {
+        const { document } = window;
+        document.querySelectorAll('button')[12].click(); // 1
+        document.querySelectorAll('button')[15].click(); // +
+        document.querySelectorAll('button')[12].click(); // 1
+        document.querySelectorAll('button')[18].click(); // =
 
-    // This function will be called for each URL to crawl.
-    // It accepts a single parameter, which is an object with options as:
-    // https://crawlee.dev/api/cheerio-crawler/interface/CheerioCrawlerOptions#requestHandler
-    // We use for demonstration only 2 of them:
-    // - request: an instance of the Request class with information such as the URL that is being crawled and HTTP method
-    // - $: the cheerio object containing parsed HTML
-    async requestHandler({ request, window }) {
-        log.debug(`Processing ${request.url}...`);
+        const result = document.querySelectorAll('.component-display')[0].childNodes[0] as Element;
 
-        // Extract data from the page
-        const title = window.document.title;
-        const h1texts: { text: string }[] = [];
-        document.querySelectorAll('h1').forEach((element) => {
-            h1texts.push({
-                text: element.textContent!,
-            });
-        });
-
-        // Store the results to the dataset. In local configuration,
-        // the data will be stored as JSON files in ./storage/datasets/default
-        await Dataset.pushData({
-            url: request.url,
-            title,
-            h1texts,
-        });
-    },
-
-    // This function is called if the page processing failed more than maxRequestRetries + 1 times.
-    failedRequestHandler({ request }) {
-        log.debug(`Request ${request.url} failed twice.`);
+        console.log(result.innerHTML); // 2
     },
 });
 
-// Run the crawler and wait for it to finish.
 await crawler.run([
-    'https://crawlee.dev',
+    'https://ahfarmer.github.io/calculator/',
 ]);
-
-log.debug('Crawler finished.');
3 changes: 2 additions & 1 deletion package-lock.json
2 changes: 2 additions & 0 deletions packages/browser-pool/src/abstract-classes/browser-plugin.ts
@@ -13,6 +13,8 @@ import type { UnwrapPromise } from '../utils';
  * - without using a fingerprint,
  * - without specifying a user agent.
  * Last updated on 2022-05-05.
+ *
+ * After you update it here, please update it also in jsdom-crawler.ts
  */
 export const DEFAULT_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36';
 
3 changes: 2 additions & 1 deletion packages/jsdom-crawler/package.json
@@ -57,6 +57,7 @@
         "@crawlee/http": "^3.1.1",
         "@crawlee/types": "^3.1.1",
         "@types/jsdom": "^20.0.0",
-        "jsdom": "^20.0.0"
+        "jsdom": "^20.0.0",
+        "ow": "^0.28.2"
     }
 }
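The new `ow` dependency backs runtime validation of the added option (see `optionsShape` in the crawler source further down). A minimal sketch of that validation pattern, under the assumption that the shape is checked with `ow` at construction time; this is illustrative, not the crawler's actual code:

```ts
import ow from 'ow';

// Hypothetical shape mirroring the crawler's `optionsShape` entry for the new option.
const optionsShape = {
    runScripts: ow.optional.boolean,
};

// Passes: `runScripts` is a boolean (or omitted entirely).
ow({ runScripts: true }, ow.object.partialShape(optionsShape));

// Throws: `runScripts` is not a boolean.
try {
    ow({ runScripts: 'yes' }, ow.object.partialShape(optionsShape));
} catch (error) {
    console.log((error as Error).message);
}
```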
55 changes: 53 additions & 2 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -1,3 +1,4 @@
+import ow from 'ow';
 import type {
     HttpCrawlerOptions,
     InternalHttpCrawlingContext,
@@ -6,12 +7,13 @@ import type {
     RequestHandler,
     EnqueueLinksOptions,
     RequestQueue,
+    Configuration,
 } from '@crawlee/http';
 import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering } from '@crawlee/http';
 import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
 import { concatStreamToBuffer } from '@apify/utilities';
 import type { DOMWindow } from 'jsdom';
-import { JSDOM } from 'jsdom';
+import { JSDOM, ResourceLoader } from 'jsdom';
 import type { IncomingMessage } from 'http';
 
 export type JSDOMErrorHandler<
@@ -22,7 +24,12 @@ export type JSDOMErrorHandler<
 export interface JSDOMCrawlerOptions<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-> extends HttpCrawlerOptions<JSDOMCrawlingContext<UserData, JSONData>> {}
+> extends HttpCrawlerOptions<JSDOMCrawlingContext<UserData, JSONData>> {
+    /**
+     * Download and run scripts.
+     */
+    runScripts?: boolean;
+}
 
 export type JSDOMHook<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
@@ -54,6 +61,12 @@ export type JSDOMRequestHandler<
  * to display the content, you might need to use {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead,
  * because it loads the pages using full-featured headless Chrome browser.
  *
+ * Alternatively, you can use {@apilink JSDOMCrawlerOptions.runScripts} to run website scripts in Node.
+ * JSDOM does not implement all web standards, so some websites may break.
+ *
+ * **Limitation**:
+ * This crawler does not support proxies and cookies yet (each page open starts with an empty cookie store), and the user agent is always set to `Chrome`.
+ *
  * `JSDOMCrawler` downloads each URL using a plain HTTP request,
  * parses the HTML content using [JSDOM](https://www.npmjs.com/package/jsdom)
  * and then invokes the user-provided {@apilink JSDOMCrawlerOptions.requestHandler} to extract page data
@@ -110,15 +123,53 @@
  * ```
  * @category Crawlers
  */
+const resources = new ResourceLoader({
+    // Copy from /packages/browser-pool/src/abstract-classes/browser-plugin.ts:17
+    // in order not to include the entire package here
+    userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
+});
+
 export class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
+    protected static override optionsShape = {
+        ...HttpCrawler.optionsShape,
+        runScripts: ow.optional.boolean,
+    };
+
+    protected runScripts: boolean;
+
+    constructor(options: JSDOMCrawlerOptions = {}, config?: Configuration) {
+        const {
+            runScripts = false,
+            ...httpOptions
+        } = options;
+
+        super(httpOptions, config);
+
+        this.runScripts = runScripts;
+    }
+
+    protected override async _cleanupContext(context: JSDOMCrawlingContext) {
+        context.window?.close();
+    }
+
     protected override async _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: JSDOMCrawlingContext) {
         const body = await concatStreamToBuffer(response);
 
         const { window } = new JSDOM(body, {
             url: response.url,
             contentType: isXml ? 'text/xml' : 'text/html',
+            runScripts: this.runScripts ? 'dangerously' : undefined,
+            resources,
         });
+
+        if (this.runScripts) {
+            await new Promise<void>((resolve) => {
+                window.addEventListener('load', () => {
+                    resolve();
+                });
+            });
+        }
 
         return {
             window,
             get body() {
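To summarize the change above: when `runScripts` is enabled, the crawler asks jsdom to execute page scripts "dangerously", fetches sub-resources through a shared `ResourceLoader` configured with a Chrome user agent, and waits for the window `load` event before handing the DOM to the request handler. A rough standalone sketch of that jsdom pattern, not the crawler's exact code:

```ts
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader } from 'jsdom';

// Shared loader so external scripts and styles are fetched with a browser-like user agent.
const resources = new ResourceLoader({
    userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
});

async function parseWithScripts(html: string, url: string): Promise<DOMWindow> {
    const dom = new JSDOM(html, {
        url,
        contentType: 'text/html',
        runScripts: 'dangerously', // execute inline and external scripts
        resources,
    });

    // Give the page's scripts a chance to run before the DOM is used.
    await new Promise<void>((resolve) => {
        dom.window.addEventListener('load', () => resolve());
    });

    return dom.window;
}
```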
11 changes: 11 additions & 0 deletions test/e2e/jsdom-react-ts/actor/.gitignore
@@ -0,0 +1,11 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage
main.d.ts
main.d.ts.map
main.js
main.js.map
28 changes: 28 additions & 0 deletions test/e2e/jsdom-react-ts/actor/Dockerfile
@@ -0,0 +1,28 @@
# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:16-beta AS builder

# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
&& npm run build

# create final image
FROM apify/actor-node:16-beta
# copy only necessary files
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# install only prod deps
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& npm update \
&& echo "Installed NPM packages:" \
&& (npm list --only=prod --no-optional --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version

# run compiled code
CMD npm run start:prod
6 changes: 6 additions & 0 deletions test/e2e/jsdom-react-ts/actor/apify.json
@@ -0,0 +1,6 @@
{
"name": "test-jsdom-react-ts",
"version": "0.0",
"buildTag": "latest",
"env": null
}
31 changes: 31 additions & 0 deletions test/e2e/jsdom-react-ts/actor/main.ts
@@ -0,0 +1,31 @@
import { Actor } from 'apify';
import { JSDOMCrawler, Dataset } from '@crawlee/jsdom';
import { ApifyStorageLocal } from '@apify/storage-local';

if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
await Actor.init({ storage: new ApifyStorageLocal() });
} else {
await Actor.init();
}

const crawler = new JSDOMCrawler({
runScripts: true,
requestHandler: async ({ window }) => {
const { document } = window;
document.querySelectorAll('button')[12].click(); // 1
document.querySelectorAll('button')[15].click(); // +
document.querySelectorAll('button')[12].click(); // 1
document.querySelectorAll('button')[18].click(); // =

// 2
const { innerHTML } = document.querySelectorAll('.component-display')[0].childNodes[0] as Element;

await Dataset.pushData({ result: innerHTML });
},
});

await crawler.run([
'https://ahfarmer.github.io/calculator/',
]);

await Actor.exit({ exit: Actor.isAtHome() });
35 changes: 35 additions & 0 deletions test/e2e/jsdom-react-ts/actor/package.json
@@ -0,0 +1,35 @@
{
"name": "test-jsdom-react-ts",
"version": "0.0.1",
"description": "JSDOM Crawler Test - React - TypeScript",
"dependencies": {
"apify": "next",
"@apify/storage-local": "^2.1.0",
"@crawlee/basic": "file:./packages/basic-crawler",
"@crawlee/browser-pool": "file:./packages/browser-pool",
"@crawlee/http": "file:./packages/http-crawler",
"@crawlee/jsdom": "file:./packages/jsdom-crawler",
"@crawlee/core": "file:./packages/core",
"@crawlee/memory-storage": "file:./packages/memory-storage",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
},
"overrides": {
"apify": {
"@crawlee/core": "file:./packages/core",
"@crawlee/types": "file:./packages/types",
"@crawlee/utils": "file:./packages/utils"
}
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"typescript": "4.7.4"
},
"scripts": {
"start": "tsc && node main.js",
"start:prod": "node main.js",
"build": "tsc"
},
"type": "module",
"license": "ISC"
}
11 changes: 11 additions & 0 deletions test/e2e/jsdom-react-ts/actor/tsconfig.json
@@ -0,0 +1,11 @@
{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "ES2022",
"target": "ES2022",
"lib": ["DOM"]
},
"include": [
"./**/*.ts"
]
}
11 changes: 11 additions & 0 deletions test/e2e/jsdom-react-ts/test.mjs
@@ -0,0 +1,11 @@
import { initialize, getActorTestDir, runActor, expect, validateDataset } from '../tools.mjs';

const testActorDirname = getActorTestDir(import.meta.url);
await initialize(testActorDirname);

const { stats, datasetItems } = await runActor(testActorDirname);

await expect(stats.requestsFinished === 1, 'All requests finished');
await expect(datasetItems.length === 1, 'Number of dataset items');
await expect(validateDataset(datasetItems, ['result']), 'Dataset items validation');
await expect(datasetItems[0].result, 'Dataset items');
