-
Notifications
You must be signed in to change notification settings - Fork 563
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(jsdom-crawler): add runScripts option (#1668)
- Loading branch information
Showing
13 changed files
with
206 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,64 +1,20 @@ | ||
import { Dataset, JSDOMCrawler, log, LogLevel } from 'crawlee'; | ||
import { JSDOMCrawler } from '@crawlee/jsdom'; | ||
|
||
// Crawlers come with various utilities, e.g. for logging. | ||
// Here we use debug level of logging to improve the debugging experience. | ||
// This functionality is optional! | ||
log.setLevel(LogLevel.DEBUG); | ||
|
||
// Create an instance of the JSDOMCrawler class - a crawler | ||
// that automatically loads the URLs and parses their HTML using the jsdom library. | ||
const crawler = new JSDOMCrawler({ | ||
// The crawler downloads and processes the web pages in parallel, with a concurrency | ||
// automatically managed based on the available system memory and CPU (see AutoscaledPool class). | ||
// Here we define some hard limits for the concurrency. | ||
minConcurrency: 10, | ||
maxConcurrency: 50, | ||
|
||
// On error, retry each page at most once. | ||
maxRequestRetries: 1, | ||
|
||
// Increase the timeout for processing of each page. | ||
requestHandlerTimeoutSecs: 30, | ||
|
||
// Limit to 10 requests per one crawl | ||
maxRequestsPerCrawl: 10, | ||
runScripts: true, | ||
requestHandler: async ({ window }) => { | ||
const { document } = window; | ||
document.querySelectorAll('button')[12].click(); // 1 | ||
document.querySelectorAll('button')[15].click(); // + | ||
document.querySelectorAll('button')[12].click(); // 1 | ||
document.querySelectorAll('button')[18].click(); // = | ||
|
||
// This function will be called for each URL to crawl. | ||
// It accepts a single parameter, an object whose properties are described at: | ||
// https://crawlee.dev/api/jsdom-crawler/interface/JSDOMCrawlerOptions#requestHandler | ||
// For demonstration, we use only 2 of them: | ||
// - request: an instance of the Request class with information such as the URL that is being crawled and HTTP method | ||
// - window: the window object of the parsed page; its document is queried below | ||
async requestHandler({ request, window }) { | ||
log.debug(`Processing ${request.url}...`); | ||
const result = document.querySelectorAll('.component-display')[0].childNodes[0] as Element; | ||
|
||
// Extract data from the page | ||
const title = window.document.title; | ||
const h1texts: { text: string }[] = []; | ||
document.querySelectorAll('h1').forEach((element) => { | ||
h1texts.push({ | ||
text: element.textContent!, | ||
}); | ||
}); | ||
|
||
// Store the results to the dataset. In local configuration, | ||
// the data will be stored as JSON files in ./storage/datasets/default | ||
await Dataset.pushData({ | ||
url: request.url, | ||
title, | ||
h1texts, | ||
}); | ||
}, | ||
|
||
// This function is called once the page processing has failed maxRequestRetries + 1 times (the initial attempt plus all retries). | ||
failedRequestHandler({ request }) { | ||
log.debug(`Request ${request.url} failed twice.`); | ||
console.log(result.innerHTML); // 2 | ||
}, | ||
}); | ||
|
||
// Run the crawler and wait for it to finish. | ||
await crawler.run([ | ||
'https://crawlee.dev', | ||
'https://ahfarmer.github.io/calculator/', | ||
]); | ||
|
||
log.debug('Crawler finished.'); |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
.idea | ||
.DS_Store | ||
node_modules | ||
package-lock.json | ||
apify_storage | ||
crawlee_storage | ||
storage | ||
main.d.ts | ||
main.d.ts.map | ||
main.js | ||
main.js.map |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# using multistage build, as we need dev deps to build the TS source code | ||
FROM apify/actor-node:16-beta AS builder | ||
|
||
# copy all files, install all dependencies (including dev deps) and build the project | ||
COPY . ./ | ||
RUN npm install --include=dev \ | ||
&& npm run build | ||
|
||
# create final image | ||
FROM apify/actor-node:16-beta | ||
# copy only necessary files | ||
COPY --from=builder /usr/src/app/packages ./packages | ||
COPY --from=builder /usr/src/app/package.json ./ | ||
COPY --from=builder /usr/src/app/main.js ./ | ||
|
||
# install only prod deps | ||
RUN npm --quiet set progress=false \ | ||
&& npm install --only=prod --no-optional \ | ||
&& npm update \ | ||
&& echo "Installed NPM packages:" \ | ||
&& (npm list --only=prod --no-optional --all || true) \ | ||
&& echo "Node.js version:" \ | ||
&& node --version \ | ||
&& echo "NPM version:" \ | ||
&& npm --version | ||
|
||
# run compiled code | ||
CMD npm run start:prod |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"name": "test-jsdom-react-ts", | ||
"version": "0.0", | ||
"buildTag": "latest", | ||
"env": null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import { Actor } from 'apify'; | ||
import { JSDOMCrawler, Dataset } from '@crawlee/jsdom'; | ||
import { ApifyStorageLocal } from '@apify/storage-local'; | ||
|
||
// Initialize the Actor before any crawling happens. | ||
// When the STORAGE_IMPLEMENTATION environment variable equals 'LOCAL', a new | ||
// ApifyStorageLocal instance is passed as the `storage` option; in every other | ||
// case Actor.init() is called without options. | ||
if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') { | ||
await Actor.init({ storage: new ApifyStorageLocal() }); | ||
} else { | ||
await Actor.init(); | ||
} | ||
|
||
// Crawler under test: a JSDOMCrawler created with the `runScripts` option enabled. | ||
// NOTE(review): presumably `runScripts` lets the page's own scripts execute, so that | ||
// the clicks below actually update the calculator display - confirm against the | ||
// JSDOMCrawler documentation. | ||
const crawler = new JSDOMCrawler({ | ||
runScripts: true, | ||
requestHandler: async ({ window }) => { | ||
const { document } = window; | ||
// Click buttons by their fixed position in the list of all <button> elements. | ||
// The trailing comment on each line names the calculator key that index is | ||
// expected to be, so the sequence entered is: 1 + 1 = | ||
document.querySelectorAll('button')[12].click(); // 1 | ||
document.querySelectorAll('button')[15].click(); // + | ||
document.querySelectorAll('button')[12].click(); // 1 | ||
document.querySelectorAll('button')[18].click(); // = | ||
|
||
// Read the markup of the first child node of the first `.component-display` | ||
// element; after the clicks above it is expected to contain: 2 | ||
const { innerHTML } = document.querySelectorAll('.component-display')[0].childNodes[0] as Element; | ||
|
||
// Store the displayed value as the `result` field of a single dataset item. | ||
await Dataset.pushData({ result: innerHTML }); | ||
}, | ||
}); | ||
|
||
// Crawl the single calculator page and wait for the run to finish. | ||
await crawler.run([ | ||
'https://ahfarmer.github.io/calculator/', | ||
]); | ||
|
||
// Shut the Actor down; the `exit` option is taken from Actor.isAtHome(). | ||
// NOTE(review): presumably this forces a process exit only when running on the | ||
// Apify platform - confirm against the Actor.exit documentation. | ||
await Actor.exit({ exit: Actor.isAtHome() }); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
{ | ||
"name": "test-jsdom-react-ts", | ||
"version": "0.0.1", | ||
"description": "JSDOM Crawler Test - React - TypeScript", | ||
"dependencies": { | ||
"apify": "next", | ||
"@apify/storage-local": "^2.1.0", | ||
"@crawlee/basic": "file:./packages/basic-crawler", | ||
"@crawlee/browser-pool": "file:./packages/browser-pool", | ||
"@crawlee/http": "file:./packages/http-crawler", | ||
"@crawlee/jsdom": "file:./packages/jsdom-crawler", | ||
"@crawlee/core": "file:./packages/core", | ||
"@crawlee/memory-storage": "file:./packages/memory-storage", | ||
"@crawlee/types": "file:./packages/types", | ||
"@crawlee/utils": "file:./packages/utils" | ||
}, | ||
"overrides": { | ||
"apify": { | ||
"@crawlee/core": "file:./packages/core", | ||
"@crawlee/types": "file:./packages/types", | ||
"@crawlee/utils": "file:./packages/utils" | ||
} | ||
}, | ||
"devDependencies": { | ||
"@apify/tsconfig": "^0.1.0", | ||
"typescript": "4.7.4" | ||
}, | ||
"scripts": { | ||
"start": "tsc && node main.js", | ||
"start:prod": "node main.js", | ||
"build": "tsc" | ||
}, | ||
"type": "module", | ||
"license": "ISC" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{ | ||
"extends": "@apify/tsconfig", | ||
"compilerOptions": { | ||
"module": "ES2022", | ||
"target": "ES2022", | ||
"lib": ["DOM"] | ||
}, | ||
"include": [ | ||
"./**/*.ts" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
import { initialize, getActorTestDir, runActor, expect, validateDataset } from '../tools.mjs'; | ||
|
||
const testActorDirname = getActorTestDir(import.meta.url); | ||
await initialize(testActorDirname); | ||
|
||
const { stats, datasetItems } = await runActor(testActorDirname); | ||
|
||
await expect(stats.requestsFinished === 1, 'All requests finished'); | ||
await expect(datasetItems.length === 1, 'Number of dataset items'); | ||
await expect(validateDataset(datasetItems, ['result']), 'Dataset items validation'); | ||
await expect(datasetItems[0].result, 'Dataset items'); |