-
Notifications
You must be signed in to change notification settings - Fork 664
/
cheerio-crawler.ts
269 lines (250 loc) · 10.9 KB
/
cheerio-crawler.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import type { IncomingMessage } from 'http';
import type {
EnqueueLinksOptions,
ErrorHandler,
GetUserDataFromRequest,
HttpCrawlerOptions,
InternalHttpCrawlingContext,
InternalHttpHook,
RequestHandler,
RouterRoutes,
Configuration,
RequestProvider,
} from '@crawlee/http';
import {
HttpCrawler,
enqueueLinks,
Router,
resolveBaseUrlForEnqueueLinksFiltering,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { extractUrlsFromCheerio } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler } from 'htmlparser2';
import { WritableStream } from 'htmlparser2/lib/WritableStream';
/**
 * Error handler type for {@apilink CheerioCrawler}: an {@apilink ErrorHandler}
 * bound to {@apilink CheerioCrawlingContext}.
 */
export type CheerioErrorHandler<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
> = ErrorHandler<CheerioCrawlingContext<UserData, JSONData>>;
/**
 * Options for {@apilink CheerioCrawler} — the full set of {@apilink HttpCrawlerOptions},
 * typed against {@apilink CheerioCrawlingContext}.
 */
export interface CheerioCrawlerOptions<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
> extends HttpCrawlerOptions<CheerioCrawlingContext<UserData, JSONData>> {}
/**
 * Hook function type for {@apilink CheerioCrawler}: an {@apilink InternalHttpHook}
 * bound to {@apilink CheerioCrawlingContext}.
 */
export type CheerioHook<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
> = InternalHttpHook<CheerioCrawlingContext<UserData, JSONData>>;
/**
 * Crawling context passed to the request handler of {@apilink CheerioCrawler}.
 * Extends the HTTP crawling context with a parsed Cheerio handle (`$`) and the
 * `parseWithCheerio()` accessor shared across crawler types.
 */
export interface CheerioCrawlingContext<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
> extends InternalHttpCrawlingContext<UserData, JSONData, CheerioCrawler> {
    /**
     * The [Cheerio](https://cheerio.js.org/) object with parsed HTML.
     * Cheerio is available only for HTML and XML content types.
     */
    $: cheerio.CheerioAPI;

    /**
     * Returns Cheerio handle, this is here to unify the crawler API, so they all have this handy method.
     * It has the same return type as the `$` context property, use it only if you are abstracting your workflow to
     * support different context types in one handler.
     *
     * **Example usage:**
     * ```javascript
     * async requestHandler({ parseWithCheerio }) {
     *     const $ = await parseWithCheerio();
     *     const title = $('title').text();
     * });
     * ```
     */
    parseWithCheerio(): Promise<cheerio.CheerioAPI>;
}
/**
 * Request handler type for {@apilink CheerioCrawler}: a {@apilink RequestHandler}
 * bound to {@apilink CheerioCrawlingContext}.
 */
export type CheerioRequestHandler<
    UserData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
    JSONData extends Dictionary = any, // with default to Dictionary we can't use a typed router in untyped crawler
> = RequestHandler<CheerioCrawlingContext<UserData, JSONData>>;
/**
* Provides a framework for the parallel crawling of web pages using plain HTTP requests and
* [cheerio](https://www.npmjs.com/package/cheerio) HTML parser.
* The URLs to crawl are fed either from a static list of URLs
* or from a dynamic queue of URLs enabling recursive crawling of websites.
*
* Since `CheerioCrawler` uses raw HTTP requests to download web pages,
* it is very fast and efficient on data bandwidth. However, if the target website requires JavaScript
* to display the content, you might need to use {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler} instead,
 * because those crawlers load the pages using a full-featured headless Chrome browser.
*
* `CheerioCrawler` downloads each URL using a plain HTTP request,
* parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio)
* and then invokes the user-provided {@apilink CheerioCrawlerOptions.requestHandler} to extract page data
* using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM.
*
* The source URLs are represented using {@apilink Request} objects that are fed from
* {@apilink RequestList} or {@apilink RequestQueue} instances provided by the {@apilink CheerioCrawlerOptions.requestList}
* or {@apilink CheerioCrawlerOptions.requestQueue} constructor options, respectively.
*
* If both {@apilink CheerioCrawlerOptions.requestList} and {@apilink CheerioCrawlerOptions.requestQueue} are used,
* the instance first processes URLs from the {@apilink RequestList} and automatically enqueues all of them
* to {@apilink RequestQueue} before it starts their processing. This ensures that a single URL is not crawled multiple times.
*
* The crawler finishes when there are no more {@apilink Request} objects to crawl.
*
* We can use the `preNavigationHooks` to adjust `gotOptions`:
*
* ```
* preNavigationHooks: [
* (crawlingContext, gotOptions) => {
* // ...
* },
* ]
* ```
*
* By default, `CheerioCrawler` only processes web pages with the `text/html`
* and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
* and skips pages with other content types. If you want the crawler to process other content types,
* use the {@apilink CheerioCrawlerOptions.additionalMimeTypes} constructor option.
* Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
* For more details, see {@apilink CheerioCrawlerOptions.requestHandler}.
*
* New requests are only dispatched when there is enough free CPU and memory available,
* using the functionality provided by the {@apilink AutoscaledPool} class.
* All {@apilink AutoscaledPool} configuration options can be passed to the `autoscaledPoolOptions`
* parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency`
* {@apilink AutoscaledPool} options are available directly in the `CheerioCrawler` constructor.
*
* **Example usage:**
*
* ```javascript
* const crawler = new CheerioCrawler({
* async requestHandler({ request, response, body, contentType, $ }) {
* const data = [];
*
* // Do some data extraction from the page with Cheerio.
* $('.some-collection').each((index, el) => {
* data.push({ title: $(el).find('.some-title').text() });
* });
*
* // Save the data to dataset.
* await Dataset.pushData({
* url: request.url,
* html: body,
* data,
* })
* },
* });
*
* await crawler.run([
* 'http://www.example.com/page-1',
* 'http://www.example.com/page-2',
* ]);
* ```
* @category Crawlers
*/
export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
    /**
     * All `CheerioCrawler` parameters are passed via an options object.
     */
    // eslint-disable-next-line @typescript-eslint/no-useless-constructor
    constructor(options?: CheerioCrawlerOptions, config?: Configuration) {
        super(options, config);
    }

    /**
     * Parses the response stream into a DOM, loads it into Cheerio and returns
     * the context pieces derived from it (`dom`, `$`, lazy `body`, `enqueueLinks`).
     */
    protected override async _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: CheerioCrawlingContext) {
        const dom = await this._parseHtmlToDom(response, isXml);
        const $ = cheerio.load(dom as string, {
            xmlMode: isXml,
            // Recent versions of cheerio use parse5 as the HTML parser/serializer. It's more strict than htmlparser2
            // and not good for scraping. It also does not have a great streaming interface.
            // Here we tell cheerio to use htmlparser2 for serialization, otherwise the conflict produces weird errors.
            _useHtmlParser2: true,
        } as CheerioOptions);

        return {
            dom,
            $,
            // Lazy getter: the (potentially large) string is only serialized when a handler reads `body`.
            // `$` is a const initialized above, so no non-null assertion is needed here.
            get body() {
                return isXml ? $.xml() : $.html({ decodeEntities: false });
            },
            enqueueLinks: async (enqueueOptions?: EnqueueLinksOptions) => {
                return cheerioCrawlerEnqueueLinks({
                    options: enqueueOptions,
                    $,
                    requestQueue: await this.getRequestQueue(),
                    originalRequestUrl: crawlingContext.request.url,
                    finalRequestUrl: crawlingContext.request.loadedUrl,
                });
            },
        };
    }

    /**
     * Streams the response through htmlparser2's `WritableStream` into a DOM.
     * Rejects on parser errors as well as on response stream errors.
     */
    protected async _parseHtmlToDom(response: IncomingMessage, isXml: boolean) {
        return new Promise((resolve, reject) => {
            const domHandler = new DomHandler((err, dom) => {
                if (err) reject(err);
                else resolve(dom);
            }, { xmlMode: isXml });
            const parser = new WritableStream(domHandler, { decodeEntities: true, xmlMode: isXml });
            parser.on('error', reject);
            response
                .on('error', reject)
                .pipe(parser);
        });
    }

    protected override async _runRequestHandler(context: CheerioCrawlingContext) {
        // `$` is already parsed eagerly in `_parseHTML`; this accessor only exists for API parity
        // with the browser crawlers. An `async` arrow already wraps its result in a promise,
        // so the previous `Promise.resolve(...)` wrapper was redundant.
        context.parseWithCheerio = async () => context.$;
        await super._runRequestHandler(context);
    }
}
/**
 * Arguments of {@apilink cheerioCrawlerEnqueueLinks}.
 */
interface EnqueueLinksInternalOptions {
    /** User-provided `enqueueLinks` options forwarded from the crawling context. */
    options?: EnqueueLinksOptions;
    /** Cheerio handle with the parsed page, or `null` when no DOM is available. */
    $: cheerio.CheerioAPI | null;
    /** Queue that the discovered requests are enqueued to. */
    requestQueue: RequestProvider;
    /** URL of the request as originally enqueued (`request.url`). */
    originalRequestUrl: string;
    /** URL the request actually loaded (`request.loadedUrl`), if available. */
    finalRequestUrl?: string;
}
/** @internal */
export async function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, originalRequestUrl, finalRequestUrl }: EnqueueLinksInternalOptions) {
    // Without a parsed DOM there is nothing to extract links from.
    if (!$) {
        throw new Error('Cannot enqueue links because the DOM is not available.');
    }

    // Base URL used for filtering the discovered links according to the enqueue strategy.
    const baseUrl = resolveBaseUrlForEnqueueLinksFiltering({
        enqueueStrategy: options?.strategy,
        finalRequestUrl,
        originalRequestUrl,
        userProvidedBaseUrl: options?.baseUrl,
    });

    // Resolve relative hrefs against the user-provided base URL, falling back to the
    // loaded URL and finally the original request URL.
    const selector = options?.selector ?? 'a';
    const resolutionBase = options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl;
    const urls = extractUrlsFromCheerio($, selector, resolutionBase);

    // User options are spread last so they can override the defaults above.
    return enqueueLinks({
        requestQueue,
        urls,
        baseUrl,
        ...options,
    });
}
/**
* Creates new {@apilink Router} instance that works based on request labels.
* This instance can then serve as a `requestHandler` of your {@apilink CheerioCrawler}.
* Defaults to the {@apilink CheerioCrawlingContext}.
*
* > Serves as a shortcut for using `Router.create<CheerioCrawlingContext>()`.
*
* ```ts
* import { CheerioCrawler, createCheerioRouter } from 'crawlee';
*
* const router = createCheerioRouter();
* router.addHandler('label-a', async (ctx) => {
* ctx.log.info('...');
* });
* router.addDefaultHandler(async (ctx) => {
* ctx.log.info('...');
* });
*
* const crawler = new CheerioCrawler({
* requestHandler: router,
* });
* await crawler.run();
* ```
*/
export function createCheerioRouter<
    Context extends CheerioCrawlingContext = CheerioCrawlingContext,
    UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
>(routes?: RouterRoutes<Context, UserData>) {
    // Thin wrapper that pins the router's context type to `CheerioCrawlingContext`.
    return Router.create<Context>(routes);
}