// crawler_commons.ts
import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
import type { Response as GotResponse, OptionsInit } from 'got-scraping';
import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links';
import type { Log } from '../log';
import type { ProxyInfo } from '../proxy_configuration';
import type { Request } from '../request';
import type { Session } from '../session_pool/session';
import { type Dataset } from '../storages';
/**
 * The crawling context available to request handlers (see the `requestHandler`
 * examples below). It bundles the current {@apilink Request}, the optional
 * {@apilink Session} and {@apilink ProxyInfo}, a logger, the owning crawler
 * instance, and helper functions (`enqueueLinks`, `pushData`, `sendRequest`).
 *
 * The `Record<string & {}, unknown>` base allows crawler implementations to
 * attach additional, implementation-specific properties to the context.
 *
 * @template Crawler The concrete crawler class exposed via {@apilink CrawlingContext.crawler}.
 * @template UserData Shape of `request.userData`.
 */
// eslint-disable-next-line @typescript-eslint/ban-types
export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary = Dictionary> extends Record<string & {}, unknown> {
    /**
     * Identifier of this crawling context.
     * NOTE(review): presumably unique per processed request, useful for correlating
     * logs/events — confirm against the crawler implementation that creates it.
     */
    id: string;
    /**
     * The original {@apilink Request} object.
     */
    request: Request<UserData>;
    /**
     * The {@apilink Session} assigned to this request, if session management is enabled.
     */
    session?: Session;
    /**
     * An object with information about the proxy currently used by the crawler,
     * as configured by the {@apilink ProxyConfiguration} class.
     */
    proxyInfo?: ProxyInfo;
    /**
     * A {@apilink Log} instance for writing log messages from the handler.
     */
    log: Log;
    /**
     * The crawler instance that is processing this request.
     */
    crawler: Crawler;
    /**
     * This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue}
     * currently used by the crawler.
     *
     * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
     * and override settings of the enqueued {@apilink Request} objects.
     *
     * Check out the [Crawl a website with relative links](https://crawlee.dev/docs/examples/crawl-relative-links) example
     * for more details regarding its usage.
     *
     * **Example usage**
     *
     * ```ts
     * async requestHandler({ enqueueLinks }) {
     *     await enqueueLinks({
     *         globs: [
     *             'https://www.example.com/handbags/*',
     *         ],
     *     });
     * },
     * ```
     *
     * @param [options] All `enqueueLinks()` parameters are passed via an options object.
     * @returns Promise that resolves to {@apilink BatchAddRequestsResult} object.
     */
    enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
    /**
     * This function allows you to push data to the default {@apilink Dataset} currently used by the crawler.
     *
     * Shortcut for `crawler.pushData()`.
     *
     * @param [data] Data to be pushed to the default dataset.
     */
    pushData(...args: Parameters<Dataset['pushData']>): Promise<void>;
    /**
     * Fires HTTP request via [`got-scraping`](https://crawlee.dev/docs/guides/got-scraping), allowing to override the request
     * options on the fly.
     *
     * This is handy when you work with a browser crawler but want to execute some requests outside it (e.g. API requests).
     * Check the [Skipping navigations for certain requests](https://crawlee.dev/docs/examples/skip-navigation) example for
     * more detailed explanation of how to do that.
     *
     * ```ts
     * async requestHandler({ sendRequest }) {
     *     const { body } = await sendRequest({
     *         // override headers only
     *         headers: { ... },
     *     });
     * },
     * ```
     *
     * @template Response Expected type of the response body (defaults to `string`).
     * @param [overrideOptions] `got-scraping` options merged over the defaults for this request.
     */
    sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
}