-
Notifications
You must be signed in to change notification settings - Fork 571
/
enqueue_links.js
173 lines (166 loc) · 7.44 KB
/
enqueue_links.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import { URL } from 'url';
import log from 'apify-shared/log';
import { checkParamOrThrow } from 'apify-client/build/utils';
import { checkParamPrototypeOrThrow } from 'apify-shared/utilities';
import { RequestQueue, RequestQueueLocal } from '../request_queue';
import { constructPseudoUrlInstances, createRequests, addRequestsToQueueInBatches, createRequestOptions } from './shared';
/**
* The function finds elements matching a specific CSS selector (HTML anchor (`<a>`) by default)
* either in a Puppeteer page, or in a Cheerio object (parsed HTML),
* and enqueues the URLs in their `href` attributes to the provided {@link RequestQueue}.
* If you're looking to find URLs in JavaScript heavy pages where links are not available
* in `href` attributes, but rather navigations are triggered in click handlers,
* see [`enqueueLinksByClickingElements()`](puppeteer#puppeteer.enqueueLinksByClickingElements).
*
* Optionally, the function allows you to filter the target links' URLs using an array of {@link PseudoUrl} objects
* and override settings of the enqueued {@link Request} objects.
*
* **Example usage**
*
* ```javascript
* const Apify = require('apify');
*
* const browser = await Apify.launchPuppeteer();
* const page = await browser.newPage();
* await page.goto('https://www.example.com');
* const requestQueue = await Apify.openRequestQueue();
*
* await Apify.utils.enqueueLinks({
* page,
* requestQueue,
* selector: 'a.product-detail',
* pseudoUrls: [
* 'https://www.example.com/handbags/[.*]',
* 'https://www.example.com/purses/[.*]'
* ],
* });
* ```
*
* @param {Object} options
* All `enqueueLinks()` parameters are passed
* via an options object with the following keys:
* @param {Page} options.page
* Puppeteer <a href="https://pptr.dev/#?product=Puppeteer&show=api-class-page" target="_blank"><code>Page</code></a> object.
* Either `page` or `$` option must be provided.
* @param {Cheerio} options.$
* <a href="https://github.com/cheeriojs/cheerio" target="_blank"><code>Cheerio</code></a> object with loaded HTML.
* Either `page` or `$` option must be provided.
* @param {RequestQueue} options.requestQueue
* A request queue to which the URLs will be enqueued.
* @param {String} [options.selector='a']
* A CSS selector matching links to be enqueued.
* @param {string} [options.baseUrl]
* A base URL that will be used to resolve relative URLs when using Cheerio. Ignored when using Puppeteer,
* since the relative URL resolution is done inside the browser automatically.
* @param {Object[]|String[]} [options.pseudoUrls]
* An array of {@link PseudoUrl}s matching the URLs to be enqueued,
* or an array of strings or RegExps or plain Objects from which the {@link PseudoUrl}s can be constructed.
*
* The plain objects must include at least the `purl` property, which holds the pseudo-URL string or RegExp.
* All remaining keys will be used as the `requestTemplate` argument of the {@link PseudoUrl} constructor,
* which lets you specify special properties for the enqueued {@link Request} objects.
*
* If `pseudoUrls` is an empty array, `null` or `undefined`, then the function
* enqueues all links found on the page.
* @param {Function} [options.transformRequestFunction]
* **Signature:** ({@link Request}): {@link Request}
*
* Just before a new {@link Request} is constructed and enqueued to the {@link RequestQueue}, this function can be used
* to remove it or modify its contents such as `userData`, `payload` or, most importantly, `uniqueKey`. This is useful
* when you need to enqueue multiple `Requests` to the queue that share the same URL, but differ in methods or payloads,
* or to dynamically update or create `userData`.
* To remove (skip) a request, return a falsy value (e.g. `null`) from the function.
*
* For example: by adding `keepUrlFragment: true` to the `request` object, URL fragments will not be removed
* when `uniqueKey` is computed.
*
* **Example:**
* ```javascript
* {
* transformRequestFunction: (request) => {
* request.userData.foo = 'bar';
* request.keepUrlFragment = true;
* return request;
* }
* }
* ```
* @return {Promise<QueueOperationInfo[]>}
* Promise that resolves to an array of {@link QueueOperationInfo} objects.
* @memberOf utils
* @name enqueueLinks
*/
export async function enqueueLinks(options = {}) {
    const {
        page,
        $,
        selector = 'a',
        requestQueue,
        baseUrl,
        pseudoUrls,
        userData, // TODO DEPRECATED 2019/06/27
        transformRequestFunction,
    } = options;

    if (userData) {
        log.deprecated('options.userData of Apify.utils.enqueueLinks() is deprecated. Use options.transformRequestFunction instead.');
    }

    // Exactly one link source (Puppeteer page or Cheerio object) has to be supplied.
    checkParamOrThrow(page, 'page', 'Maybe Object');
    checkParamOrThrow($, '$', 'Maybe Function');
    if (!page && !$) {
        throw new Error('One of the parameters "options.page" or "options.$" must be provided!');
    }
    if (page && $) {
        throw new Error('Only one of the parameters "options.page" or "options.$" must be provided!');
    }

    checkParamOrThrow(selector, 'selector', 'String');
    checkParamPrototypeOrThrow(requestQueue, 'requestQueue', [RequestQueue, RequestQueueLocal], 'Apify.RequestQueue');
    checkParamOrThrow(baseUrl, 'baseUrl', 'Maybe String');
    if (baseUrl && page) log.warning('The parameter options.baseUrl can only be used when parsing a Cheerio object. It will be ignored.');
    checkParamOrThrow(pseudoUrls, 'pseudoUrls', 'Maybe Array');
    checkParamOrThrow(userData, 'userData', 'Maybe Object');
    checkParamOrThrow(transformRequestFunction, 'transformRequestFunction', 'Maybe Function');

    // Construct pseudoUrls from input where necessary.
    const pseudoUrlInstances = constructPseudoUrlInstances(pseudoUrls || []);

    const urls = page
        ? await extractUrlsFromPage(page, selector)
        : extractUrlsFromCheerio($, selector, baseUrl);

    let requestOptions = createRequestOptions(urls, userData);
    if (transformRequestFunction) {
        // Call the transform with the request only. Passing the function straight to
        // Array.prototype.map() would also hand it the index and the whole array, which
        // silently breaks callbacks that declare an optional second parameter.
        // Falsy results mean "skip this request" and are dropped.
        requestOptions = requestOptions
            .map((request) => transformRequestFunction(request))
            .filter(Boolean);
    }

    const requests = createRequests(requestOptions, pseudoUrlInstances);
    return addRequestsToQueueInBatches(requests, requestQueue);
}
/**
 * Collects the `href` values of all elements matching `selector` in a Puppeteer Page.
 * Elements whose `href` is empty or missing are left out.
 *
 * @param {Page} page
 *   Puppeteer page to query.
 * @param {string} selector
 *   CSS selector of the link elements.
 * @return {Promise<string[]>}
 *   Resolves to the non-empty `href` values, in document order.
 * @ignore
 */
export async function extractUrlsFromPage(page, selector) {
    // The callback is handed to page.$$eval(), so it is kept self-contained
    // (it only touches its own argument and never closes over outer variables).
    /* istanbul ignore next */
    const collectHrefs = (linkEls) => linkEls.reduce((hrefs, { href }) => {
        if (href) hrefs.push(href);
        return hrefs;
    }, []);

    return page.$$eval(selector, collectHrefs);
}
/**
 * Extracts URLs from a given Cheerio object.
 *
 * Reads the `href` attribute of every element matching `selector`, drops empty or
 * missing values and, when `baseUrl` is provided, resolves each value against it.
 * Without `baseUrl` the values are returned exactly as found.
 *
 * @param {Function} $
 *   Cheerio object with loaded HTML.
 * @param {string} selector
 *   CSS selector of the link elements.
 * @param {string} [baseUrl]
 *   Base URL used to resolve relative `href` values.
 * @return {string[]}
 * @throws {Error}
 *   If a relative URL is extracted and `baseUrl` is not set.
 * @ignore
 */
export function extractUrlsFromCheerio($, selector, baseUrl) {
    // Pattern grabbed from the 'is-absolute-url' package. The `i` flag matters: URL schemes
    // are case-insensitive, so `HTTP://example.com` is an absolute URL as well.
    const absoluteUrlRegex = /^[a-z][a-z0-9+.-]*:/i;

    return $(selector)
        .map((i, el) => $(el).attr('href'))
        .get()
        .filter((href) => !!href)
        .map((href) => {
            // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later.
            const isHrefAbsolute = absoluteUrlRegex.test(href);
            if (!isHrefAbsolute && !baseUrl) {
                throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. `
                    + 'Use options.baseUrl in utils.enqueueLinks() to automatically resolve relative URLs.');
            }
            return baseUrl
                ? (new URL(href, baseUrl)).href
                : href;
        });
}