-
Notifications
You must be signed in to change notification settings - Fork 571
/
request_list.js
677 lines (611 loc) · 27.2 KB
/
request_list.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
import { checkParamOrThrow } from 'apify-client/build/utils';
import log from 'apify-shared/log';
import _ from 'underscore';
import { ACTOR_EVENT_NAMES_EX } from './constants';
import Request from './request';
import events from './events';
import { getFirstKey, publicUtils } from './utils';
import { getValue, setValue } from './key_value_store';
// Suffix of the key-value store key under which a named list opened via `openRequestList()`
// persists its crawling state; the full key is `${listName}-${STATE_PERSISTENCE_KEY}`.
export const STATE_PERSISTENCE_KEY = 'REQUEST_LIST_STATE';
// Suffix of the key-value store key under which a named list opened via `openRequestList()`
// persists its sources; the full key is `${listName}-${SOURCES_PERSISTENCE_KEY}`.
export const SOURCES_PERSISTENCE_KEY = 'REQUEST_LIST_SOURCES';
/**
* Represents a static list of URLs to crawl.
* The URLs can be provided either in code or parsed from a text file hosted on the web.
*
* Each URL is represented using an instance of the {@link Request} class.
* The list can only contain unique URLs. More precisely, it can only contain `Request` instances
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
* To add a single URL to the list multiple times, corresponding {@link Request} objects will need to have different
* `uniqueKey` properties. You can use the `keepDuplicateUrls` option to do this for you when initializing the
* `RequestList` from sources.
*
* Once you create an instance of `RequestList`, you need to call the {@link RequestList#initialize} function
* before the instance can be used. After that, no more URLs can be added to the list.
*
* `RequestList` is used by {@link BasicCrawler}, {@link CheerioCrawler}
* and {@link PuppeteerCrawler} as a source of URLs to crawl.
* Unlike {@link RequestQueue}, `RequestList` is static but it can contain even millions of URLs.
*
* `RequestList` has an internal state where it stores information about which requests were already handled,
* which are in progress and which were reclaimed. The state may be automatically persisted to the default
* {@link KeyValueStore} by setting the `persistStateKey` option so that if the Node.js process is restarted,
* the crawling can continue where it left off. The automated persisting is launched upon receiving the `persistState`
* event that is periodically emitted by {@link events|Apify.events}.
*
* The internal state is closely tied to the provided sources (URLs). If the sources change on actor restart, the state will become corrupted and
* `RequestList` will raise an exception. This typically happens when the sources is a list of URLs downloaded from the web.
* In such case, use the `persistSourcesKey` option in conjunction with `persistStateKey`,
* to make the `RequestList` store the initial sources to the default key-value store and load them after restart,
* which will prevent any issues that a live list of URLs might cause.
*
* **Example usage:**
*
* ```javascript
* const requestList = new Apify.RequestList({
* sources: [
* // Separate requests
* { url: 'http://www.example.com/page-1', method: 'GET', headers: {} },
* { url: 'http://www.example.com/page-2', userData: { foo: 'bar' }},
*
* // Bulk load of URLs from file `http://www.example.com/my-url-list.txt`
* // Note that all URLs must start with http:// or https://
* { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } },
* ],
*
* // Ensure both the sources and crawling state of the request list is persisted,
* // so that on actor restart, the crawling will continue where it left off
* persistStateKey: 'my-state',
* persistSourcesKey: 'my-sources'
* });
*
* // This call loads and parses the URLs from the remote file.
* await requestList.initialize();
*
* // Get requests from list
* const request1 = await requestList.fetchNextRequest();
* const request2 = await requestList.fetchNextRequest();
* const request3 = await requestList.fetchNextRequest();
*
* // Mark some of them as handled
* await requestList.markRequestHandled(request1);
*
* // If processing fails then reclaim it back to the list
* await requestList.reclaimRequest(request2);
* ```
*
* @param {Object} options All `RequestList` parameters are passed
* via an options object with the following keys:
* @param {Array} options.sources
* An array of sources of URLs for the `RequestList`. It can be either an array of plain objects that
* define the `url` property, or an array of instances of the {@link Request} class.
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
* which will instruct `RequestList` to download the source URLs from a given remote location.
* The URLs will be parsed from the received response.
*
* ```
* [
* // One URL
* { method: 'GET', url: 'http://example.com/a/b' },
* // Batch import of URLs from a file hosted on the web
* { method: 'POST', requestsFromUrl: 'http://example.com/urls.txt' },
* // Batch import combined with regex.
* { method: 'POST', requestsFromUrl: 'http://example.com/urls.txt', regex: /https:\/\/example.com\/.+/ },
* ]
* ```
* @param {String} [options.persistStateKey]
* Identifies the key in the default key-value store under which `RequestList` periodically stores its
* state (i.e. which URLs were crawled and which not).
* If the actor is restarted, `RequestList` will read the state
* and continue where it left off.
*
* If `persistStateKey` is not set, `RequestList` will always start from the beginning,
* and all the source URLs will be crawled again.
* @param {String} [options.persistSourcesKey]
* Identifies the key in the default key-value store under which the `RequestList` persists its
* sources (i.e. the lists of URLs) during the {@link RequestList#initialize} call.
* This is necessary if `persistStateKey` is set and the source URLs might potentially change,
* to ensure consistency of the source URLs and state object. However, it comes with some storage and performance overheads.
*
* If `persistSourcesKey` is not set, {@link RequestList#initialize} will always fetch the sources
* from their origin, check that they are consistent with the restored state (if any)
* and throw an error if they are not.
* @param {Object} [options.state]
* The state object that the `RequestList` will be initialized from.
* It is in the form as returned by `RequestList.getState()`, such as follows:
*
* ```
* {
* nextIndex: 5,
* nextUniqueKey: 'unique-key-5',
* inProgress: {
* 'unique-key-1': true,
* 'unique-key-4': true,
* },
* }
* ```
*
* Note that the preferred (and simpler) way to persist the state of crawling of the `RequestList`
* is to use the `persistStateKey` option instead.
* @param {Boolean} [options.keepDuplicateUrls=false]
* By default, `RequestList` will deduplicate the provided URLs. Default deduplication is based
* on the `uniqueKey` property of passed source {@link Request} objects.
*
* If the property is not present, it is generated by normalizing the URL. If present, it is kept intact.
* In any case, only one request per `uniqueKey` is added to the `RequestList` resulting in removal
* of duplicate URLs / unique keys.
*
* Setting `keepDuplicateUrls` to `true` will append an additional identifier to the `uniqueKey`
* of each request that does not already include a `uniqueKey`. Therefore, duplicate
* URLs will be kept in the list. It does not protect the user from having duplicates in user set
* `uniqueKey`s however. It is the user's responsibility to ensure uniqueness of their unique keys
* if they wish to keep more than just a single copy in the `RequestList`.
*/
export class RequestList {
    constructor(options = {}) {
        checkParamOrThrow(options, 'options', 'Object');

        const { sources, persistStateKey, persistSourcesKey, state, keepDuplicateUrls = false } = options;

        checkParamOrThrow(sources, 'options.sources', 'Array');
        checkParamOrThrow(state, 'options.state', 'Maybe Object');
        checkParamOrThrow(persistStateKey, 'options.persistStateKey', 'Maybe String');
        checkParamOrThrow(persistSourcesKey, 'options.persistSourcesKey', 'Maybe String');
        checkParamOrThrow(keepDuplicateUrls, 'options.keepDuplicateUrls', 'Maybe Boolean');

        // Array of all requests from all sources, in the order as they appeared in sources.
        // All requests in the array have distinct uniqueKey!
        this.requests = [];

        // Index to the next item in requests array to fetch. All previous requests are either handled or in progress.
        this.nextIndex = 0;

        // Dictionary, key is Request.uniqueKey, value is corresponding index in the requests array.
        this.uniqueKeyToIndex = {};

        // Dictionary of requests that were returned by fetchNextRequest().
        // The key is uniqueKey, value is true.
        // TODO: Change this to Set
        this.inProgress = {};

        // Dictionary of requests for which reclaimRequest() was called.
        // The key is uniqueKey, value is true. TODO: Change this to Set
        // Note that reclaimed is always a subset of inProgress!
        this.reclaimed = {};

        this.persistStateKey = persistStateKey;
        this.persistSourcesKey = persistSourcesKey;
        this.initialState = state;

        // If this option is set then all requests will get a pre-generated unique ID and duplicate URLs will be kept in the list.
        this.keepDuplicateUrls = keepDuplicateUrls;

        // Starts as true because until we handle the first request, the list is effectively persisted by doing nothing.
        this.isStatePersisted = true;
        // Starts as false because we don't know yet and sources might change in the meantime (eg. download from live list).
        this.areSourcesPersisted = false;
        this.isLoading = false;
        this.isInitialized = false;
        this.sources = sources;
    }

    /**
     * Loads all remote sources of URLs and potentially starts periodic state persistence.
     * This function must be called before you can start using the instance in a meaningful way.
     *
     * @returns {Promise}
     */
    async initialize() {
        if (this.isLoading) {
            throw new Error('RequestList sources are already loading or were loaded.');
        }
        this.isLoading = true;

        const [state, sources] = await this._loadStateAndSources();

        // If there are no sources, it just means that we've not persisted any (yet).
        if (sources) this.areSourcesPersisted = true;
        const actualSources = sources || this.sources;

        // We'll load all sources in sequence to ensure that they get loaded in the right order.
        const sourcesWithRequestsFromUrl = [];
        for (const source of actualSources) {
            if (source.requestsFromUrl) {
                const fetchedRequests = await this._fetchRequestsFromUrl(source);
                sourcesWithRequestsFromUrl.push(...fetchedRequests);
                await this._addFetchedRequests(source, fetchedRequests);
            } else {
                sourcesWithRequestsFromUrl.push(source);
                this._addRequest(source);
            }
        }

        // Replace sources with the expanded list, where every `requestsFromUrl` entry
        // has been substituted by the requests downloaded from it.
        this.sources = sourcesWithRequestsFromUrl;

        this._restoreState(state);
        this.isInitialized = true;

        if (this.persistSourcesKey && !this.areSourcesPersisted) await this._persistSources();
        if (this.persistStateKey) {
            events.on(ACTOR_EVENT_NAMES_EX.PERSIST_STATE, this.persistState.bind(this));
        }
    }

    /**
     * Persists the current state of the `RequestList` into the default {@link KeyValueStore}.
     * The state is persisted automatically in regular intervals, but calling this method manually
     * is useful in cases where you want to have the most current state available after you pause
     * or stop fetching its requests. For example after you pause or abort a crawl. Or just before
     * a server migration.
     *
     * A failed write is logged and does not reject the returned promise; the state then stays
     * flagged as not persisted, so the next call tries again.
     *
     * @return {Promise}
     */
    async persistState() {
        if (!this.persistStateKey) {
            throw new Error('RequestList: Cannot persist state. options.persistStateKey is not set.');
        }
        if (this.isStatePersisted) return;

        try {
            const snapshot = this.getState();
            // The flag is raised BEFORE awaiting the write. If the list changes while the write
            // is in flight, fetchNextRequest()/markRequestHandled() lower it again and that
            // newer state is persisted on the next call instead of being silently skipped.
            this.isStatePersisted = true;
            await setValue(this.persistStateKey, snapshot);
        } catch (err) {
            this.isStatePersisted = false;
            log.exception(err, 'RequestList attempted to persist state, but failed.');
        }
    }

    /**
     * Unlike persistState(), this is used only internally, since the sources
     * are automatically persisted at RequestList initialization (if the persistSourcesKey is set),
     * but there's no reason to persist it again afterwards, because RequestList is immutable.
     *
     * @return {Promise}
     * @ignore
     */
    async _persistSources() {
        await setValue(this.persistSourcesKey, this.sources);
        this.areSourcesPersisted = true;
    }

    /**
     * Restores RequestList state from a state object.
     * The passed object is only read; it is never modified nor retained by the instance.
     *
     * @param {Object} state
     * @ignore
     */
    _restoreState(state) {
        // If there's no state it means we've not persisted any (yet).
        if (!state) return;

        // Validate the state against the loaded requests.
        if (typeof state.nextIndex !== 'number' || state.nextIndex < 0) {
            throw new Error('The state object is invalid: nextIndex must be a non-negative number.');
        }
        if (state.nextIndex > this.requests.length) {
            throw new Error('The state object is not consistent with RequestList: too few requests loaded.');
        }
        if (state.nextIndex < this.requests.length
            && this.requests[state.nextIndex].uniqueKey !== state.nextUniqueKey) {
            throw new Error('The state object is not consistent with RequestList: the order of URLs seems to have changed.');
        }

        // Build a fresh dictionary instead of adopting (and mutating) the caller's object.
        // A state without the inProgress field yields no keys and is treated as "nothing in progress".
        const restoredInProgress = {};
        const deleteFromInProgress = [];
        _.keys(state.inProgress).forEach((uniqueKey) => {
            const index = this.uniqueKeyToIndex[uniqueKey];
            if (typeof index !== 'number') {
                throw new Error('The state object is not consistent with RequestList: unknown uniqueKey is present in the state.');
            }
            if (index >= state.nextIndex) {
                deleteFromInProgress.push(uniqueKey);
            } else {
                restoredInProgress[uniqueKey] = true;
            }
        });

        // WORKAROUND:
        // It happened to some users that state object contained something like:
        // {
        //   "nextIndex": 11308,
        //   "nextUniqueKey": "https://www.anychart.com",
        //   "inProgress": {
        //      "https://www.ams360.com": true,
        //      ...
        //      "https://www.anychart.com": true,
        // }
        // Which then caused error "The request is not being processed (uniqueKey: https://www.anychart.com)"
        // As a workaround, we just skip all inProgress requests whose index >= nextIndex,
        // since they will be crawled again.
        if (deleteFromInProgress.length) {
            log.warning('RequestList\'s in-progress field is not consistent, skipping invalid in-progress entries', { deleteFromInProgress });
        }

        this.nextIndex = state.nextIndex;
        this.inProgress = restoredInProgress;

        // All in-progress requests need to be recrawled
        this.reclaimed = _.clone(this.inProgress);
    }

    /**
     * Attempts to load state and sources using the `RequestList` configuration
     * and returns a tuple of [state, sources] where each may be null if not loaded.
     *
     * @return {Promise<Array>}
     * @ignore
     */
    async _loadStateAndSources() {
        let state;
        if (this.initialState) {
            log.debug('RequestList: Loading previous state from options.state argument.');
            state = this.initialState;
        } else if (this.persistStateKey) {
            log.debug('RequestList: Loading previous state from key value store using the persistStateKey.');
            // Intentionally not awaited here, so that it can load in parallel with the sources below.
            state = getValue(this.persistStateKey);
        }

        if (this.persistSourcesKey) {
            log.debug('RequestList: Loading sources from key value store using the persistSourcesKey.');
            return Promise.all([state, getValue(this.persistSourcesKey)]);
        }
        return [await state, null];
    }

    /**
     * Returns an object representing the internal state of the `RequestList` instance.
     * The returned object is a snapshot: later changes of the list are not reflected in it.
     * Note that the object's fields can change in future releases.
     *
     * @returns {Object}
     */
    getState() {
        this._ensureIsInitialized();

        return {
            nextIndex: this.nextIndex,
            nextUniqueKey: this.nextIndex < this.requests.length
                ? this.requests[this.nextIndex].uniqueKey
                : null,
            // Shallow copy, so that the snapshot stays consistent with nextIndex
            // and callers cannot modify the internal dictionary.
            inProgress: { ...this.inProgress },
        };
    }

    /**
     * Resolves to `true` if the next call to {@link RequestList#fetchNextRequest} function
     * would return `null`, otherwise it resolves to `false`.
     * Note that even if the list is empty, there might be some pending requests currently being processed.
     *
     * @returns {Promise<Boolean>}
     */
    async isEmpty() {
        this._ensureIsInitialized();

        return !getFirstKey(this.reclaimed) && this.nextIndex >= this.requests.length;
    }

    /**
     * Returns `true` if all requests were already handled and there are no more left.
     *
     * @returns {Promise<Boolean>}
     */
    async isFinished() {
        this._ensureIsInitialized();

        return !getFirstKey(this.inProgress) && this.nextIndex >= this.requests.length;
    }

    /**
     * Gets the next {@link Request} to process. First, the function gets a request previously reclaimed
     * using the {@link RequestList#reclaimRequest} function, if there is any.
     * Otherwise it gets the next request from sources.
     *
     * The function's `Promise` resolves to `null` if there are no more
     * requests to process.
     *
     * @returns {Promise<Request>}
     */
    async fetchNextRequest() {
        this._ensureIsInitialized();

        // First return reclaimed requests if any.
        // A reclaimed request is still listed in inProgress, so the persisted state does not change.
        const uniqueKey = getFirstKey(this.reclaimed);
        if (uniqueKey) {
            delete this.reclaimed[uniqueKey];
            const index = this.uniqueKeyToIndex[uniqueKey];
            return this.requests[index];
        }

        // Otherwise return next request.
        if (this.nextIndex < this.requests.length) {
            const request = this.requests[this.nextIndex];
            this.inProgress[request.uniqueKey] = true;
            this.nextIndex++;
            this.isStatePersisted = false;
            return request;
        }

        return null;
    }

    /**
     * Marks request as handled after successful processing.
     *
     * @param {Request} request
     *
     * @returns {Promise}
     */
    async markRequestHandled(request) {
        // Checked first, so that using an uninitialized list reports the actual cause.
        this._ensureIsInitialized();

        const { uniqueKey } = request;
        this._ensureUniqueKeyValid(uniqueKey);
        this._ensureInProgressAndNotReclaimed(uniqueKey);

        delete this.inProgress[uniqueKey];
        this.isStatePersisted = false;
    }

    /**
     * Reclaims request to the list if its processing failed.
     * The request will become available in the next `this.fetchNextRequest()`.
     *
     * @param {Request} request
     *
     * @returns {Promise}
     */
    async reclaimRequest(request) {
        // Checked first, so that using an uninitialized list reports the actual cause.
        this._ensureIsInitialized();

        const { uniqueKey } = request;
        this._ensureUniqueKeyValid(uniqueKey);
        this._ensureInProgressAndNotReclaimed(uniqueKey);

        this.reclaimed[uniqueKey] = true;
    }

    /**
     * Adds all requests fetched from a remote resource to the list
     * and logs how many of them were imported and how many were duplicates.
     *
     * @ignore
     */
    async _addFetchedRequests(source, fetchedRequests) {
        const { requestsFromUrl, regex } = source;
        const originalLength = this.requests.length;

        fetchedRequests.forEach(request => this._addRequest(request));

        const fetchedCount = fetchedRequests.length;
        const importedCount = this.requests.length - originalLength;

        log.info('RequestList: list fetched.', {
            requestsFromUrl,
            regex,
            fetchedCount,
            importedCount,
            duplicateCount: fetchedCount - importedCount,
            sample: JSON.stringify(fetchedRequests.slice(0, 5)),
        });
    }

    /**
     * Fetches URLs from requestsFromUrl and returns them in format of list of requests.
     * All properties of the source other than `requestsFromUrl` and `regex`
     * are copied to every returned request.
     *
     * @param source
     * @return {Promise<Object[]|Array>}
     * @ignore
     */
    async _fetchRequestsFromUrl(source) {
        const sharedOpts = _.omit(source, 'requestsFromUrl', 'regex');
        const { requestsFromUrl, regex } = source;
        const { downloadListOfUrls } = publicUtils;

        // Download remote resource and parse URLs.
        let urlsArr;
        try {
            urlsArr = await downloadListOfUrls({ url: requestsFromUrl, urlRegExp: regex });
        } catch (err) {
            throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`);
        }

        // Skip if resource contained no URLs.
        if (!urlsArr.length) {
            log.warning('RequestList: list fetched, but it is empty.', { requestsFromUrl, regex });
            return [];
        }

        return urlsArr.map(url => _.extend({ url }, sharedOpts));
    }

    /**
     * Adds given request.
     * If the `opts` parameter is a plain object and not an instance of a `Request`, then the function
     * creates a `Request` instance.
     *
     * @ignore
     */
    _addRequest(opts) {
        const hasUniqueKey = !!opts.uniqueKey;

        const request = opts instanceof Request
            ? opts
            : new Request(opts);

        // Add index to uniqueKey if duplicates are to be kept
        if (this.keepDuplicateUrls && !hasUniqueKey) {
            request.uniqueKey += `-${this.requests.length}`;
        }

        const { uniqueKey } = request;
        this._ensureUniqueKeyValid(uniqueKey);

        // Skip requests with duplicate uniqueKey
        if (!Object.prototype.hasOwnProperty.call(this.uniqueKeyToIndex, uniqueKey)) {
            this.uniqueKeyToIndex[uniqueKey] = this.requests.length;
            this.requests.push(request);
        } else if (this.keepDuplicateUrls) {
            log.warning(`RequestList: Duplicate uniqueKey: ${uniqueKey} found while the keepDuplicateUrls option was set. Check your sources' unique keys.`); // eslint-disable-line max-len
        }
    }

    /**
     * Helper function that validates unique key.
     * Throws an error if uniqueKey is not a non-empty string.
     *
     * @ignore
     */
    _ensureUniqueKeyValid(uniqueKey) { // eslint-disable-line class-methods-use-this
        if (typeof uniqueKey !== 'string' || !uniqueKey) {
            throw new Error('Request object\'s uniqueKey must be a non-empty string');
        }
    }

    /**
     * Checks that request is in progress and not reclaimed and throws an error otherwise.
     *
     * @ignore
     */
    _ensureInProgressAndNotReclaimed(uniqueKey) {
        // Only own properties count. A plain lookup would also match names inherited from
        // Object.prototype, so a uniqueKey such as 'constructor' would look "already reclaimed".
        const isSetIn = dict => Object.prototype.hasOwnProperty.call(dict, uniqueKey) && !!dict[uniqueKey];

        if (!isSetIn(this.inProgress)) {
            throw new Error(`The request is not being processed (uniqueKey: ${uniqueKey})`);
        }
        if (isSetIn(this.reclaimed)) {
            throw new Error(`The request was already reclaimed (uniqueKey: ${uniqueKey})`);
        }
    }

    /**
     * Throws an error if request list wasn't initialized.
     *
     * @ignore
     */
    _ensureIsInitialized() {
        if (!this.isInitialized) {
            throw new Error('RequestList is not initialized; you must call "await requestList.initialize()" before using it!');
        }
    }

    /**
     * Returns the total number of unique requests present in the `RequestList`.
     *
     * @returns {Number}
     */
    length() {
        this._ensureIsInitialized();

        return this.requests.length;
    }

    /**
     * Returns number of handled requests, i.e. those that were fetched and are no longer in progress.
     *
     * @returns {Number}
     */
    handledCount() {
        this._ensureIsInitialized();

        return this.nextIndex - _.size(this.inProgress);
    }
}
/**
* Opens a request list and returns a promise resolving to an instance
* of the {@link RequestList} class that is already initialized.
*
* {@link RequestList} represents a list of URLs to crawl, which is always stored in memory.
* To enable picking up where left off after a process restart, the request list sources
* are persisted to the key value store at initialization of the list. Then, while crawling,
* a small state object is regularly persisted to keep track of the crawling status.
*
* For more details and code examples, see the {@link RequestList} class.
*
* **Example usage:**
*
* ```javascript
* const sources = [
* 'https://www.example.com',
* 'https://www.google.com',
* 'https://www.bing.com'
* ];
*
* const requestList = await Apify.openRequestList('my-name', sources);
* ```
*
* @param {string|null} listName
* Name of the request list to be opened. Setting a name enables the `RequestList`'s state to be persisted
* in the key value store. This is useful in case of a restart or migration. Since `RequestList` is only
* stored in memory, a restart or migration wipes it clean. Setting a name will enable the `RequestList`'s
* state to survive those situations and continue where it left off.
*
* The name will be used as a prefix in key value store, producing keys such as `NAME-REQUEST_LIST_STATE`
* and `NAME-REQUEST_LIST_SOURCES`.
*
* If `null`, the list will not be persisted and will only be stored in memory. Process restart
* will then cause the list to be crawled again from the beginning. We suggest always using a name.
* @param {Object[]|string[]} sources
* An array of sources of URLs for the `RequestList`.
* It can be either an array of plain objects that
* define the `url` property, or an array of instances of the {@link Request} class.
*
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
* which will instruct `RequestList` to download the source URLs from a given remote location.
* The URLs will be parsed from the received response. In this case you can limit the URLs
* using `regex` parameter containing regular expression pattern for URLs to be included.
*
* For details, see the [`RequestList`](requestlist#new_RequestList_new)
* constructor options.
* @param {Object} [options]
* The [`new RequestList`](requestlist#new_RequestList_new) options. Note that the listName parameter supersedes
* the `persistStateKey` and `persistSourcesKey` options and the sources parameter supersedes the `sources` option.
* @returns {Promise<RequestList>}
* @memberof module:Apify
* @name openRequestList
*/
export const openRequestList = async (listName, sources, options = {}) => {
    checkParamOrThrow(listName, 'listName', 'String | Null');
    checkParamOrThrow(sources, 'sources', '[Object | String]');
    if (!sources.length) throw new Error('Parameter sources must not be an empty array.');
    checkParamOrThrow(options, 'options', 'Object');

    // Support strings and objects, including arrays that mix both. Every element is
    // normalized on its own; deciding by the first element only would wrap objects
    // into `{ url: <object> }` or let raw strings through in a mixed array.
    const normalizedSources = sources.map(source => (typeof source === 'string' ? { url: source } : source));

    // The explicit keys come after the spread so that `listName` and `sources`
    // supersede the corresponding entries of `options`.
    const requestList = new RequestList({
        ...options,
        persistStateKey: listName ? `${listName}-${STATE_PERSISTENCE_KEY}` : null,
        persistSourcesKey: listName ? `${listName}-${SOURCES_PERSISTENCE_KEY}` : null,
        sources: normalizedSources,
    });
    await requestList.initialize();

    return requestList;
};