/
webdriver.js
366 lines (316 loc) · 13.5 KB
/
webdriver.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
// import { ChromeLauncher } from 'lighthouse/lighthouse-cli/chrome-launcher';
import { anonymizeProxy, closeAnonymizedProxy } from 'proxy-chain';
import { ENV_VARS } from 'apify-shared/consts';
import { DEFAULT_USER_AGENT } from './constants';
import { newPromise } from './utils';
/* global process, require */
// interesting resources:
// https://chromium.googlesource.com/chromium/src/+/master/docs/linux_debugging.md
// http://peter.sh/experiments/chromium-command-line-switches/#user-agent
// https://github.com/SeleniumHQ/selenium/tree/master/javascript/node/selenium-webdriver/example
// logging.installConsoleHandler();
// logging.getLogger('webdriver.http').setLevel(logging.Level.ALL);
// TODO: on first use of Apify.browse(), print out the version of Chrome and ChromeDriver
/**
 * Builds the default options for the browse() function from the environment
 * variables of the current process. It is a function (not a constant) to enable unit testing.
 *
 * Headless mode is on by default only when the HEADLESS variable equals '1'
 * and the XVFB variable does not equal '1'.
 *
 * @returns {Object} A fresh object with `headless`, `browserName`, `proxyUrl` and `userAgent` fields.
 *
 * @ignore
 */
export const getDefaultBrowseOptions = () => {
    const headlessRequested = process.env[ENV_VARS.HEADLESS] === '1';
    const xvfbEnabled = process.env[ENV_VARS.XVFB] === '1';

    return {
        headless: headlessRequested && !xvfbEnabled,
        browserName: 'chrome',
        proxyUrl: null,
        userAgent: null,
    };
};
// True when the current process runs on Windows; the Browser constructor uses it
// to add the --disable-gpu Chrome flag in headless mode.
const isWin = process.platform === 'win32';
/**
 * Represents a single web browser process.
 * Currently it is just a thin wrapper of Selenium's WebDriver instance.
 *
 * Lifecycle: construct, call _initialize() to start the browser, and call close()
 * to quit it and release the local anonymizing proxy (if one was created).
 *
 * @ignore
 */
export class Browser {
    /**
     * Prepares WebDriver capabilities and Chrome options. The browser itself
     * is only started by _initialize().
     *
     * @param {Object} [options] Overrides for the values returned by getDefaultBrowseOptions().
     *   When `extraChromeArguments` is present, it is passed to Chrome as additional arguments.
     */
    constructor(options) {
        this.options = Object.assign(getDefaultBrowseOptions(), options);

        // This is an optional dependency because it is quite large, only require it when used
        const { Capabilities, Builder } = require('selenium-webdriver'); // eslint-disable-line global-require
        const chrome = require('selenium-webdriver/chrome'); // eslint-disable-line global-require

        // Result of anonymizeProxy(); set by _initialize() when options.proxyUrl is used
        this.anonymizedProxyUrl = null;

        // logging.installConsoleHandler();
        // logging.getLogger('webdriver.http').setLevel(logging.Level.ALL);

        // See https://github.com/SeleniumHQ/selenium/wiki/DesiredCapabilities for reference.
        this.capabilities = new Capabilities();
        this.capabilities.set('browserName', this.options.browserName);

        // Chrome-specific options
        // By default, Selenium already defines a long list of command-line options
        // to enable browser automation, here we add a few other ones
        // (inspired by Lighthouse, see lighthouse/lighthouse-cli/chrome-launcher)
        this.chromeOptions = new chrome.Options();
        this.chromeOptions.addArguments('--disable-translate');
        this.chromeOptions.addArguments('--safebrowsing-disable-auto-update');
        if (this.options.headless) {
            this.chromeOptions.addArguments('--headless', '--no-sandbox');
            if (isWin) {
                // Temporarily needed on Windows
                this.chromeOptions.addArguments('--disable-gpu');
            }
        }
        this.chromeOptions.addArguments(`--user-agent=${this.options.userAgent || DEFAULT_USER_AGENT}`);
        if (this.options.extraChromeArguments) {
            this.chromeOptions.addArguments(this.options.extraChromeArguments);
        }

        this.builder = new Builder();

        // Instance of Selenium's WebDriver
        this.webDriver = null;
    }

    /**
     * Applies the proxy settings (if any) and builds the WebDriver instance.
     * If building the WebDriver throws, the anonymized proxy created here is closed
     * again before the error is propagated, because the caller has no other way to release it.
     *
     * @returns {Promise<Browser>} Resolves to this instance once `webDriver` is set.
     */
    _initialize() {
        let proxyReady = null;

        // Applies options.proxyUrl setting to the WebDriver's Capabilities and Chrome Options.
        // For proxy servers with authentication, this class starts a local proxy server
        // NOTE: to view effective proxy settings in Chrome, open chrome://net-internals/#proxy
        if (this.options.proxyUrl) {
            // NOTE: call anonymizeProxy() outside of promise, so that errors in proxyUrl are thrown!
            proxyReady = anonymizeProxy(this.options.proxyUrl)
                .then((anonymizedUrl) => {
                    this.anonymizedProxyUrl = anonymizedUrl;
                    if (/^chrome$/i.test(this.options.browserName)) {
                        // In Chrome, Capabilities.setProxy() has no effect,
                        // so we setup the proxy manually
                        this.chromeOptions.addArguments(`--proxy-server=${this.anonymizedProxyUrl}`);
                    } else {
                        const proxyConfig = {
                            proxyType: 'MANUAL',
                            httpProxy: this.anonymizedProxyUrl,
                            sslProxy: this.anonymizedProxyUrl,
                            ftpProxy: this.anonymizedProxyUrl,
                        };
                        this.capabilities.setProxy(proxyConfig);
                    }
                });
        }

        // Ensure that the returned promise is of type set in setPromiseDependency()
        return newPromise()
            .then(() => {
                return proxyReady;
            })
            .then(() => {
                try {
                    this.webDriver = this.builder
                        .setChromeOptions(this.chromeOptions)
                        .withCapabilities(this.capabilities)
                        .build();
                } catch (buildErr) {
                    // Do not leave the proxy running when the browser could not be created.
                    // The build error is reported even if closing the proxy fails too.
                    const rethrow = () => { throw buildErr; };
                    return this._closeProxy().then(rethrow, rethrow); // eslint-disable-line no-underscore-dangle
                }
                return this;
            });
    }

    /**
     * Closes the anonymized proxy created by _initialize(), if there is one,
     * and clears `anonymizedProxyUrl` once it has been closed.
     *
     * @returns {Promise} Resolves when the proxy was closed or when there was nothing to close.
     */
    _closeProxy() {
        const proxyUrl = this.anonymizedProxyUrl;
        if (!proxyUrl) return newPromise();

        return newPromise()
            .then(() => {
                return closeAnonymizedProxy(proxyUrl, true);
            })
            .then(() => {
                this.anonymizedProxyUrl = null;
            });
    }

    /**
     * Quits the WebDriver (if it was started) and closes the anonymized proxy (if any).
     * The proxy is closed even when quitting the WebDriver fails; in that case
     * the returned promise is rejected with the quit error and `webDriver` is left set.
     *
     * @returns {Promise} Resolves when both the browser and the proxy were closed.
     */
    close() {
        return newPromise()
            .then(() => {
                if (this.webDriver) {
                    return this.webDriver.quit();
                }
                return undefined;
            })
            .then(
                () => {
                    return this._closeProxy(); // eslint-disable-line no-underscore-dangle
                },
                (quitErr) => {
                    // Still release the proxy, then report the original failure.
                    const rethrow = () => { throw quitErr; };
                    return this._closeProxy().then(rethrow, rethrow); // eslint-disable-line no-underscore-dangle
                }
            )
            .then(() => {
                this.webDriver = null;
            });
    }
}
/**
 * Normalizes arguments for Apify.browse() and fills in default values.
 * The function is exported to allow unit testing.
 *
 * Accepted call shapes are (url, options), (url), (options) and ().
 * The caller's objects are never modified; a shallow copy of the options is returned.
 *
 * @param {String|Object} [url] Start URL, or the options object when the URL is omitted.
 * @param {Object} [options] Browser settings; `options.url` is used when `url` is not given.
 * @returns {{options: Object}} Copy of the options with `url` always set (defaults to 'about:blank').
 * @throws {Error} If the resulting URL is not a string.
 *
 * @ignore
 */
export const processBrowseArgs = (url, options) => {
    // typeof null is 'object', so null must be excluded explicitly; otherwise
    // processBrowseArgs(null, options) would throw the caller's options away.
    const isOptionsOnlyCall = url !== null && typeof url === 'object';
    const sourceOptions = isOptionsOnlyCall ? url : options;
    const explicitUrl = isOptionsOnlyCall ? null : url;

    const normalized = Object.assign({}, sourceOptions);
    normalized.url = explicitUrl || normalized.url || 'about:blank';
    if (typeof normalized.url !== 'string') throw new Error('Invalid "url" provided.');

    return { options: normalized };
};
/*
OLD INFO FROM README:
### Browser
Apify runtime optionally depends on
the [selenium-webdriver](https://www.npmjs.com/package/selenium-webdriver) package that enables
automation of a web browser.
The simplest way to launch a new web browser is using the `Apify.browse([url,] [options])`
function. For example:
```javascript
const browser = await Apify.browse('https://www.example.com/');
```
or
```javascript
const browser = await Apify.browse({
url: 'https://www.example.com/',
userAgent: 'MyCrawlingBot/1.23',
});
```
The `options` parameter controls settings of the web browser and it has the following properties:
```javascript
{
// Initial URL to open. Note that the url argument in Apify.browse() overrides this value.
// The default value is 'about:blank'
url: String,
// The type of the web browser to use.
// See https://github.com/SeleniumHQ/selenium/wiki/DesiredCapabilities for possible options.
// The default value is 'chrome', which is currently the only fully-supported browser.
browserName: String,
// Indicates whether the browser should be opened in headless mode (i.e. without windows).
// By default, this value is based on the APIFY_HEADLESS environment variable.
headless: Boolean,
// URL of the proxy server, e.g. 'http://username:password@1.2.3.4:55555'.
// Currently only the 'http' proxy type is supported.
// By default it is null, which means no proxy server is used.
proxyUrl: String,
// Overrides the User-Agent HTTP header of the web browser.
// By default it is null, in which case the DEFAULT_USER_AGENT constant is used.
userAgent: String,
}
```
The result of the `Apify.browse()` is a new instance of the `Browser` class,
which represents a web browser instance (possibly with multiple windows or tabs).
The `Browser` class has the following properties:
```javascript
{
// An instance of the Selenium's WebDriver class.
webDriver: Object,
// A method that closes the web browser and releases associated resources.
// The method has no arguments and returns a promise that resolves when the browser was closed.
close: Function,
}
```
The `webDriver` property can be used to manipulate the web browser:
```javascript
const url = await browser.webDriver.getCurrentUrl();
```
For more information, see [WebDriver documentation](http://seleniumhq.github.io/selenium/docs/api/javascript/module/selenium-webdriver/index_exports_WebDriver.html).
When the web browser is no longer needed, it should be closed:
```javascript
await browser.close();
```
*/
// TODO: browse() is only kept for backwards compatibility, get rid of it after no actors are using it!
/**
 * Opens a new web browser, which is attached to Apify debugger so that snapshots are sent to Run console (TODO).
 * Internally, this function calls Selenium WebDriver's Builder command to create a new WebDriver instance.
 * (see http://seleniumhq.github.io/selenium/docs/api/javascript/module/selenium-webdriver/index_exports_Builder.html)
 * The result of the function is a new instance of the Browser class.
 *
 * If the browser cannot be initialized or the start URL cannot be opened, the browser is closed
 * before the returned promise is rejected, because the caller never receives an instance to close.
 *
 * @param {String} [url] start URL to open. Defaults to about:blank
 * @param {Object} [options] settings, their defaults are provided by the getDefaultBrowseOptions() function.
 * @returns {Promise} Resolves to the Browser instance after the start URL was opened.
 *
 * @memberof module:Apify
 * @function
 * @ignore
 */
export const browse = (url, options) => {
    const args = processBrowseArgs(url, options);
    const browser = new Browser(args.options);

    return browser._initialize() // eslint-disable-line no-underscore-dangle
        .then(() => {
            return browser.webDriver.get(args.options.url);
        })
        .then(
            () => {
                return browser;
            },
            (err) => {
                // Release the browser process (and proxy), then report the original error
                // even if the cleanup itself fails.
                const rethrow = () => { throw err; };
                return browser.close().then(rethrow, rethrow);
            }
        );
};
/**
 * Launches a new instance of the Chrome web browser
 * controlled by <a href="http://www.seleniumhq.org/projects/webdriver/" target="_blank">Selenium WebDriver</a>.
 * The function resolves to a new instance of the
 * <a href="http://seleniumhq.github.io/selenium/docs/api/javascript/module/selenium-webdriver/index_exports_WebDriver.html" target="_blank">
 * WebDriver</a>
 * class.
 *
 * To use this function, you need to have Google Chrome and
 * <a href="https://sites.google.com/a/chromium.org/chromedriver/" target="_blank">ChromeDriver</a> installed in your environment.
 * For example, you can use the `apify/actor-node-chrome` base Docker image for your actor - see
 * <a href="https://apify.com/docs/actor#base-images" target="_blank">documentation</a>
 * for more details.
 *
 * For an example of usage, see the <a href="https://apify.com/apify/example-selenium" target="_blank">apify/example-selenium</a> actor.
 *
 * @param {Object} [options] Optional settings passed to WebDriver. Additionally the object can contain the following fields:
 * @param {String} [options.proxyUrl] - URL to a proxy server. Currently only `http://` scheme is supported.
 * Port number must be specified. For example, `http://example.com:1234`.
 * @param {Boolean} [options.headless] - Indicates that the browser will be started in headless mode.
 * If the option is not defined, and the `APIFY_HEADLESS` environment variable has value `1`
 * and `APIFY_XVFB` is NOT `1`, the value defaults to `true`, otherwise it will be `false`.
 * @param {String} [options.userAgent] - User-Agent for the browser.
 * If not provided, the function sets it to a reasonable default.
 * @returns {Promise} Resolves to the WebDriver instance of the newly started browser.
 *
 * @memberof module:Apify
 * @name launchWebDriver
 * @function
 */
export const launchWebDriver = (options) => {
    // No start URL is passed here; only the normalized options are needed.
    const { options: browserOptions } = processBrowseArgs(undefined, options);

    // NOTE: eventually get rid of the Browser class
    const browser = new Browser(browserOptions);
    const whenInitialized = browser._initialize(); // eslint-disable-line no-underscore-dangle

    // TODO: for some reason this doesn't work, the proxy chain will never shut down!!
    // BTW this also prevents us from upgrading to mocha 4+
    // we'll need to find a way to fix this!
    // browser.webDriver.onQuit_ = () => {
    //     if (browser.proxyChain) {
    //         browser.proxyChain.shutdown();
    //         browser.proxyChain = null;
    //     }
    // };
    return whenInitialized.then(() => browser.webDriver);
};
// /**
// * Launches a debugging instance of Chrome on port 9222, without Selenium.
// * This code is kept here for legacy reasons, it's not used.
// * @param {boolean=} headless True (default) to launch Chrome in headless mode.
// * Set to false to launch Chrome normally.
// * @returns {Promise<ChromeLauncher>}
// */
// export const launchChrome = (headless = !!process.env.APIFY_HEADLESS) => {
// // code inspired by https://developers.google.com/web/updates/2017/04/headless-chrome
// // TODO: enable other options e.g. userAgent, windowHeight, windowWidth, proxy
//
// const launcher = new ChromeLauncher({
// port: 9222,
// autoSelectChrome: true, // False to manually select which Chrome install.
// additionalFlags: [
// '--window-size=412,732',
// '--disable-gpu',
// headless ? '--headless' : '',
// ],
// });
//
// return newPromise()
// .then(() => {
// return launcher.run();
// })
// .then(() => {
// return launcher;
// })
// .catch((err) => {
// // Kill Chrome if there's an error.
// return launcher.kill().then(() => {
// throw err;
// }, console.error);
// });
// };