Skip to content

Commit

Permalink
Merge pull request #304 from kossidts/dev
Browse files Browse the repository at this point in the history
Replacement of istanbul with nyc
  • Loading branch information
mike442144 committed Mar 25, 2019
2 parents 808e208 + a712e5d commit 2dce18f
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 39 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ npm-debug.log
.vscode/
*~
coverage
.nyc_*
71 changes: 34 additions & 37 deletions lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ var log = defaultLog;

// Crawler constructor: rejects options that were removed from the public
// API, then hands everything else to init() for actual setup.
function Crawler (options) {
    var self = this;

    options = options || {};

    // "onDrain" and "cache" are no longer supported; fail loudly so callers
    // migrate instead of silently losing behavior.
    var removedOptions = ['onDrain', 'cache'];
    for (var i = 0; i < removedOptions.length; i++) {
        if (removedOptions[i] in options) {
            throw new Error('Support for "onDrain", "cache" has been removed! For more details, see https://github.com/bda-research/node-crawler');
        }
    }

    self.init(options);
}
// augment the prototype for node events using util.inherits
Expand Down Expand Up @@ -95,21 +95,21 @@ Crawler.prototype.init = function init (options) {

// you can use jquery or jQuery
self.options = checkJQueryNaming(self.options);

// Don't make these options persist to individual queries
self.globalOnlyOptions = ['maxConnections', 'rateLimit', 'priorityRange', 'homogeneous', 'skipDuplicates', 'rotateUA'];

self.limiters = new Bottleneck.Cluster(self.options.maxConnections,self.options.rateLimit,self.options.priorityRange, self.options.priority, self.options.homogeneous);
self.seen = new seenreq(self.options.seenreq);
level = self.options.debug?'debug':'info';
level = self.options.debug ? 'debug' : 'info';

if(self.options.logger)
log = self.options.logger.log.bind(self.options.logger);

self.log = log;
self.on('_release', function(){
log('debug','Queue size: %d',this.queueSize);

if(this.limiters.empty)
return this.emit('drain');
});
Expand All @@ -131,7 +131,7 @@ Crawler.prototype._inject = function _inject (response, options, callback) {
if(!whacko){
throw new Error('Please install whacko by your own since `crawler` detected you specify explicitly');
}

$ = whacko.load(response.body);
callback(null, response, options, $);
}else if (options.jQuery === 'cheerio' || options.jQuery.name === 'cheerio' || options.jQuery === true) {
Expand Down Expand Up @@ -180,10 +180,7 @@ Crawler.prototype._inject = function _inject (response, options, callback) {
};

// Returns true when `options` is not a usable request descriptor:
// anything that is null/undefined, or neither a string (a bare URI)
// nor a plain options object, is illegal.
// NOTE(review): the diff artifact here left the old early-return branch
// followed by an unreachable duplicate return; resolved to the intended
// post-commit single-expression form.
Crawler.prototype.isIllegal = function isIllegal (options) {
    return (_.isNull(options) || _.isUndefined(options) || (!_.isString(options) && !_.isPlainObject(options)));
};

Crawler.prototype.direct = function direct (options) {
Expand All @@ -198,7 +195,7 @@ Crawler.prototype.direct = function direct (options) {
}

options = checkJQueryNaming(options);

// direct request does not follow the global preRequest
options.preRequest = options.preRequest || null;

Expand All @@ -210,14 +207,14 @@ Crawler.prototype.direct = function direct (options) {
// direct request does not emit event:'request' by default
options.skipEventRequest = _.isBoolean(options.skipEventRequest) ? options.skipEventRequest : true;

self.globalOnlyOptions.forEach(globalOnlyOption=>delete options[globalOnlyOption]);
self.globalOnlyOptions.forEach(globalOnlyOption => delete options[globalOnlyOption]);

self._buildHttpRequest(options);
};

Crawler.prototype.queue = function queue (options) {
var self = this;

// Did you get a single object or string? Make it compatible.
options = _.isArray(options) ? options : [options];

Expand All @@ -242,27 +239,27 @@ Crawler.prototype._pushToQueue = function _pushToQueue (options) {

_.defaults(options, self.options);
options.headers = _.assign({}, self.options.headers, options.headers);

// Remove all the global options from our options
// TODO we are doing this for every _pushToQueue, find a way to avoid this
self.globalOnlyOptions.forEach(globalOnlyOption=>delete options[globalOnlyOption]);
self.globalOnlyOptions.forEach(globalOnlyOption => delete options[globalOnlyOption]);

// If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
if (self.options.skipDuplicates && self.seen.exists(options, options.seenreq)) {
return;
}

self.emit('schedule',options);

self.limiters.key(options.limiter||'default').submit(options.priority,function(done, limiter){
options.release = function(){ done();self.emit('_release'); };
if(!options.callback)
options.callback = options.release;

if (limiter) {
self.emit('limiterChange', options, limiter);
}

if (options.html) {
self._onContent(null, options, {body:options.html,headers:{'content-type':'text/html'}});
} else if (typeof options.uri === 'function') {
Expand All @@ -287,12 +284,12 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
// - some versions of "request" apply the second parameter as a
// property called "callback" to the first parameter
// - keeps the query object fresh in case of a retry

var ropts = _.assign({},options);

if (!ropts.headers) { ropts.headers={}; }
if (ropts.forceUTF8) {ropts.encoding=null;}
// specifying json in request will have request sets body to JSON representation of value and
// specifying json in request will have request sets body to JSON representation of value and
// adds Content-type: application/json header. Additionally, parses the response body as JSON
// so the response will be JSON object, no need to deal with encoding
if (ropts.json) {options.encoding=null;}
Expand All @@ -305,11 +302,11 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
ropts.headers['User-Agent'] = ropts.userAgent;
}
}

if (ropts.referer) {
ropts.headers.Referer = ropts.referer;
}

if (ropts.proxies && ropts.proxies.length) {
ropts.proxy = ropts.proxies[0];
}
Expand All @@ -326,7 +323,7 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
}
return;
}

if(ropts.skipEventRequest !== true) {
self.emit('request',ropts);
}
Expand All @@ -337,7 +334,7 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
if (error) {
return self._onContent(error, options);
}

self._onContent(error,options,response);
});
};
Expand All @@ -353,8 +350,8 @@ Crawler.prototype._onContent = function _onContent (error, options, response) {
var self = this;

if (error) {
log('error','Error '+error+' when fetching '+ (options.uri||options.url)+(options.retries?' ('+options.retries+' retries left)':''));
log('error','Error '+error+' when fetching '+ (options.uri||options.url)+(options.retries ? ' ('+options.retries+' retries left)' : ''));

if (options.retries) {
self.options.skipDuplicates = false;
setTimeout(function() {
Expand All @@ -365,10 +362,10 @@ Crawler.prototype._onContent = function _onContent (error, options, response) {
} else{
options.callback(error,{options:options},options.release);
}

return;
}

if (!response.body) { response.body=''; }

log('debug','Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...');
Expand All @@ -379,7 +376,7 @@ Crawler.prototype._onContent = function _onContent (error, options, response) {
log('error',e);
return options.callback(e,{options:options},options.release);
}

response.options = options;

if(options.method === 'HEAD' || !options.jQuery){
Expand All @@ -393,7 +390,7 @@ Crawler.prototype._onContent = function _onContent (error, options, response) {
}

log('debug','Injecting');

self._inject(response, options, self._injected.bind(self));
};

Expand All @@ -406,11 +403,11 @@ Crawler.prototype._injected = function(errors, response, options, $){

Crawler.prototype._doEncoding = function(options,response){
var self = this;

if(options.encoding === null){
return;
}

if (options.forceUTF8) {
var charset = options.incomingEncoding || self._parseCharset(response);
response.charset = charset;
Expand All @@ -420,7 +417,7 @@ Crawler.prototype._doEncoding = function(options,response){
response.body = iconvLite.decode(response.body, charset);
}
}

response.body = response.body.toString();
};

Expand All @@ -433,10 +430,10 @@ Crawler.prototype._parseCharset = function(res){
log('debug','Charset not detected in response headers, please specify using `incomingEncoding`, use `utf-8` by default');
return 'utf-8';
}
var body = res.body instanceof Buffer?res.body.toString():res.body;

var body = res.body instanceof Buffer ? res.body.toString() : res.body;
charset = charsetParser(contentType(res),body,'utf-8');

return charset;
};

Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"scripts": {
"hint": "eslint ./lib/*.js ./tests/*.js",
"test": "mocha --timeout=15000 tests/*.test.js",
"cover": "istanbul cover _mocha --report lcovonly -- --timeout=15000 --reporter spec tests/*.test.js"
"cover": "nyc --reporter=lcovonly --reporter=text --reporter=text-summary mocha --timeout=15000 --reporter spec tests/*.test.js"
},
"repository": {
"type": "git",
Expand All @@ -32,11 +32,11 @@
"chai": "^2.3.0",
"coveralls": "^3.0.2",
"eslint": "^5.3.0",
"istanbul": "^0.4.5",
"jsdom": "^9.6.0",
"mocha": "^5.2.0",
"mocha-testdata": "^1.2.0",
"nock": "^9.2.6",
"nyc": "^13.1.0",
"sinon": "^6.0.0",
"whacko": "^0.19.1"
},
Expand Down

0 comments on commit 2dce18f

Please sign in to comment.