Skip to content

Commit

Permalink
Use a regexp for specifying routes so that '/' can be matched. Added …
Browse files Browse the repository at this point in the history
…spiderCrossDomain option. Prevent call stack size exceeded errors from breaking the spider.
  • Loading branch information
allanmboyd committed May 14, 2012
1 parent e4768a9 commit b3b95dc
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 34 deletions.
2 changes: 1 addition & 1 deletion bin/cli.js
Expand Up @@ -30,7 +30,7 @@ if (!spiderStartUrl || !testDir) {
optimist.showHelp();
process.exit(-1);
} else {
spiderTest.runTests(spiderStartUrl, testDir, null, null, reporters);
spiderTest.runTests(spiderStartUrl, testDir, null, null, reporters, config);
}

var reporters = initialiseReporters(config.get("reporters"));
Expand Down
9 changes: 6 additions & 3 deletions bin/options.js
Expand Up @@ -3,25 +3,28 @@ exports.options = {
"describe": "Path to a json configuration file defining custom options. All command line options - except this one and --help - will be used if present. Options specified directly on the command line override file loaded options."
},
"failOnMissingRoute": {
"default": false,
"default": "false",
"describe": "Set to true if spidering should stop when a route to an encountered link is not available"
},
"help": {
"alias": "h",
"describe": "This message."
},
"reporters": {
"alias": "reporters",
"default": "../lib/reporters/ConsoleReporter",
"describe": "Comma separated list of paths to reporter.js Reporter implementations for reporting test results."
},
"reportOptions": {
"describe": "String of options passed into the createReporter() Reporter function. It is up to the reporter to determine what to do with it."
},
"spiderCrossDomain": {
"default": "false",
"describe": "Allow spidering to continue across different domains"
},
"spiderStartUrl": {
"describe": "The full http url from which to start spidering."
},
"testDir" : {
"describe": "Path to folder containing javascript test definitions "
"describe": "Absolute path to folder containing javascript test definitions "
}
};
6 changes: 3 additions & 3 deletions examples/tests/htmlTests.js
Expand Up @@ -2,13 +2,13 @@ var should = require("should");

exports.tests = {
"Common HTML Tests" : {
urlPattern: "\.html$",
urlPattern: "/",
tests: {
"HTML responses should have a statusCode of 200": function(spiderPayload) {
should.equal(spiderPayload.response.statusCode, 200)
},
"HTML responses should have a content type of text/html; charset=UTF-8": function (spiderPayload) {
should.equal("text/html; charset=UTF-8", spiderPayload.response.headers['content-type']);
"HTML responses should have a content type of text/html": function (spiderPayload) {
spiderPayload.response.headers['content-type'].should.include("text/html");
}
}
}
Expand Down
65 changes: 41 additions & 24 deletions lib/spider.js
Expand Up @@ -77,6 +77,7 @@ function Spider(options) {
this.userAgent = options.userAgent || firefoxUA;
this.cache = options.cache || new NoCache();
this.pool = options.pool || {maxSockets: options.maxSockets};
this.spiderCrossDomain = options.spiderCrossDomain || false;
this.options = options;
this.currentUrl = null;
this.routers = {};
Expand All @@ -100,6 +101,9 @@ Spider.prototype.get = function (url, referer) {
this.urls.push(url);

var u = urlParse(url);
if(!this.domain) {
this.domain = u.hostname;
}
if (!this.routers[u.host]) {
if (this.throwOnMissingRoute) {
errors.NO_ROUTES_FOR_HOST.thro("No routes for host '" + u.host + "'");
Expand All @@ -117,6 +121,9 @@ Spider.prototype.get = function (url, referer) {
return this;
}
}
if(u.hostname !== this.domain && !options.spiderCrossDomain) {
return this;
}

if (referer) {
h.referer = referer;
Expand All @@ -135,6 +142,7 @@ Spider.prototype.get = function (url, referer) {

request.get({url:url, headers:h, pool:self.pool}, function (e, resp) {
self.emit('log', debug, 'Response received for ' + url + '.');

if (e || !resp) {
console.log(e); //todo make sure this exception is visible when it is needed
self.emit('log', debug, 'Error getting URL ' + url);
Expand Down Expand Up @@ -162,37 +170,46 @@ Spider.prototype.route = function (hosts, pattern, cb) {
if (!self.routers[host]) {
self.routers[host] = new routes.Router();
}
self.routers[host].addRoute(pattern, cb);
self.routers[host].addRoute(new RegExp(pattern), cb);
});
return self;
};

Spider.prototype._handler = function (url, referer, response) {
var u = urlParse(url);
var $ = cheerio.load(response.body);
if (this.routers[u.host]) {
var r = this.routers[u.host].match(u.href.slice(u.href.indexOf(u.host) + u.host.length));
r.spider = this;
r.response = response;
r.url = u;

this.currentUrl = url;
r.fn.call(r, $, url);
this.currentUrl = null;
}
if (this.options.autoSpider) {
var auto = this.options.autoSpider;
if (auto & AUTO.ANCHORS) {
spiderUrls("a", this, response, $);
try {
var u = urlParse(url);
var $ = cheerio.load(response.body);
if (this.routers[u.host]) {
var r = this.routers[u.host].match(u.href.slice(u.href.indexOf(u.host) + u.host.length));
r.spider = this;
r.response = response;
r.url = u;

this.currentUrl = url;
r.fn.call(r, $, url);
this.currentUrl = null;
}
if (auto & AUTO.IMAGES) {
spiderUrls("img", this, response, $, "src");
}
if (auto & AUTO.LINKS) {
spiderUrls("link", this, response, $);
if (this.options.autoSpider) {
var auto = this.options.autoSpider;
if (auto & AUTO.ANCHORS) {
spiderUrls("a", this, response, $);
}
if (auto & AUTO.IMAGES) {
spiderUrls("img", this, response, $, "src");
}
if (auto & AUTO.LINKS) {
spiderUrls("link", this, response, $);
}
if (auto & AUTO.SCRIPTS) {
spiderUrls("script", this, response, $, "src");
}
}
if (auto & AUTO.SCRIPTS) {
spiderUrls("script", this, response, $, "src");
} catch(e) {
if (e.message === "Maximum call stack size exceeded") {
this.emit('log', info, e.message);
} else {
this.emit('log', error, e);
throw e;
}
}
};
Expand Down
9 changes: 6 additions & 3 deletions lib/spiderTest.js
Expand Up @@ -19,8 +19,12 @@ var timeout;
* relative to this dir.
* @param {Reporter | Reporter []} reporter one or more instantiated reporters to use for generating the test report. If not specified then
* the console reporter is used. This argument can be a single Reporter or an array of Reporters.
* @param {Config} config a Config object from nconf describing the options for running the tests
*/
exports.runTests = function (startUrl, testsDir, callback, baseDir, reporter, failOnMissingRoute) {
exports.runTests = function (startUrl, testsDir, callback, baseDir, reporter, config) {

var failOnMissingRoute = config && config.get("failOnMissingRoute") !== "false";
var spiderCrossDomain = config && config.get("spiderCrossDomain") !== "false";

var origDir;

Expand Down Expand Up @@ -48,6 +52,7 @@ exports.runTests = function (startUrl, testsDir, callback, baseDir, reporter, fa

var spider = spiderModule({
throwOnMissingRoute: failOnMissingRoute,
spiderCrossDomain: spiderCrossDomain,
autoSpider: spiderOptions.AUTO.ANCHORS | spiderOptions.AUTO.LINKS |
spiderOptions.AUTO.IMAGES | spiderOptions.AUTO.SCRIPTS
});
Expand All @@ -63,8 +68,6 @@ exports.runTests = function (startUrl, testsDir, callback, baseDir, reporter, fa
})
.get(startUrl.href)
.log("error");


};

/**
Expand Down

0 comments on commit b3b95dc

Please sign in to comment.