
Updated test suite, fixed some small issues with resource validity detection.
commit c5404dbcd20305fca195398746053fc670e19345 1 parent 5eaca1d
@cgiffard authored
2  lib/cache-backend-fs.js
@@ -33,7 +33,7 @@ function sanitisePath(path,queueObject) {
// Trim whitespace. If no path is present - assume index.html.
var sanitisedPath = path.length ? path.replace(/\s*$/ig,"") : "index.html";
- var headers = queueObject.stateData.headers;
+ var headers = queueObject.stateData.headers, sanitisedPathParts;
if (sanitisedPath.match(/\?/)) {
sanitisedPathParts = sanitisedPath.split(/\?/g);
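The one-line change above is a scoping fix: `sanitisedPathParts` was previously assigned further down without ever being declared, creating an implicit global. A minimal standalone sketch of the pitfall this declaration avoids (function and variable names here are illustrative, not from the library):

function splitQueryString(path) {
    if (path.match(/\?/)) {
        // Assigning without `var` creates an implicit global in
        // non-strict mode -- the bug the declaration above fixes.
        pathParts = path.split(/\?/g);
        return pathParts[0];
    }
    return path;
}

splitQueryString("search.html?q=test");
console.log(typeof pathParts); // "object" -- the array leaked into global scope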
37 lib/crawler.js
@@ -85,8 +85,8 @@ var Crawler = function(host,initialPath,initialPort,interval) {
// Supported Protocols
this.allowedProtocols = [
- /^http(s)?$/ig, // HTTP & HTTPS
- /^(rss|atom|feed)(\+xml)?$/ig // RSS / XML
+ /^http(s)?$/i, // HTTP & HTTPS
+ /^(rss|atom|feed)(\+xml)?$/i // RSS / XML
];
// Max file size to download/store
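Dropping the `g` flag from the protocol patterns above fixes a subtle statefulness bug: a global regex object retains `lastIndex` between `exec()` calls, so reusing it can make every second check against the same input fail. A self-contained demonstration, assuming nothing beyond core JavaScript:

var stateful = /^http(s)?$/ig; // the old, global-flagged pattern

console.log(!!stateful.exec("http")); // true  (lastIndex advances past the match)
console.log(!!stateful.exec("http")); // false (the next search resumes mid-string)

var stateless = /^http(s)?$/i; // the corrected pattern

console.log(!!stateless.exec("http")); // true
console.log(!!stateless.exec("http")); // true -- no state carried between calls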
@@ -96,7 +96,7 @@ var Crawler = function(host,initialPath,initialPort,interval) {
// Matching MIME-types will be scanned for links
this.supportedMimeTypes = [
/^text\//i,
- /^application\/(rss)?[\+\/\-]?xml/i,
+ /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i,
/^application\/javascript/i,
/^xml/i
];
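The widened alternation means XHTML-style MIME types are now scanned for links, not just the RSS variant. A quick check of what the new pattern accepts, exercising only the regex itself:

var pattern = /^application\/(rss|html|xhtml)?[\+\/\-]?xml/i;

[ "application/xml",
  "application/rss+xml",
  "application/html+xml",  // newly matched by this change
  "application/xhtml+xml"  // newly matched by this change
].forEach(function(mimeType) {
    console.log(mimeType + ": " + pattern.test(mimeType)); // all true
});

console.log(pattern.test("application/octet-stream")); // false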
@@ -136,12 +136,14 @@ Crawler.prototype.start = function() {
this.running = true;
// Now kick off the initial crawl
- this.crawl();
+ process.nextTick(function() {
+ crawler.crawl();
+ });
};
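Deferring the initial crawl with `process.nextTick` means `start()` returns before any requests are made, so callers can still attach event listeners after calling it. A hedged usage sketch; the `fetchcomplete` event name is taken from simplecrawler's EventEmitter interface and should be treated as an assumption here:

var Crawler = require("simplecrawler");
var crawler = new Crawler("example.com", "/", 80);

crawler.start();

// Thanks to process.nextTick, the crawl hasn't actually begun yet, so
// this listener is in place before the first request fires. The event
// name ("fetchcomplete") is an assumption about the emitted events.
crawler.on("fetchcomplete", function(queueItem) {
    console.log("Fetched: " + queueItem.url);
});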
// Determines whether the protocol is supported, given a URL
Crawler.prototype.protocolSupported = function(URL) {
- var supported = false, protocol;
+ var protocol;
try {
protocol = URI(URL).protocol();
@@ -151,26 +153,19 @@ Crawler.prototype.protocolSupported = function(URL) {
return false;
}
- this.allowedProtocols.forEach(function(protocolCheck) {
- if (!!protocolCheck.exec(protocol)) {
- supported = true;
- }
- });
-
- return supported;
+ return this.allowedProtocols.reduce(function(prev,protocolCheck) {
+ return prev || !!protocolCheck.exec(protocol);
+ },false);
};
// Determines whether the mimetype is supported, given a... mimetype
Crawler.prototype.mimeTypeSupported = function(MIMEType) {
- var supported = false;
-
- this.supportedMimeTypes.forEach(function(mimeCheck) {
- if (!!mimeCheck.exec(MIMEType)) {
- supported = true;
- }
- });
-
- return supported;
+
+ return (
+ this.supportedMimeTypes.reduce(function(prev,mimeCheck) {
+ return prev || !!mimeCheck.exec(MIMEType);
+ },false)
+ );
};
// Takes a URL, and extracts the protocol, host, port, and resource
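Both refactors above fold the pattern list into a single boolean with `reduce` rather than mutating a `supported` flag inside `forEach`. `Array.prototype.some` expresses the same intent and also short-circuits on the first match; a minimal equivalent sketch (the helper name is illustrative):

// Equivalent to the reduce calls above: true once any pattern matches.
function anyPatternMatches(patterns, value) {
    return patterns.some(function(pattern) {
        return pattern.test(value);
    });
}

anyPatternMatches([/^http(s)?$/i, /^(rss|atom|feed)(\+xml)?$/i], "https"); // true
anyPatternMatches([/^http(s)?$/i], "gopher");                              // false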
2  package.json
@@ -1,7 +1,7 @@
{
"name": "simplecrawler",
"description": "Very straigntforward web crawler. Uses EventEmitter. Generates queue statistics and has a basic cache mechanism with extensible backend.",
- "version": "0.1.3",
+ "version": "0.1.4",
"homepage": "http://github.com/cgiffard/node-simplecrawler",
"author": "Christopher Giffard <christopher.giffard@cgiffard.com>",
"keywords": [
20 test/jshint.js
@@ -17,13 +17,29 @@ describe("Core code",function() {
"cli",
"crawler",
"index",
- "queue" ].forEach(function(item) {
+ "queue",
+ "quickcrawl" ].forEach(function(item) {
var code = readCode(item);
it("module `" + item + "` should pass JSHint with no errors",function() {
- JSHINT(code);
+ JSHINT(code,{
+ "indent": 4,
+ "undef": true
+ },
+ {
+ // Don't want no errant logging statements going to production!
+ // `console` has been deliberately omitted from this whitelist.
+
+ // All the regular node stuff
+ "require": true,
+ "module": true,
+ "process": true,
+ "setInterval": true,
+ "clearInterval": true,
+ "Buffer": true
+ });
if (JSHINT.errors.length) {
throw new Error(
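The three-argument form used in this test is JSHint's documented API: `JSHINT(source, options, globals)` returns a boolean and accumulates lint results on `JSHINT.errors`. A self-contained sketch of the same pattern (assumes the `jshint` package is installed; the sample source string is illustrative):

var JSHINT = require("jshint").JSHINT;

var source = "module.exports = function() { return process.pid; };";

// Second argument: lint options. Third: whitelisted globals --
// `console` is deliberately absent, mirroring the test above.
var clean = JSHINT(source,
    { "indent": 4, "undef": true },
    { "require": true, "module": true, "process": true });

if (!clean) {
    JSHINT.errors.forEach(function(error) {
        console.log("Line " + error.line + ": " + error.reason);
    });
}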
155 test/resourcevalidity.js
@@ -6,6 +6,21 @@ var chai = require("chai");
describe("Resource validity checker",function() {
+ it("should be able to determine whether a domain is in crawl scope",
+ function() {
+
+ var crawler = new (require("../"))("example.com",3000);
+
+ // The domain itself should be allowed.
+ crawler.domainValid("example.com").should.equal(true);
+
+ // Whereas other domains should not be allowed.
+ crawler.domainValid("somethingelse").should.equal(false);
+ crawler.domainValid("microsoft.com").should.equal(false);
+ crawler.domainValid("a.really.complex.fqdn.").should.equal(false);
+
+ });
+
it("should be able to determine whether a domain is a subdomain of another",
function() {
@@ -34,22 +49,138 @@ describe("Resource validity checker",function() {
});
-});
-
-describe("Link parser",function() {
+ it("should consider WWW domains and non-WWW domains alike by default",
+ function() {
+
+ var crawler = new (require("../"))("example.com",3000);
+
+ // Explicitly disallow crawling subdomains, important for this test
+ crawler.scanSubdomains = false;
+
+ // The domain itself isn't a subdomain per se, but should be allowed
+ crawler.domainValid("example.com").should.equal(true);
+
+ // Its WWW domain should be allowed by default
+ crawler.domainValid("www.example.com").should.equal(true);
+
+ });
+
+ it("should consider WWW domains and non-WWW domains as separate if requested",
+ function() {
+
+ var crawler = new (require("../"))("example.com",3000);
+
+ // Explicitly disallow crawling subdomains, important for this test
+ crawler.scanSubdomains = false;
+
+ // Explicitly consider www a separate subdomain (ordinarily, true)
+ crawler.ignoreWWWDomain = false;
+
+ // The domain itself isn't a subdomain per se, but should be allowed
+ crawler.domainValid("example.com").should.equal(true);
+
+ // Its WWW domain should be allowed by default
+ crawler.domainValid("www.example.com").should.equal(false);
+
+ });
+
+ it("should permit a specified set of domains based on the internal whitelist",
+ function() {
+
+ var crawler = new (require("../"))("example.com",3000);
+
+ // Add a few specific subdomains
+ crawler.domainWhitelist.push("foo.com");
+ crawler.domainWhitelist.push("bar.com");
+ crawler.domainWhitelist.push("abcdefg.net.nz");
+
+ // The domain itself isn't a subdomain per se, but should be allowed
+ crawler.domainValid("example.com").should.equal(true);
+
+ // The explicitly set domains should be permitted
+ crawler.domainValid("foo.com").should.equal(true);
+ crawler.domainValid("bar.com").should.equal(true);
+ crawler.domainValid("abcdefg.net.nz").should.equal(true);
+
+ // These domains were never whitelisted, and should be denied
+ crawler.domainValid("wumpus.com").should.equal(false);
+ crawler.domainValid("fish.net").should.equal(false);
+
+ });
- var crawler = new (require("../"))("127.0.0.1",3000);
+ it("should permit fetching of specified protocols based on internal whitelist",
+ function() {
+
+ var crawler = new (require("../"))("example.com",3000);
+
+ // Protocols supported by default
+ crawler.protocolSupported("http://google.com").should.equal(true);
+ crawler.protocolSupported("https://google.com").should.equal(true);
+ crawler.protocolSupported("rss://google.com").should.equal(true);
+ crawler.protocolSupported("feed://google.com").should.equal(true);
+ crawler.protocolSupported("atom://google.com").should.equal(true);
+
+ // Protocols not supported
+ crawler.protocolSupported("gopher://google.com").should.equal(false);
+ crawler.protocolSupported("ws://google.com").should.equal(false);
+ crawler.protocolSupported("wss://google.com").should.equal(false);
+ });
- it("should throw out junky or invalid URLs without dying",function() {
+ it("should permit parsing of specified resources based on mimetype checks",
+ function() {
- var urlContext = {
- "url": "http://www.example.com"
- };
+ this.supportedMimeTypes = [
+ /^text\//i,
+ /^application\/(rss)?[\+\/\-]?xml/i,
+ /^application\/javascript/i,
+ /^xml/i
+ ];
- crawler.processURL("",urlContext).should.equal(false);
- crawler.processURL("\n\n",urlContext).should.equal(false);
- crawler.processURL("ur34nfie4985:s////dsf/",urlContext).should.equal(false);
+ var crawler = new (require("../"))("example.com",3000);
+
+ // Mimetypes supported by default
+ crawler.mimeTypeSupported("text/plain").should.equal(true);
+
+ // Crawler should be able to process all plain-text formats
+ crawler.mimeTypeSupported("text/SomeFormat").should.equal(true);
+ crawler.mimeTypeSupported("text/html").should.equal(true);
+
+ // XML based formats
+ crawler.mimeTypeSupported("application/rss+xml").should.equal(true);
+ crawler.mimeTypeSupported("application/html+xml").should.equal(true);
+ crawler.mimeTypeSupported("application/xhtml+xml").should.equal(true);
+
+ // Some weird JS mimetypes
+ crawler.mimeTypeSupported("application/javascript").should.equal(true);
+
+ // Anything with XML...
+ crawler.mimeTypeSupported("xml/manifest").should.equal(true);
+
+ // And these should fail
+ crawler.mimeTypeSupported("application/octet-stream").should.equal(false);
+ crawler.mimeTypeSupported("img/png").should.equal(false);
+ crawler.mimeTypeSupported("video/webm").should.equal(false);
+ crawler.mimeTypeSupported("blah/blah").should.equal(false);
});
-});
+
+ describe("Link parser",function() {
+
+ var crawler = new (require("../"))("127.0.0.1",3000);
+
+ it("should throw out junky or invalid URLs without dying",function() {
+
+ var urlContext = {
+ "url": "http://www.example.com"
+ };
+
+ crawler.processURL("",urlContext).should.equal(false);
+ crawler.processURL("\n\n",urlContext).should.equal(false);
+ crawler.processURL("ur34nfie4985:s////dsf/",urlContext).should.equal(false);
+
+ });
+
+ });
+});
+
0  test/crawl.js → test/testcrawl.js
File renamed without changes