Permalink
Browse files

Updated jQuery to 1.8.3, fix+test always absolute a.href

  • Loading branch information...
sylvinus committed Dec 8, 2012
1 parent 8821db2 commit 2c45fb481ab141b95a8146e5ebd45cf9c8ff4ded
View
@@ -33,7 +33,7 @@ Crash course
"callback":function(error,result,$) {
// $ is a jQuery instance scoped to the server-side DOM of the page
- $("#content a:link").each(function(a) {
+ $("#content a").each(function(a) {
c.queue(a.href);
});
}
@@ -132,6 +132,13 @@ Rough todolist
ChangeLog
---------
+0.2.2
+ - Fix relative link bug, all a.href should be absolute when crawling a remote URL
+ - Updated default jQuery to 1.8.3
+
+0.2.1
+ - Updated jsdom to 0.2.19
+
0.2.0
- Updated code & dependencies for node 0.6/0.8, cleaned package.json
- Added a forceUTF8 mode
View
@@ -7,6 +7,7 @@ var http = require('http'),
jschardet = require('jschardet'),
Iconv = require('iconv').Iconv,
jsdom = require('jsdom'),
+ fs = require("fs"),
Pool = require('generic-pool').Pool;
@@ -18,10 +19,10 @@ exports.Crawler = function(options) {
self.options = _.extend({
timeout: 60000,
jQuery: true,
- jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.1.min.js"),
+ jQueryUrl: path.resolve(__dirname,"../vendor/jquery-1.8.3.min.js"),
maxConnections: 10,
priorityRange: 10,
- priority: 5,
+ priority: 5,
retries: 3,
forceUTF8: false,
retryTimeout: 10000,
@@ -65,7 +66,7 @@ exports.Crawler = function(options) {
if (queuedCount+plannedQueueCallsCount === 0) {
if (self.options.onDrain) self.options.onDrain();
}
- }
+ };
self.onDrain = function() {};
@@ -86,7 +87,7 @@ exports.Crawler = function(options) {
//If a query has already been made to self URL, don't callback again
if (cacheData) {
- // Make sure we actually have cached data, and not just a note
+ // Make sure we actually have cached data, and not just a note
// that the page was already crawled
if (_.isArray(cacheData)) {
self.onContent(null,opts,cacheData[0],true);
@@ -179,7 +180,7 @@ exports.Crawler = function(options) {
self.cache[toQueue.uri] = [response];
//If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
- } else if (toQueue.skipDuplicates) {
+ } else if (toQueue.skipDuplicates) {
self.cache[toQueue.uri] = true;
}
}
@@ -190,19 +191,45 @@ exports.Crawler = function(options) {
if (toQueue.jQuery && toQueue.method!="HEAD") {
- // TODO support for non-HTML content
+ // TODO support for non-HTML content
// https://github.com/joshfire/node-crawler/issues/9
try {
- jsdom.env(response.body,[toQueue.jQueryUrl],function(errors,window) {
- if (errors) {
- toQueue.callback(errors);
- } else {
- response.window = window;
- toQueue.callback(null,response,window.jQuery);
- }
-
- release(toQueue);
- });
+ var jsd = function(src) {
+ jsdom.env({
+ "url":toQueue.uri,
+ "html":response.body,
+ "src":src,
+ "done":function(errors,window) {
+
+ if (errors) {
+ toQueue.callback(errors);
+ } else {
+ response.window = window;
+ toQueue.callback(null,response,window.jQuery);
+ }
+
+ release(toQueue);
+ }
+ });
+ };
+
+ // jsdom doesn't support adding local scripts,
+ // so we have to read jQuery from the local fs
+ if (toQueue.jQueryUrl.match(/^(file\:\/\/|\/)/)) {
+
+ // TODO cache this
+ fs.readFile(toQueue.jQueryUrl.replace(/^file\:\/\//,""),"utf-8",function(err,jq) {
+ if (err) {
+ toQueue.callback(err);
+ release(toQueue);
+ } else {
+ jsd([jq]);
+ }
+ });
+ } else {
+ jsd([toQueue.jQueryUrl]);
+ }
+
} catch (e) {
toQueue.callback(e);
release(toQueue);
@@ -212,7 +239,7 @@ exports.Crawler = function(options) {
toQueue.callback(null,response);
release(toQueue);
- }
+ }
};
@@ -280,7 +307,7 @@ exports.Crawler = function(options) {
}
},toQueue.priority);
- }
+ };
};
View
@@ -1,6 +1,6 @@
{
"name": "crawler",
- "version": "0.2.1",
+ "version": "0.2.2",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously.",
"keywords": [
"dom",
@@ -0,0 +1,7 @@
+<html>
+<body>
+ <a href="links2.html">Relative link</a>
+
+ <a href="/mockfiles/links2.html">Absolute link</a>
+</body>
+</html>
@@ -0,0 +1,5 @@
+<html>
+<body>
+ <a href="links1.html">Relative link</a>
+</body>
+</html>
View
@@ -25,4 +25,14 @@ app.get('/close/destroy', function(req, res){
res.end();
});
-exports.app = app;
+
+app.get('/mockfiles/*', function(req, res){
+ res.sendfile("test/mockfiles/"+req.param(0));
+});
+
+
+exports.app = app;
+
+if (require.main === module) {
+ app.listen(8080);
+}
View
@@ -0,0 +1,23 @@
+0 info it worked if it ends with ok
+1 verbose cli [ 'node', '/usr/local/bin/npm', 'test' ]
+2 info using npm@1.1.45
+3 info using node@v0.8.4
+4 verbose node symlink /usr/local/bin/node
+5 verbose config file /Users/sylvinus/.npmrc
+6 verbose config file /usr/local/etc/npmrc
+7 verbose config file /usr/local/lib/node_modules/npm/npmrc
+8 verbose read json /Users/sylvinus/w/sz/node-crawler/test/package.json
+9 error Error: ENOENT, open '/Users/sylvinus/w/sz/node-crawler/test/package.json'
+10 error If you need help, you may report this log at:
+10 error <http://github.com/isaacs/npm/issues>
+10 error or email it to:
+10 error <npm-@googlegroups.com>
+11 error System Darwin 12.2.0
+12 error command "node" "/usr/local/bin/npm" "test"
+13 error cwd /Users/sylvinus/w/sz/node-crawler/test
+14 error node -v v0.8.4
+15 error npm -v 1.1.45
+16 error path /Users/sylvinus/w/sz/node-crawler/test/package.json
+17 error code ENOENT
+18 error errno 34
+19 verbose exit [ 34, true ]
View
@@ -16,9 +16,12 @@ testrunner.run([
{
code: path + "/lib/crawler.js",
tests: [
+ path + "/test/units/links.js",
+
path + "/test/units/forceutf8.js",
path + "/test/units/simple.js",
path + "/test/units/errors.js"
+
]
}
],function() {
View
@@ -0,0 +1,41 @@
+var Crawler = require("../../lib/crawler").Crawler;
+var _ = require("underscore");
+
+QUnit.module("links");
+
+var DEBUG = true;
+var MOCKPORT = 30045;
+
+
+test("links resolve to absolute urls", function() {
+ expect( 2 );
+
+ stop();
+
+ var c = new Crawler({
+ "debug":DEBUG,
+ "timeout":500,
+ "retryTimeout":1000,
+ "retries":1,
+ "onDrain":function() {
+ start();
+ }
+ });
+
+ c.queue([{
+ "uri":"http://127.0.0.1:"+MOCKPORT+"/mockfiles/links1.html",
+ "callback":function(error,result,$) {
+
+ var links = _.map($("a"),function(a) {
+ return a.href;
+ });
+
+ //Both links should resolve to absolute URLs
+ equal(links[0],"http://127.0.0.1:30045/mockfiles/links2.html");
+ equal(links[1],"http://127.0.0.1:30045/mockfiles/links2.html");
+
+ }
+ }]);
+
+
+});

Large diffs are not rendered by default.

Oops, something went wrong.

Large diffs are not rendered by default.

Oops, something went wrong.

0 comments on commit 2c45fb4

Please sign in to comment.