Skip to content
This repository has been archived by the owner on Mar 7, 2021. It is now read-only.

Commit

Permalink
Add option to sort Query parameters
Browse files Browse the repository at this point in the history
The URLs `http://www.example.com/test.jsp?a=1&b=2` and `http://www.example.com/test.jsp?a=2&b=1` in most cases will return the same content and we want to consider them the same URL. We can canonicalize this URL by sorting the parameters. urijs does this in the [equals()](http://medialize.github.io/URI.js/docs.html#equals) method, but normalize() does not sort the parameters.

I added an option to have the crawler sort the query parameters before queueing URLs.
  • Loading branch information
Lexmark-haputman committed May 31, 2017
1 parent 8ab9962 commit 60b9cde
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Expand Up @@ -295,6 +295,9 @@ change to adapt it to your specific needs.
Or go even further and strip WWW subdomain from requests altogether!
* `crawler.stripQuerystring=false` -
Specify to strip querystring parameters from URL's.
* `crawler.sortQueryParameters=false` -
Specify to sort the querystring parameters before queueing URLs. This is
to canonicalize URLs so that foo?a=1&b=2 is considered the same as foo?b=2&a=1.
* `crawler.discoverResources` -
simplecrawler's default resource discovery function -
which, given a buffer containing a resource, returns an array of URLs.
Expand Down
18 changes: 18 additions & 0 deletions lib/crawler.js
Expand Up @@ -308,6 +308,13 @@ var Crawler = function(initialURL) {
*/
this.stripQuerystring = false;

/**
 * Controls whether query string parameters are sorted alphabetically by
 * key at queue item construction time. This canonicalizes URLs so that
 * e.g. foo?a=1&b=2 and foo?b=2&a=1 map to the same queue item.
 * Disabled by default to preserve the original parameter order.
 * @type {Boolean}
 */
this.sortQueryParameters = false;

/**
* Collection of regular expressions and functions that are applied in the
* default {@link Crawler#discoverResources} method.
Expand Down Expand Up @@ -748,6 +755,17 @@ Crawler.prototype.processURL = function(url, referrer) {
url = uri(url).search("").href();
}

// Canonicalize the URL by rewriting its query string with the
// parameter keys in alphabetical order.
if (crawler.sortQueryParameters === true) {
    url = uri(url).query(function(data) {
        var sortedData = {};
        var sortedKeys = Object.keys(data).sort();
        for (var i = 0; i < sortedKeys.length; i++) {
            sortedData[sortedKeys[i]] = data[sortedKeys[i]];
        }
        // NOTE(review): when a key repeats (e.g. ?r=9&r=2), urijs supplies
        // its values as an array and they keep their original order — only
        // the keys are sorted, so ?r=2&r=9 and ?r=9&r=2 still differ.
        return sortedData;
    }).href();
}

if (crawler.stripWWWDomain && url.match(/https?:\/\/(www\.).*/i)) {
url = url.replace("www.", "");
}
Expand Down
11 changes: 11 additions & 0 deletions test/queue.js
Expand Up @@ -329,4 +329,15 @@ describe("Queue methods", function() {

crawler.start();
});

it("Doesn't queue URL with reordered query parameters", function(done) {
    var crawler = new Crawler("http://127.0.0.1:3000");
    crawler.sortQueryParameters = true;

    // The two URLs differ only in parameter order, so with sorting
    // enabled they canonicalize identically and only one item is queued.
    crawler.queueURL("http://127.0.0.1:3000/sample.jsp?a=1&b=2");
    crawler.queueURL("http://127.0.0.1:3000/sample.jsp?b=2&a=1");

    crawler.queue.getLength(function(error, length) {
        // Surface queue errors instead of silently ignoring them; an
        // ignored error would otherwise make this assertion meaningless.
        if (error) {
            return done(error);
        }
        length.should.equal(1);
        done();
    });
});
});
14 changes: 14 additions & 0 deletions test/resourcevalidity.js
Expand Up @@ -141,6 +141,20 @@ describe("Resource validity checker", function() {
crawler.processURL("http://example.com/test?q=crawler&foo=bar").path.should.equal("/test?q=crawler&foo=bar");
});

it("should canonicalize query strings by sorting parameters", function() {

    // Same two input URLs exercised with sorting on and off.
    var repeatedKeyURL = "http://example.com/example?s=1&r=9&b=3&r=2&r=7";
    var simpleURL = "http://example.com/test?q=crawler&foo=bar";

    var crawler = makeCrawler("http://example.com");

    // Sorting enabled: keys come back in alphabetical order (values of a
    // repeated key keep their original relative order).
    crawler.sortQueryParameters = true;
    crawler.processURL(repeatedKeyURL).path.should.equal("/example?b=3&r=9&r=2&r=7&s=1");
    crawler.processURL(simpleURL).path.should.equal("/test?foo=bar&q=crawler");

    // Sorting disabled: urijs normalize() still rearranges the query,
    // grouping parameters that share a name.
    crawler.sortQueryParameters = false;
    crawler.processURL(repeatedKeyURL).path.should.equal("/example?s=1&r=9&r=2&r=7&b=3");
    crawler.processURL(simpleURL).path.should.equal("/test?q=crawler&foo=bar");
});

it("should throw out junky or invalid URLs without dying", function() {

var crawler = makeCrawler("http://127.0.0.1:3000");
Expand Down

0 comments on commit 60b9cde

Please sign in to comment.