Skip to content

Commit

Permalink
WebCrawler sample
Browse files Browse the repository at this point in the history
  • Loading branch information
ajlopez committed Aug 30, 2014
1 parent efc3587 commit d145884
Showing 1 changed file with 75 additions and 0 deletions.
75 changes: 75 additions & 0 deletions samples/webcrawler/run.js
@@ -0,0 +1,75 @@

var sm = require('../..');
var url = require('url');
var http = require('http');

// Patterns that capture the value of an href attribute (case-insensitive,
// global so exec() can be called repeatedly over one document). The capture
// group excludes '&', so attribute values containing '&' do not match.
var match1 = /href=\s*"([^&"]*)"/ig;    // double-quoted values
var match2 = /href=\s*'([^&']*)'/ig;    // single-quoted values

// Host name the crawl is restricted to; undefined until it is assigned.
var hostname;

// Bookkeeping list for crawled links; starts out empty.
var visited = [];

/**
 * Posts to the flow every link found by `pattern` in `html` that starts
 * with 'http:'.
 *
 * @param {RegExp} pattern - global regex whose first capture group is the
 *     href value. exec() is repeated until it returns null, which also
 *     rewinds lastIndex so the shared regex can be reused on the next page.
 * @param {string} html - downloaded page content to scan.
 */
function postLinks(pattern, html) {
    var found;

    while ((found = pattern.exec(html)) !== null) {
        var link = found[1];

        // Only absolute http: links are followed; relative links are skipped.
        if (link.indexOf('http:') === 0)
            flow.post(link);
    }
}

/**
 * Crawler pipeline: filter link -> download page -> extract and re-post links.
 * A stage drops a message by simply not calling next().
 */
var flow = sm.flow()
    // Stage 1: keep only not-yet-visited links on the host of the first link.
    .transform(function (link, next) {
        var urldata = url.parse(link);

        // The first link seen fixes the host the crawl is restricted to.
        if (!hostname)
            hostname = urldata.hostname;

        if (hostname !== urldata.hostname)
            return;

        // Without this check, pages that link to each other would be
        // downloaded over and over.
        if (visited.indexOf(link) >= 0)
            return;

        visited.push(link);
        next(null, link);
    })
    // Stage 2: download the page and pass its body on as a string.
    .transform(function (link, next) {
        console.log('downloading', link);

        var urldata = url.parse(link);

        // Declared locally; previously this assignment leaked a global.
        var options = {
            host: urldata.hostname,
            port: urldata.port,
            path: urldata.path,
            method: 'GET'
        };

        http.get(options, function (res) {
            var result = '';
            console.log('Url: ' + link);
            res.setEncoding('utf8');
            res.on('data', function (data) {
                result += data.toString();
            });
            res.on('end', function () {
                next(null, result);
            });
        }).on('error', function (e) {
            console.log('Url: ' + link);
            console.log('Error: ' + e.message);
            next(e, null);
        });
    })
    // Stage 3: feed every http: link found in the page back into the flow.
    .output(function (data, next) {
        postLinks(match1, data);
        postLinks(match2, data);
    });

// Start the crawl from the URL given on the command line.
var startUrl = process.argv[2];

if (!startUrl) {
    console.error('Usage: node run.js <start-url>');
    process.exit(1);
}

flow.post(startUrl);

0 comments on commit d145884

Please sign in to comment.