This repository has been archived by the owner on Mar 7, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 354
/
quickcrawl.js
73 lines (57 loc) · 2.07 KB
/
quickcrawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
var Crawler = require("./crawler.js"),
URI = require("URIjs");
/*
Public: Convenience function for really quick, simple crawls. It generates
a new crawler, parses the URL provided, and sets up the new crawler with
the host and path information extracted from the URL. It returns the crawler
object, so you can set up event handlers, and waits until `process.nextTick`
before kicking off the crawl.
url - URL to begin crawl from.
successCallback - Optional function called once an item is completely
downloaded. Functionally identical to a fetchcomplete
event listener.
failCallback - Optional function to be called if an item fails to
download. Functionally identical to a fetcherror
event listener.
Examples
Crawler.crawl(
"http://example.com:3000/start",
function(queueItem,data) {
console.log("I got a new item!");
}
);
Crawler
.crawl("http://www.example.com/")
.on("fetchstart",function(queueItem) {
console.log("Beginning fetch for",queueItem.url);
});
Returns the new Vixen object which has now been constructed.
*/
module.exports = function crawl(url,successCallback,failCallback) {
// Parse the URL first
url = URI(url);
// If either the protocol, path, or hostname are unset, we can't really
// do much. Die with error.
if (!url.protocol())
throw new Error("Can't crawl with unspecified protocol.");
if (!url.hostname())
throw new Error("Can't crawl with unspecified hostname.");
if (!url.path())
throw new Error("Can't crawl with unspecified path.");
var tmpCrawler =
new Crawler(
url.hostname(),
url.path(),
url.port() || 80);
// Attach callbacks if they were provided
if (successCallback) tmpCrawler.on("fetchcomplete",successCallback);
if (failCallback) tmpCrawler.on("fetcherror",failCallback);
// Start the crawler on the next runloop
// This enables initial configuration options and event handlers to take
// effect before the first resource is queued.
process.nextTick(function() {
tmpCrawler.start();
});
// Return crawler
return tmpCrawler;
};