diff --git a/CHANGES.txt b/CHANGES.txt
index 8db0d086dd..988c6f1bfd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus)
+
 * NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)
 
 * NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 3b67f370e4..3ffac7dca4 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -76,11 +76,11 @@
 
   NOTE: You should also check other related properties:
 
-	http.robots.agents
-	http.agent.description
-	http.agent.url
-	http.agent.email
-	http.agent.version
+  http.robots.agents
+  http.agent.description
+  http.agent.url
+  http.agent.email
+  http.agent.version
 
   and set their values appropriately.
 
@@ -345,7 +345,6 @@
 </description>
 </property>
 
-
 <property>
   <name>db.fetch.interval.default</name>
   <value>2592000</value>
@@ -449,6 +448,18 @@
 </description>
 </property>
 
+<property>
+  <name>db.url.normalizers</name>
+  <value>false</value>
+  <description>Normalize urls when updating crawldb</description>
+</property>
+
+<property>
+  <name>db.url.filters</name>
+  <value>false</value>
+  <description>Filter urls when updating crawldb</description>
+</property>
+
 <property>
   <name>db.update.max.inlinks</name>
   <value>10000</value>
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index cf34143aaf..40e888b9f8 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -114,7 +114,9 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean fil
     long end = System.currentTimeMillis();
     LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
   }
-
+/*
+ * Configure a new CrawlDb in a temp folder at crawlDb/
+ */
   public static JobConf createJob(Configuration config, Path crawlDb)
     throws IOException {
     Path newCrawlDb =
@@ -180,12 +182,11 @@ public int run(String[] args) throws Exception {
       return -1;
     }
 
-    boolean normalize = false;
-    boolean filter = false;
+    boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
+    boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
     boolean force = false;
-    boolean url404Purging = false;
     final FileSystem fs = FileSystem.get(getConf());
-    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
     HashSet dirs = new HashSet();
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-normalize")) {
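
Note on usage: with this patch, CrawlDb.run() seeds normalize and filter from the
configuration (via the CrawlDbFilter.URL_NORMALIZING and CrawlDbFilter.URL_FILTERING
keys) instead of hard-coding both to false, and the -normalize and -filter flags
parsed later in run() can still switch either on for a single update. Since the new
nutch-default.xml entries default to false, existing crawls are unaffected unless a
site explicitly opts in. A minimal sketch of how an operator might enable both
globally, assuming the standard nutch-site.xml override of nutch-default.xml
(the property names are the ones this patch adds; the true values are illustrative):

<configuration>
  <property>
    <name>db.url.normalizers</name>
    <!-- normalize urls when updating crawldb (patch default: false) -->
    <value>true</value>
  </property>
  <property>
    <name>db.url.filters</name>
    <!-- filter urls when updating crawldb (patch default: false) -->
    <value>true</value>
  </property>
</configuration>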