diff --git a/CHANGES.txt b/CHANGES.txt
index 8db0d086dd..988c6f1bfd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus)
+
* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)
* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 3b67f370e4..3ffac7dca4 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -76,11 +76,11 @@
NOTE: You should also check other related properties:
- http.robots.agents
- http.agent.description
- http.agent.url
- http.agent.email
- http.agent.version
+ http.robots.agents
+ http.agent.description
+ http.agent.url
+ http.agent.email
+ http.agent.version
and set their values appropriately.
@@ -345,7 +345,6 @@
-
db.fetch.interval.default
2592000
@@ -449,6 +448,18 @@
+
+ db.url.normalizers
+ false
+ Normalize URLs when updating the CrawlDb
+
+
+
+ db.url.filters
+ false
+ Filter URLs when updating the CrawlDb
+
+
db.update.max.inlinks
10000
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index cf34143aaf..40e888b9f8 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -114,7 +114,9 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean fil
long end = System.currentTimeMillis();
LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
-
+  /**
+   * Creates a job configuration for a new CrawlDb in a temporary folder under crawlDb.
+   */
public static JobConf createJob(Configuration config, Path crawlDb)
throws IOException {
Path newCrawlDb =
@@ -180,12 +182,11 @@ public int run(String[] args) throws Exception {
return -1;
}
- boolean normalize = false;
- boolean filter = false;
+ boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
+ boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
boolean force = false;
- boolean url404Purging = false;
final FileSystem fs = FileSystem.get(getConf());
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
HashSet dirs = new HashSet();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-normalize")) {