Skip to content

Commit

Permalink
NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1597556 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
Markus Jelsma committed May 26, 2014
1 parent 7089e15 commit 38bfb64
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -2,6 +2,8 @@ Nutch Change Log

Nutch Current Development

* NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus)

* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)

* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
Expand Down
23 changes: 17 additions & 6 deletions conf/nutch-default.xml
Expand Up @@ -76,11 +76,11 @@

NOTE: You should also check other related properties:

http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version
http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version

and set their values appropriately.

Expand Down Expand Up @@ -345,7 +345,6 @@
</property>

<!-- web db properties -->

<property>
<name>db.fetch.interval.default</name>
<value>2592000</value>
Expand Down Expand Up @@ -449,6 +448,18 @@
</description>
</property>

<property>
<name>db.url.normalizers</name>
<value>false</value>
  <description>Whether to normalize URLs when updating the CrawlDb.</description>
</property>

<property>
<name>db.url.filters</name>
<value>false</value>
  <description>Whether to filter URLs when updating the CrawlDb.</description>
</property>

<property>
<name>db.update.max.inlinks</name>
<value>10000</value>
Expand Down
11 changes: 6 additions & 5 deletions src/java/org/apache/nutch/crawl/CrawlDb.java
Expand Up @@ -114,7 +114,9 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean fil
long end = System.currentTimeMillis();
LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

/*
* Configure a new CrawlDb in a temp folder at crawlDb/<rand>
*/
public static JobConf createJob(Configuration config, Path crawlDb)
throws IOException {
Path newCrawlDb =
Expand Down Expand Up @@ -180,12 +182,11 @@ public int run(String[] args) throws Exception {

return -1;
}
boolean normalize = false;
boolean filter = false;
boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
boolean force = false;
boolean url404Purging = false;
final FileSystem fs = FileSystem.get(getConf());
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
HashSet<Path> dirs = new HashSet<Path>();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-normalize")) {
Expand Down

0 comments on commit 38bfb64

Please sign in to comment.