Skip to content

Commit

Permalink
NUTCH-105 - Network error during robots.txt fetch causes file to be ig…
Browse files Browse the repository at this point in the history
…nored, contributed by Greg Kim

git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/branches/branch-0.8@447867 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
siren committed Sep 19, 2006
1 parent e0d7405 commit 97b3517
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Expand Up @@ -22,6 +22,9 @@ Unreleased changes (0.8.1)
7. NUTCH-338 - Remove the text parser as an option for parsing PDF files
in parse-plugins.xml (Chris A. Mattmann via siren)

8. NUTCH-105 - Network error during robots.txt fetch causes file to
   be ignored (Greg Kim via siren)

Release 0.8 - 2006-07-25

0. Totally new architecture, based on hadoop
Expand Down
Expand Up @@ -420,6 +420,8 @@ private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {

RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);

boolean cacheRule = true;

if (robotRules == null) { // cache miss
if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
try {
Expand All @@ -430,16 +432,22 @@ private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {
robotRules = parseRules(response.getContent());
else if ( (response.getCode() == 403) && (!allowForbidden) )
robotRules = FORBID_ALL_RULES; // use forbid all
else
else if (response.getCode() >= 500) {
cacheRule = false;
robotRules = EMPTY_RULES;
}else
robotRules = EMPTY_RULES; // use default rules
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
cacheRule = false;
robotRules = EMPTY_RULES;
}

CACHE.put(host, robotRules); // cache rules for host
if (cacheRule){
CACHE.put(host, robotRules); // cache rules for host
}
}
return robotRules;
}
Expand Down

0 comments on commit 97b3517

Please sign in to comment.