Skip to content

Commit

Permalink
NUTCH-921 Reduce dependency of Nutch on config files.
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/nutch/branches/branch-1.3@1079760 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
sigram committed Mar 9, 2011
1 parent c3b52bb commit 488aba5
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 58 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ Nutch Change Log

Release 1.3 - Current Development

* NUTCH-921 Reduce dependency of Nutch on config files (ab)

* NUTCH-876 Remove remaining robots/IP blocking code in lib-http (ab)

* NUTCH-872 Change the default fetcher.parse to FALSE (ab)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
package org.apache.nutch.urlfilter.api;

// JDK imports
import java.io.File;
import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;

Expand Down Expand Up @@ -74,18 +76,29 @@ public RegexURLFilterBase() { }
* Constructs a new RegexURLFilter and inits it with a file of rules.
* @param filename is the file containing the rules.
*/
public RegexURLFilterBase(String filename)
public RegexURLFilterBase(File filename)
throws IOException, IllegalArgumentException {
this(new FileReader(filename));
}

/**
* Constructs a new RegexURLFilter and inits it with a list of rules held
* in memory. The string is wrapped in a {@link StringReader} and passed to
* the Reader-based constructor.
* <p>
* Note that a String argument is treated as the rule text itself, not as
* the name of a rules file; use the {@link File} constructor to load rules
* from a file.
* @param rules string with a list of rules, one rule per line
* @throws IOException declared by the Reader-based constructor this
* delegates to.
* @throws IllegalArgumentException declared by the Reader-based constructor
* this delegates to; presumably raised for a rule that cannot be parsed
* (TODO confirm against readRules).
*/
public RegexURLFilterBase(String rules) throws IOException,
IllegalArgumentException {
this(new StringReader(rules));
}

/**
* Constructs a new RegexURLFilter and inits it with a Reader of rules.
* @param reader is a reader of rules.
*/
protected RegexURLFilterBase(Reader reader)
throws IOException, IllegalArgumentException {
rules = readRulesFile(reader);
rules = readRules(reader);
}

/**
Expand All @@ -102,9 +115,9 @@ protected RegexURLFilterBase(Reader reader)
* Returns a reader over the rules to use for
* a particular implementation.
* @param conf is the current configuration.
* @return the name of the file of rules to use.
* @return a reader over the rules to use.
*/
protected abstract String getRulesFile(Configuration conf);
protected abstract Reader getRulesReader(Configuration conf) throws IOException;


/* -------------------------- *
Expand Down Expand Up @@ -132,18 +145,18 @@ public synchronized String filter(String url) {

public void setConf(Configuration conf) {
this.conf = conf;
String file = getRulesFile(conf);
Reader reader = conf.getConfResourceAsReader(file);
if (reader == null) {
if (LOG.isFatalEnabled()) { LOG.fatal("Can't find resource: " + file); }
} else {
try {
rules = readRulesFile(reader);
} catch (IOException e) {
if (LOG.isFatalEnabled()) { LOG.fatal(e.getMessage()); }
//TODO mb@media-style.com: throw Exception? Because broken api.
throw new RuntimeException(e.getMessage(), e);
}
Reader reader = null;
try {
reader = getRulesReader(conf);
} catch (Exception e) {
if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
throw new RuntimeException(e.getMessage(), e);
}
try {
rules = readRules(reader);
} catch (IOException e) {
if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
throw new RuntimeException(e.getMessage(), e);
}
}

Expand All @@ -161,7 +174,7 @@ public Configuration getConf() {
* @param reader is a reader of regular expressions rules.
* @return the corresponding {@link RegexRule rules}.
*/
private RegexRule[] readRulesFile(Reader reader)
private RegexRule[] readRules(Reader reader)
throws IOException, IllegalArgumentException {

BufferedReader in = new BufferedReader(reader);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// JDK imports
import java.io.Reader;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.PatternSyntaxException;

// Hadoop imports
Expand All @@ -41,6 +42,8 @@
* @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
*/
public class AutomatonURLFilter extends RegexURLFilterBase {
public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file";
public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules";

public AutomatonURLFilter() {
super();
Expand All @@ -61,9 +64,17 @@ public AutomatonURLFilter(String filename)
* <implementation:RegexURLFilterBase> *
* ----------------------------------- */

// Inherited Javadoc
protected String getRulesFile(Configuration conf) {
return conf.get("urlfilter.automaton.file");
/**
 * Supplies the reader from which the automaton rules are loaded.
 * Rules specified inline through the {@link #URLFILTER_AUTOMATON_RULES}
 * config property override rules stored in the config resource named by
 * {@link #URLFILTER_AUTOMATON_FILE}.
 *
 * @param conf is the current configuration.
 * @return a reader over the rules to use; never <code>null</code>.
 * @throws IOException if no inline rules are set and the rules resource is
 *         either not configured or cannot be found.
 */
protected Reader getRulesReader(Configuration conf) throws IOException {
  String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
  if (stringRules != null) {
    return new StringReader(stringRules);
  }
  String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
  if (fileRules == null) {
    throw new IOException("Neither " + URLFILTER_AUTOMATON_RULES + " nor "
        + URLFILTER_AUTOMATON_FILE + " is set");
  }
  Reader reader = conf.getConfResourceAsReader(fileRules);
  if (reader == null) {
    // Report the missing resource by name here; a null reader would
    // otherwise only surface later as a NullPointerException when the
    // caller wraps it in a BufferedReader.
    throw new IOException("Can't find resource: " + fileRules);
  }
  return reader;
}

// Inherited Javadoc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.Set;

Expand Down Expand Up @@ -70,7 +71,7 @@ public class DomainURLFilter
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();

private void readConfigurationFile(Reader configReader)
private void readConfiguration(Reader configReader)
throws IOException {

// read the configuration file, line by line
Expand Down Expand Up @@ -140,21 +141,24 @@ public void setConf(Configuration conf) {

// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domain.file");
String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
}
else if (attributeFile != null) {
file = attributeFile;
}

// get the file as a classpath resource and populate the domain set with
// the domains from the file
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
reader = conf.getConfResourceAsReader(file);
}
try {
Reader reader = conf.getConfResourceAsReader(file);
if (reader == null) {
reader = new FileReader(file);
}
readConfigurationFile(reader);
readConfiguration(reader);
}
catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;

import java.util.List;
import java.util.ArrayList;
Expand Down Expand Up @@ -63,8 +64,8 @@ public PrefixURLFilter() throws IOException {

}

public PrefixURLFilter(String filename) throws IOException {
trie = readConfigurationFile(new FileReader(filename));
public PrefixURLFilter(String stringRules) throws IOException {
trie = readConfiguration(new StringReader(stringRules));
}

public String filter(String url) {
Expand All @@ -74,7 +75,7 @@ public String filter(String url) {
return url;
}

private TrieStringMatcher readConfigurationFile(Reader reader)
private TrieStringMatcher readConfiguration(Reader reader)
throws IOException {

BufferedReader in=new BufferedReader(reader);
Expand Down Expand Up @@ -144,16 +145,22 @@ public void setConf(Configuration conf) {
}

String file = conf.get("urlfilter.prefix.file");
String stringRules = conf.get("urlfilter.prefix.rules");
// attribute "file" takes precedence if defined
if (attributeFile != null)
file = attributeFile;
Reader reader = conf.getConfResourceAsReader(file);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
reader = conf.getConfResourceAsReader(file);
}

if (reader == null) {
trie = new PrefixStringMatcher(new String[0]);
} else {
try {
trie = readConfigurationFile(reader);
trie = readConfiguration(reader);
} catch (IOException e) {
if (LOG.isFatalEnabled()) { LOG.fatal(e.getMessage()); }
// TODO mb@media-style.com: throw Exception? Because broken api.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// JDK imports
import java.io.Reader;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

Expand All @@ -35,6 +36,9 @@
* {@link java.util.regex Java Regex implementation}.
*/
public class RegexURLFilter extends RegexURLFilterBase {

public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";

public RegexURLFilter() {
super();
Expand All @@ -55,9 +59,17 @@ public RegexURLFilter(String filename)
* <implementation:RegexURLFilterBase> *
* ----------------------------------- */

// Inherited Javadoc
protected String getRulesFile(Configuration conf) {
return conf.get("urlfilter.regex.file");
/**
 * Supplies the reader from which the regex rules are loaded.
 * Rules specified inline through the {@link #URLFILTER_REGEX_RULES}
 * config property override rules stored in the config resource named by
 * {@link #URLFILTER_REGEX_FILE}.
 *
 * @param conf is the current configuration.
 * @return a reader over the rules to use; never <code>null</code>.
 * @throws IOException if no inline rules are set and the rules resource is
 *         either not configured or cannot be found.
 */
protected Reader getRulesReader(Configuration conf) throws IOException {
  String stringRules = conf.get(URLFILTER_REGEX_RULES);
  if (stringRules != null) {
    return new StringReader(stringRules);
  }
  String fileRules = conf.get(URLFILTER_REGEX_FILE);
  if (fileRules == null) {
    throw new IOException("Neither " + URLFILTER_REGEX_RULES + " nor "
        + URLFILTER_REGEX_FILE + " is set");
  }
  Reader reader = conf.getConfResourceAsReader(fileRules);
  if (reader == null) {
    // Report the missing resource by name here; a null reader would
    // otherwise only surface later as a NullPointerException when the
    // caller wraps it in a BufferedReader.
    throw new IOException("Can't find resource: " + fileRules);
  }
  return reader;
}

// Inherited Javadoc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;

import java.util.List;
import java.util.ArrayList;
Expand Down Expand Up @@ -139,7 +140,7 @@ public SuffixURLFilter() throws IOException {
}

public SuffixURLFilter(Reader reader) throws IOException {
readConfigurationFile(reader);
readConfiguration(reader);
}

public String filter(String url) {
Expand Down Expand Up @@ -167,7 +168,7 @@ public String filter(String url) {
}
}

public void readConfigurationFile(Reader reader) throws IOException {
public void readConfiguration(Reader reader) throws IOException {

// handle missing config file
if (reader == null) {
Expand Down Expand Up @@ -269,12 +270,18 @@ public void setConf(Configuration conf) {
}

String file = conf.get("urlfilter.suffix.file");
String stringRules = conf.get("urlfilter.suffix.rules");
// attribute "file" takes precedence if defined
if (attributeFile != null) file = attributeFile;
Reader reader = conf.getConfResourceAsReader(file);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
reader = conf.getConfResourceAsReader(file);
}

try {
readConfigurationFile(reader);
readConfiguration(reader);
} catch (IOException e) {
if (LOG.isFatalEnabled()) { LOG.fatal(e.getMessage()); }
throw new RuntimeException(e.getMessage(), e);
Expand Down
Loading

0 comments on commit 488aba5

Please sign in to comment.