NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file #526

Merged
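This PR removes the rule-file constructors from RegexParseFilter, DomainURLFilter, and DomainBlacklistURLFilter and reads the rule file from the job configuration instead, so an override set on the configuration (for example via a command-line property) actually reaches the plugin. A minimal sketch of the new usage, assuming the property names touched in this diff; file paths are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class RuleFileOverrideSketch {
  public static void main(String[] args) {
    // Build a Nutch configuration; a real job would obtain this from its context.
    Configuration conf = NutchConfiguration.create();

    // After this change the plugins read their rule file from these properties
    // in setConf(), so setting them is all an override needs.
    conf.set("parsefilter.regex.file", "/path/to/regex-parsefilter.txt");
    conf.set("urlfilter.domain.file", "/path/to/domain-rules.txt");
    conf.set("urlfilter.domainblacklist.file", "/path/to/domain-blacklist.txt");
  }
}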
@@ -51,20 +51,11 @@ public class RegexParseFilter implements HtmlParseFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static String attributeFile = null;
private String regexFile = null;

private Configuration conf;

private static final Map<String,RegexRule> rules = new HashMap<>();

public RegexParseFilter() {
//default constructor
}

public RegexParseFilter(String regexFile) {
this.regexFile = regexFile;
}

public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
String html = new String(content.getContent());
@@ -129,15 +120,8 @@ public void setConf(Configuration conf) {
}
}

// domain file and attribute "file" take precedence if defined
String file = conf.get("parsefilter.regex.file");
String file = conf.get("parsefilter.regex.file", attributeFile);
String stringRules = conf.get("parsefilter.regex.rules");
if (regexFile != null) {
file = regexFile;
}
else if (attributeFile != null) {
file = attributeFile;
}
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
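The core of the change above is the two-argument Configuration.get(), which makes the plugin.xml attribute the fallback when the property is unset, while inline rules still take precedence over any file. A rough sketch of that resolution order, using the property names from the diff (not the actual method body):

import java.io.Reader;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;

class RegexRuleResolutionSketch {
  // Sketch of the resolution order implemented in setConf() above.
  static Reader resolveRules(Configuration conf, String attributeFile) {
    String stringRules = conf.get("parsefilter.regex.rules");         // 1. inline rules take precedence
    String file = conf.get("parsefilter.regex.file", attributeFile);  // 2. property, falling back to 3. the plugin.xml attribute
    return (stringRules != null)
        ? new StringReader(stringRules)
        : conf.getConfResourceAsReader(file);                         // resolve the rule file as a classpath resource
  }
}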
@@ -35,7 +35,8 @@ public void testPositiveFilter() throws Exception {
Configuration conf = NutchConfiguration.create();

String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
RegexParseFilter filter = new RegexParseFilter(file);
conf.set("parsefilter.regex.file", file);
RegexParseFilter filter = new RegexParseFilter();
filter.setConf(conf);

String url = "http://nutch.apache.org/";
@@ -56,7 +57,8 @@ public void testNegativeFilter() throws Exception {
Configuration conf = NutchConfiguration.create();

String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
RegexParseFilter filter = new RegexParseFilter(file);
conf.set("parsefilter.regex.file", file);
RegexParseFilter filter = new RegexParseFilter();
filter.setConf(conf);

String url = "http://nutch.apache.org/";
@@ -38,26 +38,28 @@
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
* hostnames. Only a URL that matches one of the suffixes, domains, or hosts
* present in the file is allowed.
* </p>
*
* <p>
* Urls are checked in order of domain suffix, domain name, and hostname against
* URLs are checked in order of domain suffix, domain name, and hostname against
* entries in the domain file. The domain file would be setup as follows with
* one entry per line:
*
* <pre>
* com apache.org www.apache.org
* com
* apache.org
* www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
* The second line allows all urls from apache.org and all of its subdomains
* The second line allows all URLs from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only urls from www.apache.org. There is no specific ordering to entries. The
* only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
* overridding the more specific.
* overriding the more specific.
* </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
@@ -72,7 +74,6 @@
* </li>
* </ul>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainURLFilter implements URLFilter {

@@ -82,7 +83,6 @@ public class DomainURLFilter implements URLFilter {
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();

private void readConfiguration(Reader configReader) throws IOException {
@@ -98,23 +98,6 @@ private void readConfiguration(Reader configReader) throws IOException {
}
}

/**
* Default constructor.
*/
public DomainURLFilter() {

}

/**
* Constructor that specifies the domain file to use.
*
* @param domainFile
* The domain file, overrides domain-urlfilter.text default.
*/
public DomainURLFilter(String domainFile) {
this.domainFile = domainFile;
}

/**
* Sets the configuration.
*/
@@ -133,44 +116,36 @@ public void setConf(Configuration conf) {
}
}

// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}

if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
}

// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domain.file");
// precedence hierarchy for definition of filter rules
// (first non-empty definition takes precedence):
// 1. string rules defined by `urlfilter.domain.rules`
// 2. rule file name defined by `urlfilter.domain.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
} else if (attributeFile != null) {
file = attributeFile;
}
String file = conf.get("urlfilter.domain.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
// read local file
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}

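With the constructor gone, DomainURLFilter is configured entirely through properties, and urlfilter.domain.rules remains the highest-precedence way to supply rules. A hedged usage sketch (hostnames and rules are illustrative, not taken from the PR):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.urlfilter.domain.DomainURLFilter;
import org.apache.nutch.util.NutchConfiguration;

class DomainFilterUsageSketch {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // Inline rules take precedence over any rule file; one entry per line.
    conf.set("urlfilter.domain.rules", "apache.org\nwww.example.com");
    DomainURLFilter filter = new DomainURLFilter();
    filter.setConf(conf);
    // A non-null return value means the URL is allowed through.
    System.out.println(filter.filter("http://lucene.apache.org/"));  // expected: the URL (apache.org matches)
    System.out.println(filter.filter("http://nutch.example.org/"));  // expected: null (no entry matches)
  }
}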
@@ -31,7 +31,8 @@ public void testFilter() throws Exception {

String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
conf.set("urlfilter.domain.file", domainFile);
DomainURLFilter domainFilter = new DomainURLFilter();
domainFilter.setConf(conf);
Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
@@ -50,7 +51,8 @@ public void testNoFilter() throws Exception {
// https://issues.apache.org/jira/browse/NUTCH-2189
String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
Configuration conf = NutchConfiguration.create();
DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
conf.set("urlfilter.domain.file", domainFile);
DomainURLFilter domainFilter = new DomainURLFilter();
domainFilter.setConf(conf);
Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
@@ -38,26 +38,28 @@
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. A url that matches one of the suffixes, domains, or hosts present
* hostnames. A URL that matches one of the suffixes, domains, or hosts present
* in the file is filtered out.
* </p>
*
* <p>
* Urls are checked in order of domain suffix, domain name, and hostname against
* URLs are checked in order of domain suffix, domain name, and hostname against
* entries in the domain file. The domain file would be setup as follows with
* one entry per line:
*
* <pre>
* com apache.org www.apache.org
* com
* apache.org
* www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
* The second line allows all urls from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only urls from www.apache.org. There is no specific ordering to entries. The
* The first line is an example of a filter that would exclude all .com domains.
* The second line excludes all URLs from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would exclude
* only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
* overridding the more specific.
* overriding the more specific.
* </p>
*
* The domain file defaults to domainblacklist-urlfilter.txt in the classpath
@@ -72,7 +74,6 @@
* </li>
* </ul>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainBlacklistURLFilter implements URLFilter {

@@ -82,7 +83,6 @@ public class DomainBlacklistURLFilter implements URLFilter {
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();

private void readConfiguration(Reader configReader) throws IOException {
@@ -98,23 +98,6 @@ private void readConfiguration(Reader configReader) throws IOException {
}
}

/**
* Default constructor.
*/
public DomainBlacklistURLFilter() {

}

/**
* Constructor that specifies the domain file to use.
*
* @param domainFile
* The domain file, overrides domainblacklist-urlfilter.text default.
*/
public DomainBlacklistURLFilter(String domainFile) {
this.domainFile = domainFile;
}

/**
* Sets the configuration.
*/
@@ -133,44 +116,37 @@ public void setConf(Configuration conf) {
}
}

// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}

if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
attributeFile);
}

// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domainblacklist.file");
// precedence hierarchy for definition of filter rules
// (first non-empty definition takes precedence):
// 1. string rules defined by `urlfilter.domainblacklist.rules`
// 2. rule file name defined by `urlfilter.domainblacklist.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domainblacklist.rules");
if (domainFile != null) {
file = domainFile;
} else if (attributeFile != null) {
file = attributeFile;
}
String file = conf.get("urlfilter.domainblacklist.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
// read local file
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}

@@ -31,8 +31,8 @@ public void testFilter() throws Exception {

String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
domainBlacklistFile);
conf.set("urlfilter.domainblacklist.file", domainBlacklistFile);
DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter();
domainBlacklistFilter.setConf(conf);
Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));