NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file #526

Merged
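This PR removes the rule-file constructors from RegexParseFilter, DomainURLFilter, and DomainBlacklistURLFilter and reads the rule file from the job configuration instead, so an override set on the configuration (for example via a command-line property) actually reaches the plugin. A minimal sketch of the new usage, assuming the property names touched in this diff; file paths are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class RuleFileOverrideSketch {
  public static void main(String[] args) {
    // Build a Nutch configuration; a real job would obtain this from its context.
    Configuration conf = NutchConfiguration.create();

    // After this change the plugins read their rule file from these properties
    // in setConf(), so setting them is all an override needs.
    conf.set("parsefilter.regex.file", "/path/to/regex-parsefilter.txt");
    conf.set("urlfilter.domain.file", "/path/to/domain-rules.txt");
    conf.set("urlfilter.domainblacklist.file", "/path/to/domain-blacklist.txt");
  }
}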
@@ -51,20 +51,11 @@ public class RegexParseFilter implements HtmlParseFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static String attributeFile = null;
private String regexFile = null;

private Configuration conf;

private static final Map<String,RegexRule> rules = new HashMap<>();

public RegexParseFilter() {
//default constructor
}

public RegexParseFilter(String regexFile) {
this.regexFile = regexFile;
}

public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
String html = new String(content.getContent());
@@ -129,15 +120,8 @@ public void setConf(Configuration conf) {
}
}

// domain file and attribute "file" take precedence if defined
String file = conf.get("parsefilter.regex.file");
String file = conf.get("parsefilter.regex.file", attributeFile);
String stringRules = conf.get("parsefilter.regex.rules");
if (regexFile != null) {
file = regexFile;
}
else if (attributeFile != null) {
file = attributeFile;
}
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
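The core of the change above is the two-argument Configuration.get(), which makes the plugin.xml attribute the fallback when the property is unset, while inline rules still take precedence over any file. A rough sketch of that resolution order, using the property names from the diff (not the actual method body):

import java.io.Reader;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;

class RegexRuleResolutionSketch {
  // Sketch of the resolution order implemented in setConf() above.
  static Reader resolveRules(Configuration conf, String attributeFile) {
    String stringRules = conf.get("parsefilter.regex.rules");         // 1. inline rules take precedence
    String file = conf.get("parsefilter.regex.file", attributeFile);  // 2. property, falling back to 3. the plugin.xml attribute
    return (stringRules != null)
        ? new StringReader(stringRules)
        : conf.getConfResourceAsReader(file);                         // resolve the rule file as a classpath resource
  }
}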
@@ -35,7 +35,8 @@ public void testPositiveFilter() throws Exception {
Configuration conf = NutchConfiguration.create();

String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
RegexParseFilter filter = new RegexParseFilter(file);
conf.set("parsefilter.regex.file", file);
RegexParseFilter filter = new RegexParseFilter();
filter.setConf(conf);

String url = "http://nutch.apache.org/";
@@ -56,7 +57,8 @@ public void testNegativeFilter() throws Exception {
Configuration conf = NutchConfiguration.create();

String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
RegexParseFilter filter = new RegexParseFilter(file);
conf.set("parsefilter.regex.file", file);
RegexParseFilter filter = new RegexParseFilter();
filter.setConf(conf);

String url = "http://nutch.apache.org/";
@@ -38,26 +38,28 @@
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
* hostnames. Only a URL that matches one of the suffixes, domains, or hosts
* present in the file is allowed.
* </p>
*
* <p>
* Urls are checked in order of domain suffix, domain name, and hostname against
* URLs are checked in order of domain suffix, domain name, and hostname against
* entries in the domain file. The domain file would be setup as follows with
* one entry per line:
*
* <pre>
* com apache.org www.apache.org
* com
* apache.org
* www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
* The second line allows all urls from apache.org and all of its subdomains
* The second line allows all URLs from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only urls from www.apache.org. There is no specific ordering to entries. The
* only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
* overridding the more specific.
* overriding the more specific.
* </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
@@ -72,7 +74,6 @@
* </li>
* </ul>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainURLFilter implements URLFilter {

@@ -82,7 +83,6 @@ public class DomainURLFilter implements URLFilter {
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();

private void readConfiguration(Reader configReader) throws IOException {
@@ -98,23 +98,6 @@ private void readConfiguration(Reader configReader) throws IOException {
}
}

/**
* Default constructor.
*/
public DomainURLFilter() {

}

/**
* Constructor that specifies the domain file to use.
*
* @param domainFile
* The domain file, overrides domain-urlfilter.text default.
*/
public DomainURLFilter(String domainFile) {
this.domainFile = domainFile;
}

/**
* Sets the configuration.
*/
@@ -133,44 +116,36 @@ public void setConf(Configuration conf) {
}
}

// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}

if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
}

// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domain.file");
// precedence hierarchy for definition of filter rules
// (first non-empty definition takes precedence):
// 1. string rules defined by `urlfilter.domain.rules`
// 2. rule file name defined by `urlfilter.domain.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
} else if (attributeFile != null) {
file = attributeFile;
}
String file = conf.get("urlfilter.domain.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
// read local file
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}

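With the constructor gone, DomainURLFilter is configured entirely through properties, and urlfilter.domain.rules remains the highest-precedence way to supply rules. A hedged usage sketch (hostnames and rules are illustrative, not taken from the PR):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.urlfilter.domain.DomainURLFilter;
import org.apache.nutch.util.NutchConfiguration;

class DomainFilterUsageSketch {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // Inline rules take precedence over any rule file; one entry per line.
    conf.set("urlfilter.domain.rules", "apache.org\nwww.example.com");
    DomainURLFilter filter = new DomainURLFilter();
    filter.setConf(conf);
    // A non-null return value means the URL is allowed through.
    System.out.println(filter.filter("http://lucene.apache.org/"));  // expected: the URL (apache.org matches)
    System.out.println(filter.filter("http://nutch.example.org/"));  // expected: null (no entry matches)
  }
}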
@@ -31,7 +31,8 @@ public void testFilter() throws Exception {

String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
conf.set("urlfilter.domain.file", domainFile);
DomainURLFilter domainFilter = new DomainURLFilter();
domainFilter.setConf(conf);
Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
@@ -50,7 +51,8 @@ public void testNoFilter() throws Exception {
// https://issues.apache.org/jira/browse/NUTCH-2189
String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
Configuration conf = NutchConfiguration.create();
DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
conf.set("urlfilter.domain.file", domainFile);
DomainURLFilter domainFilter = new DomainURLFilter();
domainFilter.setConf(conf);
Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
@@ -38,26 +38,28 @@
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. A url that matches one of the suffixes, domains, or hosts present
* hostnames. A URL that matches one of the suffixes, domains, or hosts present
* in the file is filtered out.
* </p>
*
* <p>
* Urls are checked in order of domain suffix, domain name, and hostname against
* URLs are checked in order of domain suffix, domain name, and hostname against
* entries in the domain file. The domain file would be setup as follows with
* one entry per line:
*
* <pre>
* com apache.org www.apache.org
* com
* apache.org
* www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
* The second line allows all urls from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only urls from www.apache.org. There is no specific ordering to entries. The
* The first line is an example of a filter that would exclude all .com domains.
* The second line excludes all URLs from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would exclude
* only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
* overridding the more specific.
* overriding the more specific.
* </p>
*
* The domain file defaults to domainblacklist-urlfilter.txt in the classpath
@@ -72,7 +74,6 @@
* </li>
* </ul>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainBlacklistURLFilter implements URLFilter {

@@ -82,7 +83,6 @@ public class DomainBlacklistURLFilter implements URLFilter {
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();

private void readConfiguration(Reader configReader) throws IOException {
@@ -98,23 +98,6 @@ private void readConfiguration(Reader configReader) throws IOException {
}
}

/**
* Default constructor.
*/
public DomainBlacklistURLFilter() {

}

/**
* Constructor that specifies the domain file to use.
*
* @param domainFile
* The domain file, overrides domainblacklist-urlfilter.text default.
*/
public DomainBlacklistURLFilter(String domainFile) {
this.domainFile = domainFile;
}

/**
* Sets the configuration.
*/
@@ -133,44 +116,37 @@ public void setConf(Configuration conf) {
}
}

// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}

if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
attributeFile);
}

// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domainblacklist.file");
// precedence hierarchy for definition of filter rules
// (first non-empty definition takes precedence):
// 1. string rules defined by `urlfilter.domainblacklist.rules`
// 2. rule file name defined by `urlfilter.domainblacklist.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domainblacklist.rules");
if (domainFile != null) {
file = domainFile;
} else if (attributeFile != null) {
file = attributeFile;
}
String file = conf.get("urlfilter.domainblacklist.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
// read local file
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}

@@ -31,8 +31,8 @@ public void testFilter() throws Exception {

String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
domainBlacklistFile);
conf.set("urlfilter.domainblacklist.file", domainBlacklistFile);
DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter();
domainBlacklistFilter.setConf(conf);
Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));