Skip to content

Commit

Permalink
NUTCH-1838 Host and domain based regex and automaton filtering
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1723710 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
asf-sync-process committed Jan 8, 2016
1 parent c93c24d commit c73707e
Show file tree
Hide file tree
Showing 8 changed files with 134 additions and 4 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -1,5 +1,7 @@
Nutch Change Log

* NUTCH-1838 Host and domain based regex and automaton filtering (markus)

* NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus)

* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus)
Expand Down
Expand Up @@ -24,6 +24,10 @@
public abstract class RegexRule {

private final boolean sign;

private final String hostOrDomain;

private final String regex;

/**
* Constructs a new regular expression rule.
Expand All @@ -38,7 +42,27 @@ public abstract class RegexRule {
* {@link #match(String)} method).
*/
protected RegexRule(boolean sign, String regex) {
// Delegate to the three-argument constructor with a null hostOrDomain,
// i.e. a rule that is not tied to any particular host or domain.
this(sign, regex, null);
}

/**
 * Builds a regular expression rule that may be tied to a host or domain.
 *
 * @param sign
 *          <code>true</code> if any url matching this rule must be accepted
 *          (filter-in), <code>false</code> if any url matching this rule
 *          must be rejected (filter-out).
 * @param regex
 *          the regular expression used for matching (see
 *          {@link #match(String)} method).
 * @param hostOrDomain
 *          the host or domain to which this regex belongs; the
 *          two-argument constructor passes <code>null</code> here.
 */
protected RegexRule(boolean sign, String regex, String hostOrDomain) {
  // Plain field initialisation; the values are stored as given.
  this.regex = regex;
  this.hostOrDomain = hostOrDomain;
  this.sign = sign;
}

/**
Expand All @@ -51,6 +75,20 @@ protected boolean accept() {
return sign;
}

/**
 * Returns the host or domain this rule is bound to.
 *
 * @return host or domain this regex rule belongs to, or <code>null</code>
 *         when the rule was constructed without one
 */
protected String hostOrDomain() {
  return this.hostOrDomain;
}

/**
 * Returns the regular expression this rule was constructed with.
 *
 * @return this rule's regex, exactly as passed to the constructor
 */
protected String regex() {
  return this.regex;
}

/**
* Checks if a url matches this rule.
*
Expand Down
Expand Up @@ -24,6 +24,7 @@
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.List;
import java.util.ArrayList;

Expand All @@ -36,6 +37,7 @@

// Nutch imports
import org.apache.nutch.net.*;
import org.apache.nutch.util.URLUtil;

/**
* Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
Expand Down Expand Up @@ -123,6 +125,20 @@ protected RegexURLFilterBase(Reader reader) throws IOException,
* is the regular expression associated to this rule.
*/
protected abstract RegexRule createRule(boolean sign, String regex);

/**
 * Creates a new {@link RegexRule} that belongs to a host or domain.
 *
 * @param sign
 *          of the regular expression. A <code>true</code> value means that
 *          any URL matching this rule must be included, whereas a
 *          <code>false</code> value means that any URL matching this rule
 *          must be excluded.
 * @param regex
 *          is the regular expression associated to this rule.
 * @param hostOrDomain
 *          the host or domain to which this regex belongs; the rules reader
 *          passes <code>null</code> for rules that are not inside a
 *          host/domain section.
 * @return the newly created rule
 */
protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);

/**
* Returns the name of the file of rules to use for a particular
Expand All @@ -142,7 +158,35 @@ protected abstract Reader getRulesReader(Configuration conf)

// Inherited Javadoc
public String filter(String url) {
String host = URLUtil.getHost(url);
String domain = null;

try {
domain = URLUtil.getDomainName(url);
} catch (MalformedURLException e) {
// shouldnt happen here right?
}

if (LOG.isDebugEnabled()) {
LOG.debug("URL belongs to host " + host + " and domain " + domain);
}

for (RegexRule rule : rules) {
// Skip rules that are bound to a host or domain other than this URL's
if (rule.hostOrDomain() != null &&
!rule.hostOrDomain().equals(host) &&
!rule.hostOrDomain().equals(domain)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
}

continue;
}

if (LOG.isDebugEnabled()) {
LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
}

if (rule.match(url)) {
return rule.accept() ? url : null;
}
Expand Down Expand Up @@ -204,7 +248,8 @@ private List<RegexRule> readRules(Reader reader) throws IOException,
BufferedReader in = new BufferedReader(reader);
List<RegexRule> rules = new ArrayList<RegexRule>();
String line;

String hostOrDomain = null;

while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
Expand All @@ -222,15 +267,21 @@ private List<RegexRule> readRules(Reader reader) throws IOException,
case '\n':
case '#': // skip blank & comment lines
continue;
case '>':
hostOrDomain = line.substring(1).trim();
continue;
case '<':
hostOrDomain = null;
continue;
default:
throw new IOException("Invalid first character: " + line);
}

String regex = line.substring(1);
if (LOG.isTraceEnabled()) {
LOG.trace("Adding rule [" + regex + "]");
LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
}
RegexRule rule = createRule(sign, regex);
RegexRule rule = createRule(sign, regex, hostOrDomain);
rules.add(rule);
}
return rules;
Expand Down
Expand Up @@ -80,6 +80,10 @@ protected Reader getRulesReader(Configuration conf) throws IOException {
protected RegexRule createRule(boolean sign, String regex) {
  // Build the automaton-backed rule without any host or domain.
  final RegexRule created = new Rule(sign, regex);
  return created;
}

protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
  // Build the automaton-backed rule for the given host or domain.
  final RegexRule created = new Rule(sign, regex, hostOrDomain);
  return created;
}

/*
* ------------------------------------ * </implementation:RegexURLFilterBase>
Expand All @@ -98,6 +102,11 @@ private class Rule extends RegexRule {
super(sign, regex);
automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
}

Rule(boolean sign, String regex, String hostOrDomain) {
super(sign, regex, hostOrDomain);
automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
}

protected boolean match(String url) {
return automaton.run(url);
Expand Down
12 changes: 12 additions & 0 deletions src/plugin/urlfilter-regex/sample/nutch1838.rules
@@ -0,0 +1,12 @@
# Skip all URLs containing "skip" for www.example.org
> www.example.org
-skip
<

# Allow all URLs containing "skip" for www.example.com
> www.example.com
+skip
<

# Skip everything else
-.
3 changes: 3 additions & 0 deletions src/plugin/urlfilter-regex/sample/nutch1838.urls
@@ -0,0 +1,3 @@
-http://www.example.org/skip-me-now
+http://www.example.com/noone-can-skip-me
-http://www.example.nl/i-am-filtered
Expand Up @@ -72,6 +72,12 @@ protected Reader getRulesReader(Configuration conf) throws IOException {
protected RegexRule createRule(boolean sign, String regex) {
  // Build the java.util.regex-backed rule without any host or domain.
  final RegexRule created = new Rule(sign, regex);
  return created;
}

protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
  // Build the java.util.regex-backed rule for the given host or domain.
  final RegexRule created = new Rule(sign, regex, hostOrDomain);
  return created;
}



/*
* ------------------------------------ * </implementation:RegexURLFilterBase>
Expand All @@ -89,7 +95,11 @@ private class Rule extends RegexRule {
private Pattern pattern;

/**
 * Creates a rule that is not bound to any host or domain.
 *
 * @param sign
 *          <code>true</code> to accept matching urls, <code>false</code> to
 *          reject them.
 * @param regex
 *          the regular expression to compile for this rule.
 */
Rule(boolean sign, String regex) {
  // Delegate to the three-argument constructor so the pattern is compiled
  // in exactly one place. A constructor may contain only one explicit
  // this(...)/super(...) call, and it must be the first statement.
  this(sign, regex, null);
}

Rule(boolean sign, String regex, String hostOrDomain) {
super(sign, regex, hostOrDomain);
pattern = Pattern.compile(regex);
}

Expand Down
Expand Up @@ -52,5 +52,10 @@ public void test() {
bench(400, "Benchmarks");
bench(800, "Benchmarks");
}

@Test
public void test1838() {
  // Host/domain-scoped rules (NUTCH-1838). test(String) is defined outside
  // this view; presumably it resolves the nutch1838.rules and
  // nutch1838.urls sample files - confirm against the test base class.
  final String sampleName = "nutch1838";
  test(sampleName);
}

}

0 comments on commit c73707e

Please sign in to comment.