diff --git a/build.xml b/build.xml index db163c6207..9cd9e98460 100644 --- a/build.xml +++ b/build.xml @@ -227,6 +227,7 @@ + @@ -684,6 +685,7 @@ + @@ -1122,6 +1124,7 @@ + diff --git a/conf/db-ignore-external-exemptions-bidirectional.xml.template b/conf/db-ignore-external-exemptions-bidirectional.xml.template new file mode 100644 index 0000000000..c72bf30011 --- /dev/null +++ b/conf/db-ignore-external-exemptions-bidirectional.xml.template @@ -0,0 +1,32 @@ + + + + + + + + + + (www\.) + + + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 87c405883e..4b06f157fa 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -611,6 +611,14 @@ + + db.ignore.external.exemptions.bidirectional.file + db-ignore-external-exemptions-bidirectional.xml + + This file contains exemption rules used by 'urlfilter-ignoreexempt-bidirectional' plugin + + + db.injector.overwrite false diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 3f579e8412..d72cfaee85 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -90,6 +90,7 @@ + @@ -220,6 +221,7 @@ + diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java index e408586399..2aca416770 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java @@ -87,7 +87,7 @@ protected boolean accept() { * * @return this regex */ - protected String regex() { return regex; } + public String regex() { return regex; } /** * Checks if a url matches this rule. @@ -97,6 +97,15 @@ protected boolean accept() { * @return true if the specified url matches this rule, otherwise * false. */ - protected abstract boolean match(String url); + public abstract boolean match(String url); + /** + * Replace if a url matches this rule. + * + * @param url + * is the url to check. 
+ * @return the url with matches of this rule replaced by the replacement + * (implementation-specific; may be null). + */ + public abstract String replace(String url, String replacement); } diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index 154f9e1a13..f3028d9c0b 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -17,28 +17,26 @@ package org.apache.nutch.urlfilter.api; // JDK imports -import java.lang.invoke.MethodHandles; +import java.io.BufferedReader; import java.io.File; -import java.io.Reader; import java.io.FileReader; -import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; import java.io.StringReader; +import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; -import java.util.List; import java.util.ArrayList; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.List; // Hadoop imports import org.apache.hadoop.conf.Configuration; - // Nutch imports -import org.apache.nutch.net.*; +import org.apache.nutch.net.URLFilter; import org.apache.nutch.util.URLUtil; +// Commons Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular @@ -64,12 +62,16 @@ public abstract class RegexURLFilterBase implements URLFilter { /** My logger */ - private static final Logger LOG = LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles + .lookup().lookupClass()); /** An array of applicable rules */ private List rules; + protected List getRules() { + return rules; + } 
+ /** The current configuration */ private Configuration conf; @@ -126,20 +128,22 @@ protected RegexURLFilterBase(Reader reader) throws IOException, * is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); - + /** * Creates a new {@link RegexRule}. - * @param - * sign of the regular expression. - * A true value means that any URL matching this rule - * must be included, whereas a false - * value means that any URL matching this rule must be excluded. + * + * @param sign + * of the regular expression. A true value means that + * any URL matching this rule must be included, whereas a + * false value means that any URL matching this rule + * must be excluded. * @param regex - * is the regular expression associated to this rule. + * is the regular expression associated to this rule. * @param hostOrDomain - * the host or domain to which this regex belongs + * the host or domain to which this regex belongs */ - protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain); + protected abstract RegexRule createRule(boolean sign, String regex, + String hostOrDomain); /** * Returns the name of the file of rules to use for a particular @@ -161,31 +165,32 @@ protected abstract Reader getRulesReader(Configuration conf) public String filter(String url) { String host = URLUtil.getHost(url); String domain = null; - + try { domain = URLUtil.getDomainName(url); } catch (MalformedURLException e) { // shouldnt happen here right? 
} - + if (LOG.isDebugEnabled()) { LOG.debug("URL belongs to host " + host + " and domain " + domain); } for (RegexRule rule : rules) { // Skip the skip for rules that don't share the same host and domain - if (rule.hostOrDomain() != null && - !rule.hostOrDomain().equals(host) && - !rule.hostOrDomain().equals(domain)) { + if (rule.hostOrDomain() != null && !rule.hostOrDomain().equals(host) + && !rule.hostOrDomain().equals(domain)) { if (LOG.isDebugEnabled()) { - LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); + LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + + rule.hostOrDomain()); } continue; } - + if (LOG.isDebugEnabled()) { - LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); + LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + + " and domain " + domain); } if (rule.match(url)) { @@ -250,7 +255,7 @@ private List readRules(Reader reader) throws IOException, List rules = new ArrayList(); String line; String hostOrDomain = null; - + while ((line = in.readLine()) != null) { if (line.length() == 0) { continue; diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java index ae4896d19a..ba4636ffc4 100644 --- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java +++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java @@ -22,12 +22,15 @@ import java.io.StringReader; import java.util.regex.PatternSyntaxException; + // Hadoop imports import org.apache.hadoop.conf.Configuration; + // Automaton imports import dk.brics.automaton.RegExp; import dk.brics.automaton.RunAutomaton; + import org.apache.nutch.net.*; import org.apache.nutch.urlfilter.api.RegexRule; import org.apache.nutch.urlfilter.api.RegexURLFilterBase; 
@@ -108,9 +111,15 @@ private class Rule extends RegexRule { automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); } - protected boolean match(String url) { + public boolean match(String url) { return automaton.run(url); } + + @Override + public String replace(String url, String replacement) { + // TODO Auto-generated method stub + return null; + } } } diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml new file mode 100644 index 0000000000..05628f9c0a --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml new file mode 100644 index 0000000000..1a86d68030 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml @@ -0,0 +1,41 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml new file mode 100644 index 0000000000..e9439f5f8b --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java new file mode 100644 index 0000000000..5d1aacf441 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.ignoreexempt.bidirectional; + +import java.io.FileReader; +import java.io.Reader; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import javax.xml.parsers.DocumentBuilderFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.nutch.net.URLExemptionFilter; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.InputSource; + +/** + * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses + * regex configuration for both fromUrl and toUrl to check if URL is eligible + * for exemption from 'db.ignore.external'. When this filter is enabled, the + * external urls will be checked against configured sequence of regex rules. + *

+ * The exemption rule file defaults to + * db-ignore-external-exemptions-bidirectional.xml in the classpath but can be + * overridden using the property + * "db.ignore.external.exemptions.bidirectional.file" in ./conf/nutch-*.xml + *

+ * + * @since Mar 1, 2018 + * @version 1 + */ +public class BidirectionalExemptionUrlFilter extends Configured implements + URLExemptionFilter { + + public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE = "db.ignore.external.exemptions.bidirectional.file"; + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles + .lookup().lookupClass()); + + private static class Rule { + public Pattern pattern; + public String substitution; + } + + private List rules; + + // private Configuration conf; + private String fileName; + + public BidirectionalExemptionUrlFilter() { + } + + public BidirectionalExemptionUrlFilter(Configuration conf) { + super(conf); + } + + @Override + // This implementation checks rules exceptions for two arbitrary urls. True if + // reg_ex(toUrl) = reg_ex(fromUrl). + // Logic of reading of RegEx is taken from RegexURLNormalizer + public boolean filter(String fromUrl, String toUrl) { + String sourceHost = URLUtil.getHost(fromUrl).toLowerCase(); + String sourceDestination = URLUtil.getHost(toUrl).toLowerCase(); + + if (LOG.isDebugEnabled()) { + LOG.debug("BidirectionalExemptionUrlFilter. 
Source url: " + fromUrl + + " and destination url " + toUrl); + } + + String modifiedSourceHost = sourceHost; + String modifiedDestinationHost = sourceDestination; + + modifiedSourceHost = this.regexReplace(modifiedSourceHost); + modifiedDestinationHost = this.regexReplace(modifiedDestinationHost); + + if (modifiedSourceHost == null || modifiedDestinationHost == null) { + return false; + } + return modifiedSourceHost.equals(modifiedDestinationHost); + } + + private List readConfigurationFile() { + // String filename = + // this.getConf().get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); + + if (LOG.isInfoEnabled()) { + LOG.info("loading " + this.fileName); + } + try { + Reader reader = getConf().getConfResourceAsReader(this.fileName); + return readConfiguration(reader); + } catch (Exception e) { + LOG.error("Error loading rules from '" + this.fileName + "': " + e); + return null; + } + } + + private List readConfiguration(Reader reader) { + List rules = new ArrayList(); + try { + + // borrowed heavily from code in Configuration.java + Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(new InputSource(reader)); + Element root = doc.getDocumentElement(); + if ((!"regex-exemptionurl".equals(root.getTagName())) + && (LOG.isErrorEnabled())) { + LOG.error("bad conf file: top-level element not "); + } + NodeList regexes = root.getChildNodes(); + for (int i = 0; i < regexes.getLength(); i++) { + Node regexNode = regexes.item(i); + if (!(regexNode instanceof Element)) + continue; + Element regex = (Element) regexNode; + if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) { + LOG.warn("bad conf file: element not "); + } + NodeList fields = regex.getChildNodes(); + String patternValue = null; + String subValue = null; + for (int j = 0; j < fields.getLength(); j++) { + Node fieldNode = fields.item(j); + if (!(fieldNode instanceof Element)) + continue; + Element field = (Element) fieldNode; + if 
("pattern".equals(field.getTagName()) && field.hasChildNodes()) + patternValue = ((Text) field.getFirstChild()).getData(); + if ("substitution".equals(field.getTagName()) + && field.hasChildNodes()) + subValue = ((Text) field.getFirstChild()).getData(); + if (!field.hasChildNodes()) + subValue = ""; + } + if (patternValue != null && subValue != null) { + Rule rule = new Rule(); + try { + rule.pattern = Pattern.compile(patternValue); + } catch (PatternSyntaxException e) { + if (LOG.isErrorEnabled()) { + LOG.error("skipped rule: " + patternValue + " -> " + subValue + + " : invalid regular expression pattern: " + e); + } + continue; + } + rule.substitution = subValue; + rules.add(rule); + } + } + } catch (Exception e) { + if (LOG.isErrorEnabled()) { + LOG.error("error parsing conf file: " + e); + } + return null; + } + if (rules.size() == 0) + return null; + return rules; + } + + private String regexReplace(String urlString) { + if (rules == null) { + return null; + } + + Iterator i = rules.iterator(); + while (i.hasNext()) { + Rule r = (Rule) i.next(); + Matcher matcher = r.pattern.matcher(urlString); + urlString = matcher.replaceAll(r.substitution); + } + return urlString; + } + + public static void main(String[] args) { + + if (args.length != 2) { + System.out.println("Error: Invalid Args"); + System.out.println("Usage: " + + BidirectionalExemptionUrlFilter.class.getName() + + " "); + return; + } + String sourceUrl = args[0]; + String destinationUrl = args[1]; + BidirectionalExemptionUrlFilter instance = new BidirectionalExemptionUrlFilter( + NutchConfiguration.create()); + System.out.println(instance.filter(sourceUrl, destinationUrl)); + } + + /* + * @Override public Configuration getConf() { return super.getConf(); } + */ + @Override + public void setConf(Configuration conf) { + if (conf == null) { + return; + } + super.setConf(conf); + this.fileName = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); + this.rules = readConfigurationFile(); + } + +} 
diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java new file mode 100644 index 0000000000..d4862b3988 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin which identifies exemptions to external urls when + * external urls are set to ignore. 
+ * + */ +package org.apache.nutch.urlfilter.ignoreexempt.bidirectional; + diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java index 40128c8ecc..08c7b0b4f6 100644 --- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java +++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java @@ -21,6 +21,7 @@ import org.apache.nutch.net.URLExemptionFilter; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.urlfilter.regex.RegexURLFilter; +import org.apache.nutch.urlfilter.api.RegexURLFilterBase; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java index 2988114f0d..c5bd44316f 100644 --- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java +++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java @@ -72,12 +72,10 @@ protected Reader getRulesReader(Configuration conf) throws IOException { protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } - + protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) { return new Rule(sign, regex, hostOrDomain); } - - /* * ------------------------------------ * @@ -90,22 +88,30 @@ public static void main(String args[]) throws IOException { main(filter, args); } - private class Rule extends RegexRule { + public class Rule extends RegexRule { private Pattern pattern; Rule(boolean sign, String regex) { this(sign, regex, null); } - + Rule(boolean sign, String regex, String hostOrDomain) { super(sign, regex, 
hostOrDomain); pattern = Pattern.compile(regex); } - protected boolean match(String url) { + public boolean match(String url) { return pattern.matcher(url).find(); } + + public String replace(String url, String replacement) { + java.util.regex.Matcher mtch = pattern.matcher(url); + if (mtch.find()) { + return mtch.replaceAll(replacement); + } + return url; + } } }