();
String line;
String hostOrDomain = null;
-
+
while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
index ae4896d19a..ba4636ffc4 100644
--- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
+++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -22,12 +22,15 @@
import java.io.StringReader;
import java.util.regex.PatternSyntaxException;
+
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
+
// Automaton imports
import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;
+
import org.apache.nutch.net.*;
import org.apache.nutch.urlfilter.api.RegexRule;
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
@@ -108,9 +111,15 @@ private class Rule extends RegexRule {
automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
}
- protected boolean match(String url) {
+ public boolean match(String url) {
return automaton.run(url);
}
+
+ @Override
+ public String replace(String url, String replacement) {
+ // TODO: replacement is not implemented for automaton-backed rules; this stub ignores both arguments and always returns null.
+ return null;
+ }
}
}
diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml
new file mode 100644
index 0000000000..05628f9c0a
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml
new file mode 100644
index 0000000000..1a86d68030
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml
@@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml
new file mode 100644
index 0000000000..e9439f5f8b
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java
new file mode 100644
index 0000000000..5d1aacf441
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.ignoreexempt.bidirectional;
+
+import java.io.FileReader;
+import java.io.Reader;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLExemptionFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+import org.xml.sax.InputSource;
+
+/**
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses
+ * regex configuration for both fromUrl and toUrl to check if URL is eligible
+ * for exemption from 'db.ignore.external'. When this filter is enabled, the
+ * external urls will be checked against configured sequence of regex rules.
+ *
+ * The exemption rule file defaults to
+ * db-ignore-external-exemptions-bidirectional.xml in the classpath but can be
+ * overridden using the property
+ * "db.ignore.external.exemptions.bidirectional.file" in ./conf/nutch-*.xml
+ *
+ *
+ * @since Mar 1, 2018
+ * @version 1
+ */
+public class BidirectionalExemptionUrlFilter extends Configured implements
+ URLExemptionFilter {
+
+ public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE = "db.ignore.external.exemptions.bidirectional.file";
+ private static final Logger LOG = LoggerFactory.getLogger(MethodHandles
+ .lookup().lookupClass());
+
+ /** One exemption rule: a compiled regular expression plus the text substituted for its matches. */
+ private static class Rule {
+ /** Compiled form of the rule's pattern text. */
+ public Pattern pattern;
+ /** Replacement string passed to Matcher.replaceAll for this pattern. */
+ public String substitution;
+ }
+
+ private List rules;
+
+ // private Configuration conf;
+ private String fileName;
+
+ /**
+ * Creates a filter without a configuration. The constructor body loads no rules;
+ * they are read when setConf is called with a non-null configuration.
+ */
+ public BidirectionalExemptionUrlFilter() {
+ }
+
+ /**
+ * Creates a filter for the given configuration by handing it to the superclass constructor.
+ * NOTE(review): presumably the Configured constructor calls setConf(conf), which would load
+ * the rules here - confirm against Hadoop's Configured.
+ *
+ * @param conf configuration passed to the superclass
+ */
+ public BidirectionalExemptionUrlFilter(Configuration conf) {
+ super(conf);
+ }
+
+  /**
+   * Checks whether a link between two arbitrary URLs is exempt.
+   * <p>
+   * The host of each URL is lower-cased and rewritten by the configured regex
+   * rules; the link is exempt when both rewritten hosts are equal, i.e.
+   * reg_ex(toUrl) = reg_ex(fromUrl). The logic for reading the regex rules is
+   * taken from RegexURLNormalizer.
+   *
+   * @param fromUrl the source URL
+   * @param toUrl the destination URL
+   * @return {@code true} if both hosts are rewritten to the same string;
+   *         {@code false} otherwise, including when no rules are loaded or
+   *         a host cannot be determined for either URL
+   */
+  @Override
+  public boolean filter(String fromUrl, String toUrl) {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("BidirectionalExemptionUrlFilter. Source url: " + fromUrl
+          + " and destination url " + toUrl);
+    }
+
+    String sourceHost = URLUtil.getHost(fromUrl);
+    String destinationHost = URLUtil.getHost(toUrl);
+    // A URL without a resolvable host cannot be exempted; bail out instead of
+    // dereferencing a null host.
+    if (sourceHost == null || destinationHost == null) {
+      return false;
+    }
+
+    String modifiedSourceHost = regexReplace(sourceHost.toLowerCase());
+    String modifiedDestinationHost = regexReplace(destinationHost.toLowerCase());
+
+    // regexReplace yields null when no rules are loaded.
+    if (modifiedSourceHost == null || modifiedDestinationHost == null) {
+      return false;
+    }
+    return modifiedSourceHost.equals(modifiedDestinationHost);
+  }
+
+  /**
+   * Loads the exemption rules from the configuration resource named by
+   * {@code fileName}.
+   *
+   * @return the parsed rules, or {@code null} if no file name is set, the
+   *         resource cannot be opened, or parsing yields no usable rule
+   */
+  private List readConfigurationFile() {
+    if (this.fileName == null) {
+      LOG.error("Error loading rules: property '"
+          + DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE + "' is not set");
+      return null;
+    }
+    if (LOG.isInfoEnabled()) {
+      LOG.info("loading " + this.fileName);
+    }
+    // try-with-resources closes the reader once the rules have been parsed.
+    try (Reader reader = getConf().getConfResourceAsReader(this.fileName)) {
+      if (reader == null) {
+        LOG.error("Error loading rules from '" + this.fileName
+            + "': resource not found");
+        return null;
+      }
+      return readConfiguration(reader);
+    } catch (Exception e) {
+      // Pass the exception itself so the stack trace is kept in the log.
+      LOG.error("Error loading rules from '" + this.fileName + "': " + e, e);
+      return null;
+    }
+  }
+
+  /**
+   * Parses exemption rules from an XML document.
+   * <p>
+   * The root element is expected to be {@code regex-exemptionurl}. Each
+   * {@code regex} child contributes one rule through its {@code pattern}
+   * and {@code substitution} children; an empty {@code substitution}
+   * element means "replace with the empty string". A rule is skipped when
+   * either value is missing or the pattern does not compile. Unexpected
+   * element names are logged but still processed.
+   *
+   * @param reader source of the XML document
+   * @return the rules in document order, or {@code null} if the document
+   *         cannot be parsed or contains no usable rule
+   */
+  private List readConfiguration(Reader reader) {
+    List<Rule> parsedRules = new ArrayList<>();
+    try {
+      // borrowed heavily from code in Configuration.java
+      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+          .parse(new InputSource(reader));
+      Element root = doc.getDocumentElement();
+      if (!"regex-exemptionurl".equals(root.getTagName())) {
+        LOG.error("bad conf file: top-level element not <regex-exemptionurl>");
+      }
+      NodeList regexes = root.getChildNodes();
+      for (int i = 0; i < regexes.getLength(); i++) {
+        Node regexNode = regexes.item(i);
+        if (!(regexNode instanceof Element)) {
+          continue;
+        }
+        Element regex = (Element) regexNode;
+        if (!"regex".equals(regex.getTagName())) {
+          LOG.warn("bad conf file: element not <regex>");
+        }
+
+        String patternValue = null;
+        String subValue = null;
+        NodeList fields = regex.getChildNodes();
+        for (int j = 0; j < fields.getLength(); j++) {
+          Node fieldNode = fields.item(j);
+          if (!(fieldNode instanceof Element)) {
+            continue;
+          }
+          Element field = (Element) fieldNode;
+          String tag = field.getTagName();
+          if ("pattern".equals(tag)) {
+            if (field.hasChildNodes()) {
+              patternValue = ((Text) field.getFirstChild()).getData();
+            }
+          } else if ("substitution".equals(tag)) {
+            // Only an empty <substitution/> stands for the empty string;
+            // other empty elements must not touch the substitution.
+            subValue = field.hasChildNodes()
+                ? ((Text) field.getFirstChild()).getData()
+                : "";
+          }
+        }
+
+        if (patternValue == null || subValue == null) {
+          continue;
+        }
+        Rule rule = new Rule();
+        try {
+          rule.pattern = Pattern.compile(patternValue);
+        } catch (PatternSyntaxException e) {
+          if (LOG.isErrorEnabled()) {
+            LOG.error("skipped rule: " + patternValue + " -> " + subValue
+                + " : invalid regular expression pattern: " + e);
+          }
+          continue;
+        }
+        rule.substitution = subValue;
+        parsedRules.add(rule);
+      }
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("error parsing conf file: " + e);
+      }
+      return null;
+    }
+    // Callers treat null as "no rules loaded".
+    return parsedRules.isEmpty() ? null : parsedRules;
+  }
+
+  /**
+   * Applies every loaded rule, in order, to the given string; each rule
+   * replaces all matches of its pattern with its substitution and works on
+   * the output of the previous rule.
+   *
+   * @param urlString the string to rewrite
+   * @return the rewritten string, or {@code null} when no rules are loaded
+   */
+  private String regexReplace(String urlString) {
+    if (rules != null) {
+      String rewritten = urlString;
+      for (Object entry : rules) {
+        Rule rule = (Rule) entry;
+        rewritten = rule.pattern.matcher(rewritten).replaceAll(rule.substitution);
+      }
+      return rewritten;
+    }
+    return null;
+  }
+
+  /**
+   * Command-line entry point: builds a filter from the Nutch configuration
+   * and prints the result of {@code filter} for the two given URLs.
+   *
+   * @param args exactly two elements: the source URL and the destination URL
+   */
+  public static void main(String[] args) {
+    if (args.length != 2) {
+      System.out.println("Error: Invalid Args");
+      // Name the expected arguments so the usage line is actionable.
+      System.out.println("Usage: "
+          + BidirectionalExemptionUrlFilter.class.getName()
+          + " <sourceUrl> <destinationUrl>");
+      return;
+    }
+    String sourceUrl = args[0];
+    String destinationUrl = args[1];
+    BidirectionalExemptionUrlFilter instance = new BidirectionalExemptionUrlFilter(
+        NutchConfiguration.create());
+    System.out.println(instance.filter(sourceUrl, destinationUrl));
+  }
+
+ /*
+ * @Override public Configuration getConf() { return super.getConf(); }
+ */
+  /**
+   * Stores the configuration, reads the rule-file name from the property
+   * {@link #DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE} and loads the
+   * rules from that file. A {@code null} configuration is ignored.
+   *
+   * @param conf the configuration to use; may be {@code null}
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    if (conf != null) {
+      super.setConf(conf);
+      fileName = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE);
+      rules = readConfigurationFile();
+    }
+  }
+
+}
diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java
new file mode 100644
index 0000000000..d4862b3988
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin which identifies exemptions to external urls
+ * when external urls are set to ignore.
+ *
+ */
+package org.apache.nutch.urlfilter.ignoreexempt.bidirectional;
+
diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
index 40128c8ecc..08c7b0b4f6 100644
--- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -21,6 +21,7 @@
import org.apache.nutch.net.URLExemptionFilter;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.urlfilter.regex.RegexURLFilter;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
index 2988114f0d..c5bd44316f 100644
--- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
+++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -72,12 +72,10 @@ protected Reader getRulesReader(Configuration conf) throws IOException {
protected RegexRule createRule(boolean sign, String regex) {
return new Rule(sign, regex);
}
-
+
protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
return new Rule(sign, regex, hostOrDomain);
}
-
-
/*
* ------------------------------------ *
@@ -90,22 +88,30 @@ public static void main(String args[]) throws IOException {
main(filter, args);
}
- private class Rule extends RegexRule {
+ public class Rule extends RegexRule {
private Pattern pattern;
Rule(boolean sign, String regex) {
this(sign, regex, null);
}
-
+
Rule(boolean sign, String regex, String hostOrDomain) {
super(sign, regex, hostOrDomain);
pattern = Pattern.compile(regex);
}
- protected boolean match(String url) {
+ public boolean match(String url) {
return pattern.matcher(url).find();
}
+
+    /**
+     * Replaces every match of this rule's pattern in the given string.
+     *
+     * @param url the string to rewrite
+     * @param replacement the replacement text, interpreted as by
+     *        {@link java.util.regex.Matcher#replaceAll(String)}
+     * @return the rewritten string, or {@code url} unchanged if the pattern
+     *         does not match
+     */
+    @Override
+    public String replace(String url, String replacement) {
+      // replaceAll resets the matcher and returns the input unchanged when
+      // nothing matches, so a separate find() pass is not needed.
+      return pattern.matcher(url).replaceAll(replacement);
+    }
}
}