From a898b5c8f8220f717745fbcaa4e24c3fb9ac974d Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Mon, 22 Jan 2018 09:44:17 +0100 Subject: [PATCH 1/7] Merge branch 'master', remote branch 'origin' From 3977018b4cfc40ab9e68d11a78e5235290aee3a4 Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Tue, 6 Mar 2018 09:58:08 +0100 Subject: [PATCH 2/7] bidirectionexemptionurlfilter added --- build.xml | 3 + conf/nutch-default.xml | 8 ++ src/plugin/build.xml | 2 + .../apache/nutch/urlfilter/api/RegexRule.java | 13 ++- .../urlfilter/api/RegexURLFilterBase.java | 4 + .../automaton/AutomatonURLFilter.java | 11 +- .../build.xml | 37 ++++++ .../ivy.xml | 41 +++++++ .../plugin.xml | 45 ++++++++ .../BidirectionalExemptionUrlFilter.java | 108 ++++++++++++++++++ .../bidirectional/package-info.java | 24 ++++ .../ignoreexempt/ExemptionUrlFilter.java | 1 + .../nutch/urlfilter/regex/RegexURLFilter.java | 18 ++- 13 files changed, 306 insertions(+), 9 deletions(-) create mode 100644 src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml create mode 100644 src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml create mode 100644 src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml create mode 100644 src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java create mode 100644 src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java diff --git a/build.xml b/build.xml index db163c6207..9cd9e98460 100644 --- a/build.xml +++ b/build.xml @@ -227,6 +227,7 @@ + @@ -684,6 +685,7 @@ + @@ -1122,6 +1124,7 @@ + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 87c405883e..4fb4786298 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -611,6 +611,14 @@ + + db.ignore.external.exemptions.bidirectional.file + db-ignore-external-exemptions-bidirectional.txt + + This file contains exemption rules used by 
'urlfiter-ignoreexempt-bidirectional' plugin + + + db.injector.overwrite false diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 3f579e8412..d72cfaee85 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -90,6 +90,7 @@ + @@ -220,6 +221,7 @@ + diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java index e408586399..2aca416770 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java @@ -87,7 +87,7 @@ protected boolean accept() { * * @return this regex */ - protected String regex() { return regex; } + public String regex() { return regex; } /** * Checks if a url matches this rule. @@ -97,6 +97,15 @@ protected boolean accept() { * @return true if the specified url matches this rule, otherwise * false. */ - protected abstract boolean match(String url); + public abstract boolean match(String url); + /** + * Replace if a url matches this rule. + * + * @param url + * is the url to check. + * @return true if the specified url matches this rule, otherwise + * false. 
+ */ + public abstract String replace(String url, String replacement); } diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index 154f9e1a13..eced475bac 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -69,6 +69,10 @@ public abstract class RegexURLFilterBase implements URLFilter { /** An array of applicable rules */ private List rules; + + protected List getRules(){ + return rules; + } /** The current configuration */ private Configuration conf; diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java index ae4896d19a..ba4636ffc4 100644 --- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java +++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java @@ -22,12 +22,15 @@ import java.io.StringReader; import java.util.regex.PatternSyntaxException; + // Hadoop imports import org.apache.hadoop.conf.Configuration; + // Automaton imports import dk.brics.automaton.RegExp; import dk.brics.automaton.RunAutomaton; + import org.apache.nutch.net.*; import org.apache.nutch.urlfilter.api.RegexRule; import org.apache.nutch.urlfilter.api.RegexURLFilterBase; @@ -108,9 +111,15 @@ private class Rule extends RegexRule { automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton()); } - protected boolean match(String url) { + public boolean match(String url) { return automaton.run(url); } + + @Override + public String replace(String url, String replacement) { + // TODO Auto-generated method stub + return null; + } } } diff 
--git a/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml new file mode 100644 index 0000000000..05628f9c0a --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml new file mode 100644 index 0000000000..1a86d68030 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml @@ -0,0 +1,41 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml new file mode 100644 index 0000000000..27ab6d2c7d --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java new file mode 100644 index 0000000000..960d90b280 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.ignoreexempt.bidirectional; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLExemptionFilter; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; +import org.apache.nutch.urlfilter.api.RegexRule; +import org.apache.nutch.urlfilter.regex.RegexURLFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.net.MalformedURLException; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.List; +import java.util.ArrayList; + + +/** + * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration for both fromUrl and toUrl + * to check if URL is eligible for exemption from 'db.ignore.external'. + * When this filter is enabled, the external urls will be checked against configured sequence of regex rules. + *

+ * The exemption rule file defaults to db-ignore-external-exemptions-bidirectional.txt in the classpath but can be + * overridden using the property "db.ignore.external.exemptions.bidirectional.file" in ./conf/nutch-*.xml + *

+ * @since Mar 1, 2018 + * @version 1 + * @see org.apache.nutch.net.URLExemptionFilter + * @see org.apache.nutch.urlfilter.regex.RegexURLFilter + */ +public class BidirectionalExemptionUrlFilter extends RegexURLFilter + implements URLExemptionFilter { + + public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE + = "db.ignore.external.exemptions.bidirectional.file"; + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + @Override + //This implementation checks rules exceptions for two arbitrary urls. True if reg_ex(toUrl) = fromUrl + public boolean filter(String fromUrl, String toUrl) { + + String sourceHost = URLUtil.getHost(fromUrl).toLowerCase(); + String sourceDestination = URLUtil.getHost(toUrl).toLowerCase(); + + if (LOG.isDebugEnabled()) { + LOG.debug("BidirectionalExemptionUrlFilter. Source url: " + fromUrl + " and destination url " + toUrl); + } + + String modifiedSourceHost = sourceHost; + String modifiedDestinationHost = sourceDestination; + for (RegexRule rule : super.getRules()) { + + if (LOG.isDebugEnabled()) { + LOG.debug("Applying rule [" + rule.regex() + "]"); + } + + modifiedSourceHost = rule.replace(modifiedSourceHost, ""); + modifiedDestinationHost = rule.replace(modifiedDestinationHost, ""); + }; + + return modifiedSourceHost.equals(modifiedDestinationHost); + } + + + protected Reader getRulesReader(Configuration conf) + throws IOException { + String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); + this.getConf(); + return conf.getConfResourceAsReader(fileRules); + } + public static void main(String[] args) { + + if (args.length != 2) { + System.out.println("Error: Invalid Args"); + System.out.println("Usage: " + + BidirectionalExemptionUrlFilter.class.getName() + " "); + return; + } + String sourceUrl = args[0]; + String destinationUrl = args[1]; + BidirectionalExemptionUrlFilter instance = new BidirectionalExemptionUrlFilter(); + 
instance.setConf(NutchConfiguration.create()); + System.out.println(instance.filter(sourceUrl, destinationUrl)); + } +} diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java new file mode 100644 index 0000000000..d4862b3988 --- /dev/null +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * URL filter plugin which identifies exemptions to external urls when + * external urls are set to ignore. 
+ * + */ +package org.apache.nutch.urlfilter.ignoreexempt.bidirectional; + diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java index 40128c8ecc..08c7b0b4f6 100644 --- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java +++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java @@ -21,6 +21,7 @@ import org.apache.nutch.net.URLExemptionFilter; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.urlfilter.regex.RegexURLFilter; +import org.apache.nutch.urlfilter.api.RegexURLFilterBase; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java index 2988114f0d..c5bd44316f 100644 --- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java +++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java @@ -72,12 +72,10 @@ protected Reader getRulesReader(Configuration conf) throws IOException { protected RegexRule createRule(boolean sign, String regex) { return new Rule(sign, regex); } - + protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) { return new Rule(sign, regex, hostOrDomain); } - - /* * ------------------------------------ * @@ -90,22 +88,30 @@ public static void main(String args[]) throws IOException { main(filter, args); } - private class Rule extends RegexRule { + public class Rule extends RegexRule { private Pattern pattern; Rule(boolean sign, String regex) { this(sign, regex, null); } - + Rule(boolean sign, String regex, String hostOrDomain) { super(sign, regex, 
hostOrDomain); pattern = Pattern.compile(regex); } - protected boolean match(String url) { + public boolean match(String url) { return pattern.matcher(url).find(); } + + public String replace(String url, String replacement) { + java.util.regex.Matcher mtch = pattern.matcher(url); + if (mtch.find()) { + return mtch.replaceAll(replacement); + } + return url; + } } } From bbe5409fd8f0c21940d15ceed5b4fd41db0f80c1 Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Tue, 6 Mar 2018 10:14:05 +0100 Subject: [PATCH 3/7] added db-ignore-external-exemptions-bidirectional.txt in conf folder --- ...nore-external-exemptions-bidirectional.txt | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 conf/db-ignore-external-exemptions-bidirectional.txt diff --git a/conf/db-ignore-external-exemptions-bidirectional.txt b/conf/db-ignore-external-exemptions-bidirectional.txt new file mode 100644 index 0000000000..a6c976ba62 --- /dev/null +++ b/conf/db-ignore-external-exemptions-bidirectional.txt @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Exemption rules to db.ignore.external.links + + +# Format : +#-------- +# The format is same same as `regex-urlfilter.txt`. 
+# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is exempted or ignored. If no pattern +# matches, the URL is ignored. + + + +# Example 1: +#---------- +# To exempt urls ending with image extensions, uncomment the below line +-(www.) From 3b07d036c5e9634eeed277d1d3513919a67d45fd Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Wed, 7 Mar 2018 14:26:09 +0100 Subject: [PATCH 4/7] fixed name of jar for bidirectional exemption url filter --- src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml index 27ab6d2c7d..4e6daeb0b4 100644 --- a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml @@ -22,7 +22,7 @@ provider-name="nutch.org"> - + From 20c46edc0d8b7e879eeffb1b51990ebfe809dcb4 Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Fri, 16 Mar 2018 18:52:41 +0100 Subject: [PATCH 5/7] refactoring of bidirectional exemption filter, inspired by urlnormalizer-regex --- ...nore-external-exemptions-bidirectional.txt | 33 --- ...rnal-exemptions-bidirectional.xml.template | 32 +++ conf/nutch-default.xml | 2 +- .../urlfilter/api/RegexURLFilterBase.java | 69 ++--- .../plugin.xml | 4 +- .../BidirectionalExemptionUrlFilter.java | 244 +++++++++++++----- 6 files changed, 252 insertions(+), 132 deletions(-) delete mode 100644 conf/db-ignore-external-exemptions-bidirectional.txt create mode 100644 conf/db-ignore-external-exemptions-bidirectional.xml.template diff --git a/conf/db-ignore-external-exemptions-bidirectional.txt b/conf/db-ignore-external-exemptions-bidirectional.txt deleted file mode 100644 index a6c976ba62..0000000000 --- a/conf/db-ignore-external-exemptions-bidirectional.txt +++ /dev/null @@ -1,33 
+0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -# Exemption rules to db.ignore.external.links - - -# Format : -#-------- -# The format is same same as `regex-urlfilter.txt`. -# Each non-comment, non-blank line contains a regular expression -# prefixed by '+' or '-'. The first matching pattern in the file -# determines whether a URL is exempted or ignored. If no pattern -# matches, the URL is ignored. - - - -# Example 1: -#---------- -# To exempt urls ending with image extensions, uncomment the below line --(www.) diff --git a/conf/db-ignore-external-exemptions-bidirectional.xml.template b/conf/db-ignore-external-exemptions-bidirectional.xml.template new file mode 100644 index 0000000000..c72bf30011 --- /dev/null +++ b/conf/db-ignore-external-exemptions-bidirectional.xml.template @@ -0,0 +1,32 @@ + + + + + + + + + + (www\.) 
+ + + diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 4fb4786298..4b06f157fa 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -613,7 +613,7 @@ db.ignore.external.exemptions.bidirectional.file - db-ignore-external-exemptions-bidirectional.txt + db-ignore-external-exemptions-bidirectional.xml This file contains exemption rules used by 'urlfiter-ignoreexempt-bidirectional' plugin diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index eced475bac..f3028d9c0b 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -17,28 +17,26 @@ package org.apache.nutch.urlfilter.api; // JDK imports -import java.lang.invoke.MethodHandles; +import java.io.BufferedReader; import java.io.File; -import java.io.Reader; import java.io.FileReader; -import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; import java.io.StringReader; +import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; -import java.util.List; import java.util.ArrayList; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import java.util.List; // Hadoop imports import org.apache.hadoop.conf.Configuration; - // Nutch imports -import org.apache.nutch.net.*; +import org.apache.nutch.net.URLFilter; import org.apache.nutch.util.URLUtil; +// Commons Logging imports +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular @@ -64,14 +62,14 @@ public abstract class RegexURLFilterBase implements URLFilter { /** My logger */ - private static final Logger LOG = 
LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles + .lookup().lookupClass()); /** An array of applicable rules */ private List rules; - - protected List getRules(){ - return rules; + + protected List getRules() { + return rules; } /** The current configuration */ @@ -130,20 +128,22 @@ protected RegexURLFilterBase(Reader reader) throws IOException, * is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); - + /** * Creates a new {@link RegexRule}. - * @param - * sign of the regular expression. - * A true value means that any URL matching this rule - * must be included, whereas a false - * value means that any URL matching this rule must be excluded. + * + * @param sign + * of the regular expression. A true value means that + * any URL matching this rule must be included, whereas a + * false value means that any URL matching this rule + * must be excluded. * @param regex - * is the regular expression associated to this rule. + * is the regular expression associated to this rule. * @param hostOrDomain - * the host or domain to which this regex belongs + * the host or domain to which this regex belongs */ - protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain); + protected abstract RegexRule createRule(boolean sign, String regex, + String hostOrDomain); /** * Returns the name of the file of rules to use for a particular @@ -165,31 +165,32 @@ protected abstract Reader getRulesReader(Configuration conf) public String filter(String url) { String host = URLUtil.getHost(url); String domain = null; - + try { domain = URLUtil.getDomainName(url); } catch (MalformedURLException e) { // shouldnt happen here right? 
} - + if (LOG.isDebugEnabled()) { LOG.debug("URL belongs to host " + host + " and domain " + domain); } for (RegexRule rule : rules) { // Skip the skip for rules that don't share the same host and domain - if (rule.hostOrDomain() != null && - !rule.hostOrDomain().equals(host) && - !rule.hostOrDomain().equals(domain)) { + if (rule.hostOrDomain() != null && !rule.hostOrDomain().equals(host) + && !rule.hostOrDomain().equals(domain)) { if (LOG.isDebugEnabled()) { - LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); + LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + + rule.hostOrDomain()); } continue; } - + if (LOG.isDebugEnabled()) { - LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); + LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + + " and domain " + domain); } if (rule.match(url)) { @@ -254,7 +255,7 @@ private List readRules(Reader reader) throws IOException, List rules = new ArrayList(); String line; String hostOrDomain = null; - + while ((line = in.readLine()) != null) { if (line.length() == 0) { continue; diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml index 4e6daeb0b4..e9439f5f8b 100644 --- a/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/plugin.xml @@ -29,8 +29,6 @@ - - - + diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java index 960d90b280..c3d57b4251 100644 --- a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java +++ 
b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java @@ -16,93 +16,215 @@ */ package org.apache.nutch.urlfilter.ignoreexempt.bidirectional; -import org.apache.commons.io.IOUtils; +import java.io.FileReader; +import java.io.Reader; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import javax.xml.parsers.DocumentBuilderFactory; + import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; import org.apache.nutch.net.URLExemptionFilter; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.URLUtil; -import org.apache.nutch.urlfilter.api.RegexRule; -import org.apache.nutch.urlfilter.regex.RegexURLFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - -import java.lang.invoke.MethodHandles; -import java.net.MalformedURLException; -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.util.Arrays; -import java.util.regex.Pattern; -import java.util.List; -import java.util.ArrayList; - +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.w3c.dom.Text; +import org.xml.sax.InputSource; /** - * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration for both fromUrl and toUrl - * to check if URL is eligible for exemption from 'db.ignore.external'. - * When this filter is enabled, the external urls will be checked against configured sequence of regex rules. - *

- * The exemption rule file defaults to db-ignore-external-exemptions-bidirectional.txt in the classpath but can be - * overridden using the property "db.ignore.external.exemptions.bidirectional.file" in ./conf/nutch-*.xml - *

+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses + * regex configuration for both fromUrl and toUrl to check if URL is eligible + * for exemption from 'db.ignore.external'. When this filter is enabled, the + * external urls will be checked against configured sequence of regex rules. + *

+ * The exemption rule file defaults to + * db-ignore-external-exemptions-bidirectional.xml in the classpath but can be + * overridden using the property + * "db.ignore.external.exemptions.bidirectional.file" in ./conf/nutch-*.xml + *

+ * * @since Mar 1, 2018 * @version 1 - * @see org.apache.nutch.net.URLExemptionFilter - * @see org.apache.nutch.urlfilter.regex.RegexURLFilter */ -public class BidirectionalExemptionUrlFilter extends RegexURLFilter - implements URLExemptionFilter { +public class BidirectionalExemptionUrlFilter extends Configured implements + URLExemptionFilter { + + public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE = "db.ignore.external.exemptions.bidirectional.file"; + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles + .lookup().lookupClass()); + + private static class Rule { + public Pattern pattern; + public String substitution; + } + + private List rules; - public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE - = "db.ignore.external.exemptions.bidirectional.file"; - private static final Logger LOG = LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); + // private Configuration conf; + private String fileName; + + public BidirectionalExemptionUrlFilter() { + } + + public BidirectionalExemptionUrlFilter(Configuration conf) { + this.setConf(conf); + } @Override - //This implementation checks rules exceptions for two arbitrary urls. True if reg_ex(toUrl) = fromUrl + // This implementation checks rules exceptions for two arbitrary urls. True if + // reg_ex(toUrl) = reg_ex(fromUrl). + // Logic of reading of RegEx is taken from RegexURLNormalizer public boolean filter(String fromUrl, String toUrl) { - - String sourceHost = URLUtil.getHost(fromUrl).toLowerCase(); - String sourceDestination = URLUtil.getHost(toUrl).toLowerCase(); - - if (LOG.isDebugEnabled()) { - LOG.debug("BidirectionalExemptionUrlFilter. 
Source url: " + fromUrl + " and destination url " + toUrl); - } - - String modifiedSourceHost = sourceHost; - String modifiedDestinationHost = sourceDestination; - for (RegexRule rule : super.getRules()) { - - if (LOG.isDebugEnabled()) { - LOG.debug("Applying rule [" + rule.regex() + "]"); - } - - modifiedSourceHost = rule.replace(modifiedSourceHost, ""); - modifiedDestinationHost = rule.replace(modifiedDestinationHost, ""); - }; - - return modifiedSourceHost.equals(modifiedDestinationHost); + String sourceHost = URLUtil.getHost(fromUrl).toLowerCase(); + String sourceDestination = URLUtil.getHost(toUrl).toLowerCase(); + + if (LOG.isDebugEnabled()) { + LOG.debug("BidirectionalExemptionUrlFilter. Source url: " + fromUrl + + " and destination url " + toUrl); + } + + String modifiedSourceHost = sourceHost; + String modifiedDestinationHost = sourceDestination; + + modifiedSourceHost = this.regexReplace(modifiedSourceHost); + modifiedDestinationHost = this.regexReplace(modifiedDestinationHost); + + if (modifiedSourceHost == null || modifiedDestinationHost == null) { + return false; + } + return modifiedSourceHost.equals(modifiedDestinationHost); + } + + private List readConfigurationFile() { + // String filename = + // this.getConf().get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); + + if (LOG.isInfoEnabled()) { + LOG.info("loading " + this.fileName); + } + try { + FileReader reader = new FileReader(this.fileName); + return readConfiguration(reader); + } catch (Exception e) { + LOG.error("Error loading rules from '" + this.fileName + "': " + e); + return null; + } } - - protected Reader getRulesReader(Configuration conf) - throws IOException { - String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); - this.getConf(); - return conf.getConfResourceAsReader(fileRules); + private List readConfiguration(Reader reader) { + List rules = new ArrayList(); + try { + + // borrowed heavily from code in Configuration.java + Document doc = 
DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(new InputSource(reader)); + Element root = doc.getDocumentElement(); + if ((!"regex-exemptionurl".equals(root.getTagName())) + && (LOG.isErrorEnabled())) { + LOG.error("bad conf file: top-level element not "); + } + NodeList regexes = root.getChildNodes(); + for (int i = 0; i < regexes.getLength(); i++) { + Node regexNode = regexes.item(i); + if (!(regexNode instanceof Element)) + continue; + Element regex = (Element) regexNode; + if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) { + LOG.warn("bad conf file: element not "); + } + NodeList fields = regex.getChildNodes(); + String patternValue = null; + String subValue = null; + for (int j = 0; j < fields.getLength(); j++) { + Node fieldNode = fields.item(j); + if (!(fieldNode instanceof Element)) + continue; + Element field = (Element) fieldNode; + if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) + patternValue = ((Text) field.getFirstChild()).getData(); + if ("substitution".equals(field.getTagName()) + && field.hasChildNodes()) + subValue = ((Text) field.getFirstChild()).getData(); + if (!field.hasChildNodes()) + subValue = ""; + } + if (patternValue != null && subValue != null) { + Rule rule = new Rule(); + try { + rule.pattern = Pattern.compile(patternValue); + } catch (PatternSyntaxException e) { + if (LOG.isErrorEnabled()) { + LOG.error("skipped rule: " + patternValue + " -> " + subValue + + " : invalid regular expression pattern: " + e); + } + continue; + } + rule.substitution = subValue; + rules.add(rule); + } + } + } catch (Exception e) { + if (LOG.isErrorEnabled()) { + LOG.error("error parsing conf file: " + e); + } + return null; + } + if (rules.size() == 0) + return null; + return rules; } + + private String regexReplace(String urlString) { + if (rules == null) { + return null; + } + + Iterator i = rules.iterator(); + while (i.hasNext()) { + Rule r = (Rule) i.next(); + Matcher matcher = 
r.pattern.matcher(urlString); + urlString = matcher.replaceAll(r.substitution); + } + return urlString; + } + public static void main(String[] args) { if (args.length != 2) { System.out.println("Error: Invalid Args"); - System.out.println("Usage: " + - BidirectionalExemptionUrlFilter.class.getName() + " "); + System.out.println("Usage: " + + BidirectionalExemptionUrlFilter.class.getName() + + " "); return; } String sourceUrl = args[0]; String destinationUrl = args[1]; - BidirectionalExemptionUrlFilter instance = new BidirectionalExemptionUrlFilter(); - instance.setConf(NutchConfiguration.create()); + BidirectionalExemptionUrlFilter instance = new BidirectionalExemptionUrlFilter( + NutchConfiguration.create()); System.out.println(instance.filter(sourceUrl, destinationUrl)); } + + /* + * @Override public Configuration getConf() { return super.getConf(); } + */ + @Override + public void setConf(Configuration conf) { + if (conf == null) { + return; + } + this.fileName = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); + this.rules = readConfigurationFile(); + } + } From 375bc5642790f6d54596ed2d618703108284769a Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Wed, 21 Mar 2018 13:43:05 +0100 Subject: [PATCH 6/7] added super setup of config for BidirectionalExemptionUrlFilter --- .../bidirectional/BidirectionalExemptionUrlFilter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java index c3d57b4251..4be01922fa 100644 --- a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java +++ 
b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java @@ -78,7 +78,7 @@ public BidirectionalExemptionUrlFilter() { } public BidirectionalExemptionUrlFilter(Configuration conf) { - this.setConf(conf); + super(conf); } @Override @@ -223,6 +223,7 @@ public void setConf(Configuration conf) { if (conf == null) { return; } + super.setConf(conf); this.fileName = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_BIDIRECTIONAL_FILE); this.rules = readConfigurationFile(); } From 010c0a207a9c42b8a87ac6c54d48543bd7766a05 Mon Sep 17 00:00:00 2001 From: Semyon Semyonov Date: Tue, 27 Mar 2018 17:09:37 +0200 Subject: [PATCH 7/7] FileReader was replaced with Reader in urlfilter-ignoreexempt-bidirectional --- .../bidirectional/BidirectionalExemptionUrlFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java index 4be01922fa..5d1aacf441 100644 --- a/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java +++ b/src/plugin/urlfilter-ignoreexempt-bidirectional/src/java/org/apache/nutch/urlfilter/ignoreexempt/bidirectional/BidirectionalExemptionUrlFilter.java @@ -114,7 +114,7 @@ private List readConfigurationFile() { LOG.info("loading " + this.fileName); } try { - FileReader reader = new FileReader(this.fileName); + Reader reader = getConf().getConfResourceAsReader(this.fileName); return readConfiguration(reader); } catch (Exception e) { LOG.error("Error loading rules from '" + this.fileName + "': " + e);