Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NUTCH-2522 Bidirectional URL exemption filter #290

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@
<packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-ignoreexemptbidirectional/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
Expand Down Expand Up @@ -684,6 +685,7 @@
<packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-ignoreexemptbidirectional/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
<packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
Expand Down Expand Up @@ -1122,6 +1124,7 @@
<source path="${plugins.dir}/urlfilter-domainblacklist/src/java/" />
<source path="${plugins.dir}/urlfilter-domainblacklist/src/test/" />
<source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" />
<source path="${plugins.dir}/urlfilter-ignoreexempt-bidirectional/src/java"/>
<source path="${plugins.dir}/urlfilter-prefix/src/java/" />
<source path="${plugins.dir}/urlfilter-prefix/src/test/" />
<source path="${plugins.dir}/urlfilter-regex/src/java/" />
Expand Down
32 changes: 32 additions & 0 deletions conf/db-ignore-external-exemptions-bidirectional.xml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- This is the configuration file for BidirectionalExemptionUrlFilter.
The usage is based on RegexUrlNormalize filter with additional check of both fromUrl and toUrl.
The regex engine that is used is Perl5 compatible.
The rules are applied to URLs in the order they occur in this file. -->

<!-- WATCH OUT: an xml parser reads this file an ampersands must be
expanded to &amp; -->

<!-- Example. Exempt links such as http://www.website.com to http://website.com -->
<regex-exemptionurl>
<regex>
<pattern>(www\.)</pattern>
<substitution></substitution>
</regex>
</regex-exemptionurl>
8 changes: 8 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,14 @@
</description>
</property>

<property>
<name>db.ignore.external.exemptions.bidirectional.file</name>
<value>db-ignore-external-exemptions-bidirectional.xml</value>
<description>
This file contains exemption rules used by 'urlfiter-ignoreexempt-bidirectional' plugin
</description>
</property>

<property>
<name>db.injector.overwrite</name>
<value>false</value>
Expand Down
2 changes: 2 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
<ant dir="urlfilter-validator" target="deploy"/>
<!-- urlfilter-ignoreexempt depends on urlfilter-regex, must build after -->
<ant dir="urlfilter-ignoreexempt" target="deploy"/>
<ant dir="urlfilter-ignoreexempt-bidirectional" target="deploy"/>
<ant dir="urlmeta" target="deploy"/>
<ant dir="urlnormalizer-ajax" target="deploy"/>
<ant dir="urlnormalizer-basic" target="deploy"/>
Expand Down Expand Up @@ -220,6 +221,7 @@
<ant dir="urlfilter-domain" target="clean" />
<ant dir="urlfilter-domainblacklist" target="clean" />
<ant dir="urlfilter-ignoreexempt" target="clean"/>
<ant dir="urlfilter-ignoreexempt-bidirectional" target="clean"/>
<ant dir="urlfilter-prefix" target="clean"/>
<ant dir="urlfilter-regex" target="clean"/>
<ant dir="urlfilter-suffix" target="clean"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ protected boolean accept() {
*
* @return this regex
*/
protected String regex() { return regex; }
public String regex() { return regex; }

/**
* Checks if a url matches this rule.
Expand All @@ -97,6 +97,15 @@ protected boolean accept() {
* @return <code>true</code> if the specified url matches this rule, otherwise
* <code>false</code>.
*/
protected abstract boolean match(String url);
public abstract boolean match(String url);

/**
* Replace if a url matches this rule.
*
* @param url
* is the url to check.
* @return <code>true</code> if the specified url matches this rule, otherwise
* <code>false</code>.
*/
public abstract String replace(String url, String replacement);
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,26 @@
package org.apache.nutch.urlfilter.api;

// JDK imports
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.File;
import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.util.List;
import java.util.ArrayList;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Nutch imports
import org.apache.nutch.net.*;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.util.URLUtil;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
Expand All @@ -64,12 +62,16 @@
public abstract class RegexURLFilterBase implements URLFilter {

/** My logger */
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles
.lookup().lookupClass());

/** An array of applicable rules */
private List<RegexRule> rules;

protected List<RegexRule> getRules() {
return rules;
}

/** The current configuration */
private Configuration conf;

Expand Down Expand Up @@ -126,20 +128,22 @@ protected RegexURLFilterBase(Reader reader) throws IOException,
* is the regular expression associated to this rule.
*/
protected abstract RegexRule createRule(boolean sign, String regex);

/**
* Creates a new {@link RegexRule}.
* @param
* sign of the regular expression.
* A <code>true</code> value means that any URL matching this rule
* must be included, whereas a <code>false</code>
* value means that any URL matching this rule must be excluded.
*
* @param sign
* of the regular expression. A <code>true</code> value means that
* any URL matching this rule must be included, whereas a
* <code>false</code> value means that any URL matching this rule
* must be excluded.
* @param regex
* is the regular expression associated to this rule.
* is the regular expression associated to this rule.
* @param hostOrDomain
* the host or domain to which this regex belongs
* the host or domain to which this regex belongs
*/
protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
protected abstract RegexRule createRule(boolean sign, String regex,
String hostOrDomain);

/**
* Returns the name of the file of rules to use for a particular
Expand All @@ -161,31 +165,32 @@ protected abstract Reader getRulesReader(Configuration conf)
public String filter(String url) {
String host = URLUtil.getHost(url);
String domain = null;

try {
domain = URLUtil.getDomainName(url);
} catch (MalformedURLException e) {
// shouldnt happen here right?
}

if (LOG.isDebugEnabled()) {
LOG.debug("URL belongs to host " + host + " and domain " + domain);
}

for (RegexRule rule : rules) {
// Skip the skip for rules that don't share the same host and domain
if (rule.hostOrDomain() != null &&
!rule.hostOrDomain().equals(host) &&
!rule.hostOrDomain().equals(domain)) {
if (rule.hostOrDomain() != null && !rule.hostOrDomain().equals(host)
&& !rule.hostOrDomain().equals(domain)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
LOG.debug("Skipping rule [" + rule.regex() + "] for host: "
+ rule.hostOrDomain());
}

continue;
}

if (LOG.isDebugEnabled()) {
LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host
+ " and domain " + domain);
}

if (rule.match(url)) {
Expand Down Expand Up @@ -250,7 +255,7 @@ private List<RegexRule> readRules(Reader reader) throws IOException,
List<RegexRule> rules = new ArrayList<RegexRule>();
String line;
String hostOrDomain = null;

while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,15 @@
import java.io.StringReader;
import java.util.regex.PatternSyntaxException;


// Hadoop imports
import org.apache.hadoop.conf.Configuration;


// Automaton imports
import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;

import org.apache.nutch.net.*;
import org.apache.nutch.urlfilter.api.RegexRule;
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
Expand Down Expand Up @@ -108,9 +111,15 @@ private class Rule extends RegexRule {
automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
}

protected boolean match(String url) {
public boolean match(String url) {
return automaton.run(url);
}

@Override
public String replace(String url, String replacement) {
// TODO Auto-generated method stub
return null;
}
}

}
37 changes: 37 additions & 0 deletions src/plugin/urlfilter-ignoreexempt-bidirectional/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="urlfilter-ignoreexempt-bidirectional" default="jar-core">

<import file="../build-plugin.xml"/>

<!-- Build compilation dependencies -->
<target name="deps-jar">
<ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
</target>

<!-- Add compilation dependencies to classpath -->
<path id="plugin.deps">
<fileset dir="${nutch.root}/build">
<include name="**/lib-regex-filter/*.jar" />
<include name="**/urlfilter-regex/*.jar" />
</fileset>
<pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
</path>


</project>
41 changes: 41 additions & 0 deletions src/plugin/urlfilter-ignoreexempt-bidirectional/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?xml version="1.0" ?>

<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<ivy-module version="1.0">
<info organisation="org.apache.nutch" module="${ant.project.name}">
<license name="Apache 2.0"/>
<ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
<description>
Apache Nutch
</description>
</info>

<configurations>
<include file="../../..//ivy/ivy-configurations.xml"/>
</configurations>

<publications>
<!--get the artifact from our module name-->
<artifact conf="master"/>
</publications>

<dependencies>
</dependencies>

</ivy-module>
Loading