Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

NUTCH-1210 Domain Blacklist Filter

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1292764 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
commit efeaf59e52ba67971558c2697cdaad4c60c55333 1 parent 6893fd2
Markus Jelsma authored
View
2  CHANGES.txt
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1210 DomainBlacklistFilter (markus)
+
* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
* NUTCH-1193 Incorrect url transform to lowercase: parameter solr (Eduardo dos Santos Leggiero via lewismc)
View
16 conf/domainblacklist-urlfilter.txt
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# config file for urlfilter-domainblacklist plugin
View
2  src/plugin/build.xml
@@ -57,6 +57,7 @@
<ant dir="tld" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
<ant dir="urlfilter-domain" target="deploy" />
+ <ant dir="urlfilter-domainblacklist" target="deploy" />
<ant dir="urlfilter-prefix" target="deploy"/>
<ant dir="urlfilter-regex" target="deploy"/>
<ant dir="urlfilter-suffix" target="deploy"/>
@@ -132,6 +133,7 @@
<ant dir="tld" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
<ant dir="urlfilter-domain" target="clean" />
+ <ant dir="urlfilter-domainblacklist" target="clean" />
<ant dir="urlfilter-prefix" target="clean"/>
<ant dir="urlfilter-regex" target="clean"/>
<ant dir="urlfilter-suffix" target="clean"/>
View
28 src/plugin/urlfilter-domainblacklist/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domainblacklist" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="data" />
+ </copy>
+
+</project>
View
5 src/plugin/urlfilter-domainblacklist/data/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
View
41 src/plugin/urlfilter-domainblacklist/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
View
43 src/plugin/urlfilter-domainblacklist/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="urlfilter-domainblacklist"
+ name="Domain Blacklist URL Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="urlfilter-domainblacklist.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
+ name="Nutch Domain Blacklist URL Filter"
+ point="org.apache.nutch.net.URLFilter">
+ <implementation id="DomainBlacklistURLFilter"
+ class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
+ <parameter name="file" value="domainblacklist-urlfilter.txt"/>
+ </implementation>
+ </extension>
+
+</plugin>
View
203 ...acklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts
+ * present in the file is filtered out.</p>
+ *
+ * <p>Urls are checked in order of domain suffix, domain name, and hostname
+ * against entries in the domain file. The domain file would be setup as follows
+ * with one entry per line:</p>
+ *
+ * <pre>
+ * com
+ * apache.org
+ * www.apache.org
+ * </pre>
+ *
+ * <p>The first line is an example of an entry that would reject all .com
+ * domains. The second line rejects all urls from apache.org and all of its
+ * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
+ * would reject only urls from www.apache.org. There is no specific ordering to
+ * entries: a url is rejected as soon as any one of its suffix, domain, or host
+ * is listed, so a more general entry overrides a more specific one. Blank
+ * lines and lines starting with '#' are ignored; entries are trimmed and
+ * lower-cased when they are read.</p>
+ *
+ * <p>The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using:</p>
+ *
+ * <ul>
+ * <li>the property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</li>
+ * <li>the attribute "file" in plugin.xml of this plugin.</li>
+ * </ul>
+ *
+ * <p>The attribute "file" has higher precedence than the property if defined,
+ * and a file name passed to the constructor has higher precedence than both.
+ * If the property "urlfilter.domainblacklist.rules" is set, its value is read
+ * as the rules and no file is consulted at all.</p>
+ */
+public class DomainBlacklistURLFilter
+  implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
+
+  // value of the attribute "file" of this plugin, read in setConf; the field
+  // is static and therefore shared by every instance of this filter
+  private static String attributeFile = null;
+  private Configuration conf;
+  // file name given to the constructor, or null when none was given
+  private String domainFile = null;
+  // lower-cased suffixes, domains, and hosts that must be rejected
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  /**
+   * Reads blacklist entries, one per line, and adds them to the domain set.
+   * The reader is not closed here; closing it is left to the caller.
+   *
+   * @param configReader The source of the blacklist entries.
+   *
+   * @throws IOException If a line cannot be read.
+   */
+  private void readConfiguration(Reader configReader)
+    throws IOException {
+
+    // read the configuration file, line by line
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      // trim first: surrounding whitespace would otherwise become part of the
+      // entry and it could never equal a suffix, domain, or host of a url
+      String entry = line.trim();
+      if (StringUtils.isNotBlank(entry) && !entry.startsWith("#")) {
+        // add non-blank lines and non-commented lines
+        domainSet.add(StringUtils.lowerCase(entry));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainBlacklistURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   *
+   * @param domainFile The domain file, overrides domainblacklist-urlfilter.txt default.
+   */
+  public DomainBlacklistURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration and loads the blacklist entries. Read failures are
+   * logged and not rethrown; the domain set then keeps whatever entries were
+   * added before the failure.
+   *
+   * @param conf The configuration to read the plugin settings from.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domainblacklist";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domainblacklist.file");
+    String stringRules = conf.get("urlfilter.domainblacklist.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    // nothing to load from: report it here instead of handing a null file
+    // name to the resource lookup and to FileReader below
+    if (stringRules == null && file == null) {
+      LOG.error("No domain blacklist rules or file configured for plugin "
+        + pluginName);
+      return;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      }
+      else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          // not found as a configuration resource, open it as a plain file
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+    finally {
+      // always release the underlying file or resource stream
+      if (reader != null) {
+        try {
+          reader.close();
+        }
+        catch (IOException e) {
+          LOG.warn("Could not close domain blacklist reader: "
+            + org.apache.hadoop.util.StringUtils.stringifyException(e));
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the configuration passed to the last call of setConf.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Checks a url against the blacklist.
+   *
+   * @param url The url to check.
+   *
+   * @return The url unchanged if neither its suffix, domain, nor host is
+   * blacklisted; null if one of them is, or if the url could not be examined.
+   */
+  public String filter(String url) {
+
+    try {
+
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+        || domainSet.contains(host)) {
+        // Matches, filter!
+        return null;
+      }
+
+      // doesn't match, allow
+      return url;
+    }
+    catch (Exception e) {
+
+      // if an error happens the url is rejected: returning null filters it out
+      LOG.error("Could not apply filter on url: " + url + "\n"
+        + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}
View
57 ...ist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import junit.framework.TestCase;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Runs the domain blacklist filter against the sample hosts.txt file and
+ * checks which urls it rejects and which it lets through.
+ */
+public class TestDomainBlacklistURLFilter
+  extends TestCase {
+
+  protected static final Logger LOG = LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  // directory holding the test data, taken from the "test.data" system property
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  public TestDomainBlacklistURLFilter(String testName) {
+    super(testName);
+  }
+
+  /** Asserts that the filter rejects the url, i.e. returns null for it. */
+  private static void assertRejected(DomainBlacklistURLFilter filter, String url) {
+    assertNull(filter.filter(url));
+  }
+
+  /** Asserts that the filter lets the url through, i.e. returns non-null. */
+  private static void assertAccepted(DomainBlacklistURLFilter filter, String url) {
+    assertNotNull(filter.filter(url));
+  }
+
+  public void testFilter()
+    throws Exception {
+
+    String blacklistPath = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration nutchConf = NutchConfiguration.create();
+    DomainBlacklistURLFilter filter = new DomainBlacklistURLFilter(blacklistPath);
+    filter.setConf(nutchConf);
+
+    // apache.org is listed as a domain, so every host under it is rejected
+    assertRejected(filter, "http://lucene.apache.org");
+    assertRejected(filter, "http://hadoop.apache.org");
+    assertRejected(filter, "http://www.apache.org");
+    // com is not listed as a suffix and only the host www.yahoo.com is listed
+    assertAccepted(filter, "http://www.google.com");
+    assertAccepted(filter, "http://mail.yahoo.com");
+    // net is listed as a suffix
+    assertRejected(filter, "http://www.foobar.net");
+    assertRejected(filter, "http://www.foobas.net");
+    // listed as an exact host
+    assertRejected(filter, "http://www.yahoo.com");
+    // be is listed as a suffix
+    assertRejected(filter, "http://www.foobar.be");
+    assertAccepted(filter, "http://www.adobe.com");
+  }
+
+}
Please sign in to comment.
Something went wrong with that request. Please try again.