Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

NUTCH-1210 Domain Blacklist Filter

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1292764 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
commit efeaf59e52ba67971558c2697cdaad4c60c55333 1 parent 6893fd2
Markus Jelsma authored
2  CHANGES.txt
... ... @@ -1,5 +1,7 @@
1 1 Nutch Change Log
2 2
  3 +* NUTCH-1210 DomainBlacklistFilter (markus)
  4 +
3 5 * NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
4 6
5 7 * NUTCH-1193 Incorrect url transform to lowercase: parameter solr (Eduardo dos Santos Leggiero via lewismc)
16 conf/domainblacklist-urlfilter.txt
... ... @@ -0,0 +1,16 @@
  1 +# Licensed to the Apache Software Foundation (ASF) under one or more
  2 +# contributor license agreements. See the NOTICE file distributed with
  3 +# this work for additional information regarding copyright ownership.
  4 +# The ASF licenses this file to You under the Apache License, Version 2.0
  5 +# (the "License"); you may not use this file except in compliance with
  6 +# the License. You may obtain a copy of the License at
  7 +#
  8 +# http://www.apache.org/licenses/LICENSE-2.0
  9 +#
  10 +# Unless required by applicable law or agreed to in writing, software
  11 +# distributed under the License is distributed on an "AS IS" BASIS,
  12 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 +# See the License for the specific language governing permissions and
  14 +# limitations under the License.
  15 +
  16 +# config file for urlfilter-domainblacklist plugin
2  src/plugin/build.xml
@@ -57,6 +57,7 @@
57 57 <ant dir="tld" target="deploy"/>
58 58 <ant dir="urlfilter-automaton" target="deploy"/>
59 59 <ant dir="urlfilter-domain" target="deploy" />
  60 + <ant dir="urlfilter-domainblacklist" target="deploy" />
60 61 <ant dir="urlfilter-prefix" target="deploy"/>
61 62 <ant dir="urlfilter-regex" target="deploy"/>
62 63 <ant dir="urlfilter-suffix" target="deploy"/>
@@ -132,6 +133,7 @@
132 133 <ant dir="tld" target="clean"/>
133 134 <ant dir="urlfilter-automaton" target="clean"/>
134 135 <ant dir="urlfilter-domain" target="clean" />
  136 + <ant dir="urlfilter-domainblacklist" target="clean" />
135 137 <ant dir="urlfilter-prefix" target="clean"/>
136 138 <ant dir="urlfilter-regex" target="clean"/>
137 139 <ant dir="urlfilter-suffix" target="clean"/>
28 src/plugin/urlfilter-domainblacklist/build.xml
... ... @@ -0,0 +1,28 @@
  1 +<?xml version="1.0"?>
  2 +<!--
  3 + Licensed to the Apache Software Foundation (ASF) under one or more
  4 + contributor license agreements. See the NOTICE file distributed with
  5 + this work for additional information regarding copyright ownership.
  6 + The ASF licenses this file to You under the Apache License, Version 2.0
  7 + (the "License"); you may not use this file except in compliance with
  8 + the License. You may obtain a copy of the License at
  9 +
  10 + http://www.apache.org/licenses/LICENSE-2.0
  11 +
  12 + Unless required by applicable law or agreed to in writing, software
  13 + distributed under the License is distributed on an "AS IS" BASIS,
  14 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 + See the License for the specific language governing permissions and
  16 + limitations under the License.
  17 +-->
  18 +<project name="urlfilter-domainblacklist" default="jar-core">
  19 +
  20 + <import file="../build-plugin.xml"/>
  21 +
  22 + <!-- for junit test -->
  23 + <mkdir dir="${build.test}/data"/>
  24 + <copy todir="${build.test}/data">
  25 + <fileset dir="data" />
  26 + </copy>
  27 +
  28 +</project>
5 src/plugin/urlfilter-domainblacklist/data/hosts.txt
... ... @@ -0,0 +1,5 @@
  1 +# comments start with the pound sign
  2 +net
  3 +apache.org
  4 +be
  5 +www.yahoo.com
41 src/plugin/urlfilter-domainblacklist/ivy.xml
... ... @@ -0,0 +1,41 @@
  1 +<?xml version="1.0" ?>
  2 +
  3 +<!--
  4 + Licensed to the Apache Software Foundation (ASF) under one or more
  5 + contributor license agreements. See the NOTICE file distributed with
  6 + this work for additional information regarding copyright ownership.
  7 + The ASF licenses this file to You under the Apache License, Version 2.0
  8 + (the "License"); you may not use this file except in compliance with
  9 + the License. You may obtain a copy of the License at
  10 +
  11 + http://www.apache.org/licenses/LICENSE-2.0
  12 +
  13 + Unless required by applicable law or agreed to in writing, software
  14 + distributed under the License is distributed on an "AS IS" BASIS,
  15 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16 + See the License for the specific language governing permissions and
  17 + limitations under the License.
  18 +-->
  19 +
  20 +<ivy-module version="1.0">
  21 + <info organisation="org.apache.nutch" module="${ant.project.name}">
  22 + <license name="Apache 2.0"/>
  23 + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
  24 + <description>
  25 + Apache Nutch
  26 + </description>
  27 + </info>
  28 +
  29 + <configurations>
  30 + <include file="../../../ivy/ivy-configurations.xml"/>
  31 + </configurations>
  32 +
  33 + <publications>
  34 + <!--get the artifact from our module name-->
  35 + <artifact conf="master"/>
  36 + </publications>
  37 +
  38 + <dependencies>
  39 + </dependencies>
  40 +
  41 +</ivy-module>
43 src/plugin/urlfilter-domainblacklist/plugin.xml
... ... @@ -0,0 +1,43 @@
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<!--
  3 + Licensed to the Apache Software Foundation (ASF) under one or more
  4 + contributor license agreements. See the NOTICE file distributed with
  5 + this work for additional information regarding copyright ownership.
  6 + The ASF licenses this file to You under the Apache License, Version 2.0
  7 + (the "License"); you may not use this file except in compliance with
  8 + the License. You may obtain a copy of the License at
  9 +
  10 + http://www.apache.org/licenses/LICENSE-2.0
  11 +
  12 + Unless required by applicable law or agreed to in writing, software
  13 + distributed under the License is distributed on an "AS IS" BASIS,
  14 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 + See the License for the specific language governing permissions and
  16 + limitations under the License.
  17 +-->
  18 +<plugin
  19 + id="urlfilter-domainblacklist"
  20 + name="Domain Blacklist URL Filter"
  21 + version="1.0.0"
  22 + provider-name="nutch.org">
  23 +
  24 + <runtime>
  25 + <library name="urlfilter-domainblacklist.jar">
  26 + <export name="*"/>
  27 + </library>
  28 + </runtime>
  29 +
  30 + <requires>
  31 + <import plugin="nutch-extensionpoints"/>
  32 + </requires>
  33 +
  34 + <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
  35 + name="Nutch Domain Blacklist URL Filter"
  36 + point="org.apache.nutch.net.URLFilter">
  37 + <implementation id="DomainBlacklistURLFilter"
  38 + class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
  39 + <parameter name="file" value="domainblacklist-urlfilter.txt"/>
  40 + </implementation>
  41 + </extension>
  42 +
  43 +</plugin>
203 ...filter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
... ... @@ -0,0 +1,203 @@
  1 +/*
  2 + * Licensed to the Apache Software Foundation (ASF) under one or more
  3 + * contributor license agreements. See the NOTICE file distributed with
  4 + * this work for additional information regarding copyright ownership.
  5 + * The ASF licenses this file to You under the Apache License, Version 2.0
  6 + * (the "License"); you may not use this file except in compliance with
  7 + * the License. You may obtain a copy of the License at
  8 + *
  9 + * http://www.apache.org/licenses/LICENSE-2.0
  10 + *
  11 + * Unless required by applicable law or agreed to in writing, software
  12 + * distributed under the License is distributed on an "AS IS" BASIS,
  13 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 + * See the License for the specific language governing permissions and
  15 + * limitations under the License.
  16 + */
  17 +package org.apache.nutch.urlfilter.domainblacklist;
  18 +
  19 +import java.io.BufferedReader;
  20 +import java.io.FileReader;
  21 +import java.io.IOException;
  22 +import java.io.Reader;
  23 +import java.io.StringReader;
  24 +import java.util.LinkedHashSet;
  25 +import java.util.Set;
  26 +
  27 +import org.apache.commons.lang.StringUtils;
  28 +import org.slf4j.Logger;
  29 +import org.slf4j.LoggerFactory;
  30 +import org.apache.hadoop.conf.Configuration;
  31 +import org.apache.nutch.net.URLFilter;
  32 +import org.apache.nutch.plugin.Extension;
  33 +import org.apache.nutch.plugin.PluginRepository;
  34 +import org.apache.nutch.util.URLUtil;
  35 +import org.apache.nutch.util.domain.DomainSuffix;
  36 +
  37 +/**
  38 + * <p>Filters URLs based on a file containing domain suffixes, domain names, and
  39 + * hostnames. A url that matches one of the suffixes, domains, or hosts
  40 + * present in the file is filtered out.</p>
  41 + *
  42 + * <p>Urls are checked in order of domain suffix, domain name, and hostname
  43 + * against entries in the domain file. The domain file would be set up as follows
  44 + * with one entry per line:</p>
  45 + *
  46 + * <pre> com apache.org www.apache.org </pre>
  47 + *
  48 + * <p>The first line is an example of a filter that would block all .com
  49 + * domains. The second line blocks all urls from apache.org and all of its
  50 + * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
  51 + * would block only urls from www.apache.org. There is no specific ordering to
  52 + * entries. The entries are from more general to more specific with the more
  53 + * general overriding the more specific.</p>
  54 + *
  55 + * The domain file defaults to domainblacklist-urlfilter.txt in the classpath but can be
  56 + * overridden using the:
  57 + *
  58 + * <ul> <li>property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</li>
  59 + * <li>attribute "file" in plugin.xml of this plugin</li> </ul>
  60 + *
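  61 + * The attribute "file" has higher precedence than the property if defined; a file passed to the constructor overrides both, and rules given inline in the property "urlfilter.domainblacklist.rules" take precedence over any file.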
  62 + */
  63 +public class DomainBlacklistURLFilter
  64 + implements URLFilter {
  65 +
  66 + private static final Logger LOG = LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
  67 +
  68 + // read in attribute "file" of this plugin.
  69 + private static String attributeFile = null;
  70 + private Configuration conf;
  71 + private String domainFile = null;
  72 + private Set<String> domainSet = new LinkedHashSet<String>();
  73 +
  74 + private void readConfiguration(Reader configReader)
  75 + throws IOException {
  76 +
  77 + // read the configuration file, line by line
  78 + BufferedReader reader = new BufferedReader(configReader);
  79 + String line = null;
  80 + while ((line = reader.readLine()) != null) {
  81 + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
  82 + // add non-blank lines and non-commented lines
  83 + domainSet.add(StringUtils.lowerCase(line));
  84 + }
  85 + }
  86 + }
  87 +
  88 + /**
  89 + * Default constructor.
  90 + */
  91 + public DomainBlacklistURLFilter() {
  92 +
  93 + }
  94 +
  95 + /**
  96 + * Constructor that specifies the domain file to use.
  97 + *
  98 + * @param domainFile The domain file, overrides domainblacklist-urlfilter.txt default.
  99 + *
  100 + * @throws IOException
  101 + */
  102 + public DomainBlacklistURLFilter(String domainFile) {
  103 + this.domainFile = domainFile;
  104 + }
  105 +
  106 + /**
  107 + * Sets the configuration.
  108 + */
  109 + public void setConf(Configuration conf) {
  110 + this.conf = conf;
  111 +
  112 + // get the extensions for domain urlfilter
  113 + String pluginName = "urlfilter-domainblacklist";
  114 + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
  115 + URLFilter.class.getName()).getExtensions();
  116 + for (int i = 0; i < extensions.length; i++) {
  117 + Extension extension = extensions[i];
  118 + if (extension.getDescriptor().getPluginId().equals(pluginName)) {
  119 + attributeFile = extension.getAttribute("file");
  120 + break;
  121 + }
  122 + }
  123 +
  124 + // handle blank non empty input
  125 + if (attributeFile != null && attributeFile.trim().equals("")) {
  126 + attributeFile = null;
  127 + }
  128 +
  129 + if (attributeFile != null) {
  130 + if (LOG.isInfoEnabled()) {
  131 + LOG.info("Attribute \"file\" is defined for plugin " + pluginName
  132 + + " as " + attributeFile);
  133 + }
  134 + }
  135 + else {
  136 + if (LOG.isWarnEnabled()) {
  137 + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
  138 + + pluginName);
  139 + }
  140 + }
  141 +
  142 + // domain file and attribute "file" take precedence if defined
  143 + String file = conf.get("urlfilter.domainblacklist.file");
  144 + String stringRules = conf.get("urlfilter.domainblacklist.rules");
  145 + if (domainFile != null) {
  146 + file = domainFile;
  147 + }
  148 + else if (attributeFile != null) {
  149 + file = attributeFile;
  150 + }
  151 + Reader reader = null;
  152 + if (stringRules != null) { // takes precedence over files
  153 + reader = new StringReader(stringRules);
  154 + } else {
  155 + reader = conf.getConfResourceAsReader(file);
  156 + }
  157 + try {
  158 + if (reader == null) {
  159 + reader = new FileReader(file);
  160 + }
  161 + readConfiguration(reader);
  162 + }
  163 + catch (IOException e) {
  164 + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
  165 + }
  166 + }
  167 +
  168 + public Configuration getConf() {
  169 + return this.conf;
  170 + }
  171 +
  172 + public String filter(String url) {
  173 +
  174 + try {
  175 +
  176 + // match for suffix, domain, and host in that order. more general will
  177 + // override more specific
  178 + String domain = URLUtil.getDomainName(url).toLowerCase().trim();
  179 + String host = URLUtil.getHost(url);
  180 + String suffix = null;
  181 + DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
  182 + if (domainSuffix != null) {
  183 + suffix = domainSuffix.getDomain();
  184 + }
  185 +
  186 + if (domainSet.contains(suffix) || domainSet.contains(domain)
  187 + || domainSet.contains(host)) {
  188 + // Matches, filter!
  189 + return null;
  190 + }
  191 +
  192 + // doesn't match, allow
  193 + return url;
  194 + }
  195 + catch (Exception e) {
  196 +
  197 + // NOTE: if an error happens the url is rejected - null is returned, the same value used for a blacklist match
  198 + LOG.error("Could not apply filter on url: " + url + "\n"
  199 + + org.apache.hadoop.util.StringUtils.stringifyException(e));
  200 + return null;
  201 + }
  202 + }
  203 +}
57 ...er-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
... ... @@ -0,0 +1,57 @@
  1 +/*
  2 + * Licensed to the Apache Software Foundation (ASF) under one or more
  3 + * contributor license agreements. See the NOTICE file distributed with
  4 + * this work for additional information regarding copyright ownership.
  5 + * The ASF licenses this file to You under the Apache License, Version 2.0
  6 + * (the "License"); you may not use this file except in compliance with
  7 + * the License. You may obtain a copy of the License at
  8 + *
  9 + * http://www.apache.org/licenses/LICENSE-2.0
  10 + *
  11 + * Unless required by applicable law or agreed to in writing, software
  12 + * distributed under the License is distributed on an "AS IS" BASIS,
  13 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 + * See the License for the specific language governing permissions and
  15 + * limitations under the License.
  16 + */
  17 +package org.apache.nutch.urlfilter.domainblacklist;
  18 +
  19 +import junit.framework.TestCase;
  20 +
  21 +import org.slf4j.Logger;
  22 +import org.slf4j.LoggerFactory;
  23 +import org.apache.hadoop.conf.Configuration;
  24 +import org.apache.nutch.util.NutchConfiguration;
  25 +
  26 +public class TestDomainBlacklistURLFilter
  27 + extends TestCase {
  28 +
  29 + protected static final Logger LOG = LoggerFactory.getLogger(TestDomainBlacklistURLFilter.class);
  30 +
  31 + private final static String SEPARATOR = System.getProperty("file.separator");
  32 + private final static String SAMPLES = System.getProperty("test.data", ".");
  33 +
  34 + public TestDomainBlacklistURLFilter(String testName) {
  35 + super(testName);
  36 + }
  37 +
  38 + public void testFilter()
  39 + throws Exception {
  40 +
  41 + String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
  42 + Configuration conf = NutchConfiguration.create();
  43 + DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(domainBlacklistFile);
  44 + domainBlacklistFilter.setConf(conf);
  45 + assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
  46 + assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
  47 + assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
  48 + assertNotNull(domainBlacklistFilter.filter("http://www.google.com"));
  49 + assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
  50 + assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
  51 + assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
  52 + assertNull(domainBlacklistFilter.filter("http://www.yahoo.com"));
  53 + assertNull(domainBlacklistFilter.filter("http://www.foobar.be"));
  54 + assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com"));
  55 + }
  56 +
  57 +}

0 comments on commit efeaf59

Please sign in to comment.
Something went wrong with that request. Please try again.