From 3165c4d485d555f7729503dd7cae377f90fab949 Mon Sep 17 00:00:00 2001 From: Jake Dodd Date: Wed, 17 Dec 2014 08:45:50 -0800 Subject: [PATCH 1/2] Updated robots rules caching + a RobotsURLFilter This commit adds an interface for robots caches, an in-memory, thread-safe cache implementation, and a basic URLFilter for robots rules. --- .../crawler/filtering/RobotsURLFilter.java | 79 +++++++++++++++++++ .../crawler/protocol/MemoryRobotsCache.java | 57 +++++++++++++ .../crawler/protocol/RobotRulesParser.java | 2 +- .../storm/crawler/protocol/RobotsCache.java | 40 ++++++++++ .../protocol/http/HttpRobotRulesParser.java | 65 ++++++++------- src/main/resources/urlfilters.json | 5 ++ 6 files changed, 219 insertions(+), 29 deletions(-) create mode 100644 src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java create mode 100644 src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java create mode 100644 src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java diff --git a/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java new file mode 100644 index 000000000..cfdce02a8 --- /dev/null +++ b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java @@ -0,0 +1,79 @@ +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * DigitalPebble licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.digitalpebble.storm.crawler.filtering; + +import com.digitalpebble.storm.crawler.protocol.MemoryRobotsCache; +import com.digitalpebble.storm.crawler.protocol.RobotsCache; +import com.fasterxml.jackson.databind.JsonNode; +import crawlercommons.robots.BaseRobotRules; + +import java.net.MalformedURLException; +import java.net.URL; + +/** + * This {@link com.digitalpebble.storm.crawler.filtering.URLFilter} filters outlinks + * using the robots rules for the domain. If the rules for the domain aren't found + * in the cache, the outlink will pass the filter. + */ +public class RobotsURLFilter implements URLFilter { + + private RobotsCache cache; + + public String filter(String URL) { + try { + URL url = new URL(URL); + String key = getCacheKey(url); + BaseRobotRules rules = cache.get(key); + // If we have a cache miss, return the URL + if (rules == null) + return URL; + if (rules.isAllowed(URL)) + return URL; + else + return null; + } catch (MalformedURLException e) { + return null; + } + } + + public void configure(JsonNode paramNode) { + //TODO Specify the cache in the config + this.cache = MemoryRobotsCache.getInstance(); + } + + /** + * Compose unique key to store and access robot rules in cache for given URL + */ + private static String getCacheKey(URL url) { + // TODO This method is a direct port from HttpRobotsRulesParser. 
We should consolidate + + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case + int port = url.getPort(); + if (port == -1) { + port = url.getDefaultPort(); + } + /* + * Robot rules apply only to host, protocol, and port where robots.txt + * is hosted (cf. NUTCH-1752). Consequently + */ + String cacheKey = protocol + ":" + host + ":" + port; + return cacheKey; + } +} diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java new file mode 100644 index 000000000..7b19cbcd6 --- /dev/null +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java @@ -0,0 +1,57 @@ +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * DigitalPebble licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.digitalpebble.storm.crawler.protocol; + +import org.apache.storm.guava.cache.CacheBuilder; +import org.apache.storm.guava.cache.CacheLoader; +import org.apache.storm.guava.cache.LoadingCache; +import crawlercommons.robots.BaseRobotRules; + +/** + * Provides an in-memory, singleton, thread-safe cache for robots rules. 
+ */ +public class MemoryRobotsCache implements RobotsCache { + + private static final long MAX_SIZE = 1000; + + private static final MemoryRobotsCache INSTANCE = new MemoryRobotsCache(); + + private static LoadingCache CACHE; + + public static MemoryRobotsCache getInstance() { + return INSTANCE; + } + + private MemoryRobotsCache() { + CACHE = CacheBuilder.newBuilder().maximumSize(MAX_SIZE).build( new CacheLoader() { + @Override + public BaseRobotRules load(String s) throws Exception { + return null; + } + }); + } + + public BaseRobotRules get(String key) { + return CACHE.getIfPresent(key); + } + + public void put(String key, BaseRobotRules rules) { + CACHE.put(key, rules); + } + +} diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java index 2b35113e1..076835e45 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java @@ -46,7 +46,7 @@ public abstract class RobotRulesParser { public static final Logger LOG = LoggerFactory .getLogger(RobotRulesParser.class); - protected static final Hashtable CACHE = new Hashtable(); + protected RobotsCache cache; /** * A {@link BaseRobotRules} object appropriate for use when the diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java new file mode 100644 index 000000000..5789a72f7 --- /dev/null +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java @@ -0,0 +1,40 @@ +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * DigitalPebble licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.digitalpebble.storm.crawler.protocol; + +import crawlercommons.robots.BaseRobotRules; + +/** + * This interface defines the methods that must be implemented by a cache for Robots rules. + */ +public interface RobotsCache { + + /** + * + * @param key Cache key + * @return Returns the robots rules for the key, or null if there's a cache miss. + */ + public BaseRobotRules get(String key); + + /** + * + * @param key Cache key + * @param rules Robots rules to associate with the key + */ + public void put(String key, BaseRobotRules rules); +} diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java index c8c587fde..5694f0387 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java @@ -17,21 +17,16 @@ package com.digitalpebble.storm.crawler.protocol.http; -import java.net.URL; -import java.util.Collections; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import backtype.storm.Config; - -import com.digitalpebble.storm.crawler.protocol.Protocol; -import com.digitalpebble.storm.crawler.protocol.ProtocolResponse; -import com.digitalpebble.storm.crawler.protocol.RobotRulesParser; +import 
com.digitalpebble.storm.crawler.protocol.*; import com.digitalpebble.storm.crawler.util.ConfUtils; import com.digitalpebble.storm.crawler.util.KeyValues; - import crawlercommons.robots.BaseRobotRules; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URL; +import java.util.Collections; /** * This class is used for parsing robots for urls belonging to HTTP protocol. It @@ -44,17 +39,28 @@ public class HttpRobotRulesParser extends RobotRulesParser { .getLogger(HttpRobotRulesParser.class); protected boolean allowForbidden = false; - HttpRobotRulesParser() { - } + private RobotsCache cache; + /** + * @param conf The topology {@link backtype.storm.Config}. + * Default constructor uses an in-memory robots rules cache, + * {@link com.digitalpebble.storm.crawler.protocol.MemoryRobotsCache} + */ public HttpRobotRulesParser(Config conf) { + this.cache = MemoryRobotsCache.getInstance(); setConf(conf); } - public void setConf(Config conf) { - super.setConf(conf); - allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow", - true); + /** + * + * @param conf The topology {@link backtype.storm.Config}. + * @param cache The {@link com.digitalpebble.storm.crawler.protocol.RobotsCache} + * to use for the parser. + */ + + public HttpRobotRulesParser(Config conf, RobotsCache cache) { + this.cache = cache; + setConf(conf); } /** @@ -62,7 +68,7 @@ public void setConf(Config conf) { */ protected static String getCacheKey(URL url) { String protocol = url.getProtocol().toLowerCase(); // normalize to lower - // case + // case String host = url.getHost().toLowerCase(); // normalize to lower case int port = url.getPort(); if (port == -1) { @@ -76,6 +82,12 @@ protected static String getCacheKey(URL url) { return cacheKey; } + public void setConf(Config conf) { + super.setConf(conf); + allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow", + true); + } + /** * Get the rules from robots.txt which applies for the given {@code url}. 
* Robot rules are cached for a unique combination of host, protocol, and @@ -83,17 +95,14 @@ protected static String getCacheKey(URL url) { * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and * the rules are cached to avoid re-fetching and re-parsing it again. * - * @param http - * The {@link Protocol} object - * @param url - * URL robots.txt applies to - * + * @param http The {@link Protocol} object + * @param url URL robots.txt applies to * @return {@link BaseRobotRules} holding the rules from robots.txt */ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { String cacheKey = getCacheKey(url); - BaseRobotRules robotRules = CACHE.get(cacheKey); + BaseRobotRules robotRules = cache.get(cacheKey); boolean cacheRule = true; @@ -105,7 +114,7 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { try { ProtocolResponse response = http.getProtocolOutput(new URL(url, "/robots.txt").toString(), Collections - . emptyMap()); + .emptyMap()); // try one level of redirection ? if (response.getStatusCode() == 301 @@ -127,7 +136,7 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { redir = new URL(redirection); } response = http.getProtocolOutput(redir.toString(), - Collections. 
emptyMap()); + Collections.emptyMap()); } } @@ -155,11 +164,11 @@ else if (response.getStatusCode() >= 500) { } if (cacheRule) { - CACHE.put(cacheKey, robotRules); // cache rules for host + cache.put(cacheKey, robotRules); // cache rules for host if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { // cache also for the redirected host - CACHE.put(getCacheKey(redir), robotRules); + cache.put(getCacheKey(redir), robotRules); } } } diff --git a/src/main/resources/urlfilters.json b/src/main/resources/urlfilters.json index 670dae37d..7a5e1fc7a 100644 --- a/src/main/resources/urlfilters.json +++ b/src/main/resources/urlfilters.json @@ -20,6 +20,11 @@ "params": { "regexFilterFile": "default-regex-filters.txt" } + }, + { + "class": "com.digitalpebble.storm.crawler.filtering.RobotsURLFilter", + "name": "RobotsURLFilter", + "params": {} } ] From 06cfc082242ea9d10d49c41b2f78b3ad820aa479 Mon Sep 17 00:00:00 2001 From: Jake Dodd Date: Fri, 19 Dec 2014 08:09:34 -0800 Subject: [PATCH 2/2] ParserBolt conditionally fetches robots.txt --- .../storm/crawler/bolt/ParserBolt.java | 19 ++++++++++++ .../crawler/filtering/RobotsURLFilter.java | 22 +------------ .../crawler/protocol/MemoryRobotsCache.java | 31 ++++++++++++++----- .../storm/crawler/protocol/RobotsCache.java | 4 +++ .../protocol/http/HttpRobotRulesParser.java | 24 ++------------ 5 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java b/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java index 5be36ba99..35348a3d6 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java +++ b/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java @@ -27,6 +27,9 @@ import java.util.Map; import java.util.Set; +import backtype.storm.Config; +import com.digitalpebble.storm.crawler.protocol.Protocol; +import com.digitalpebble.storm.crawler.protocol.ProtocolFactory; import 
org.apache.commons.lang.StringUtils; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.tika.Tika; @@ -89,9 +92,14 @@ public class ParserBolt extends BaseRichBolt { private boolean upperCaseElementNames = true; private Class HTMLMapperClass = IdentityHtmlMapper.class; + private ProtocolFactory protocolFactory; + public void prepare(Map conf, TopologyContext context, OutputCollector collector) { + Config config = new Config(); + config.putAll(conf); + String urlconfigfile = ConfUtils.getString(conf, "urlfilters.config.file", "urlfilters.json"); @@ -150,6 +158,8 @@ public void prepare(Map conf, TopologyContext context, LOG.debug("Tika loaded in " + (end - start) + " msec"); + this.protocolFactory = new ProtocolFactory(config); + this.collector = collector; this.eventMeters = context.registerMetric("parser-meter", @@ -264,6 +274,15 @@ public void execute(Tuple tuple) { List links = linkHandler.getLinks(); Set slinks = new HashSet(links.size()); + + Protocol protocol = protocolFactory.getProtocol(url_); + + // TODO This is a method call with non-explicit side effects...yuck + // Calling getRobotRules will seed the cache with the rules for this + // URL, if not already present, ensuring they'll be available downstream + // to the robots url filter. There's got to be a better way to do this.
+ protocol.getRobotRules(url); + for (Link l : links) { if (StringUtils.isBlank(l.getUri())) continue; diff --git a/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java index cfdce02a8..d79b782f6 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java +++ b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java @@ -37,7 +37,7 @@ public class RobotsURLFilter implements URLFilter { public String filter(String URL) { try { URL url = new URL(URL); - String key = getCacheKey(url); + String key = cache.getCacheKey(url); BaseRobotRules rules = cache.get(key); // If we have a cache miss, return the URL if (rules == null) @@ -56,24 +56,4 @@ public void configure(JsonNode paramNode) { this.cache = MemoryRobotsCache.getInstance(); } - /** - * Compose unique key to store and access robot rules in cache for given URL - */ - private static String getCacheKey(URL url) { - // TODO This method is a direct port from HttpRobotsRulesParser. We should consolidate - - String protocol = url.getProtocol().toLowerCase(); // normalize to lower - // case - String host = url.getHost().toLowerCase(); // normalize to lower case - int port = url.getPort(); - if (port == -1) { - port = url.getDefaultPort(); - } - /* - * Robot rules apply only to host, protocol, and port where robots.txt - * is hosted (cf. NUTCH-1752). 
Consequently - */ - String cacheKey = protocol + ":" + host + ":" + port; - return cacheKey; - } } diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java index 7b19cbcd6..32bd5de26 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java @@ -17,11 +17,14 @@ package com.digitalpebble.storm.crawler.protocol; +import org.apache.storm.guava.cache.Cache; import org.apache.storm.guava.cache.CacheBuilder; import org.apache.storm.guava.cache.CacheLoader; import org.apache.storm.guava.cache.LoadingCache; import crawlercommons.robots.BaseRobotRules; +import java.net.URL; + /** * Provides an in-memory, singleton, thread-safe cache for robots rules. */ @@ -31,19 +34,14 @@ public class MemoryRobotsCache implements RobotsCache { private static final MemoryRobotsCache INSTANCE = new MemoryRobotsCache(); - private static LoadingCache CACHE; + private static Cache CACHE; public static MemoryRobotsCache getInstance() { return INSTANCE; } private MemoryRobotsCache() { - CACHE = CacheBuilder.newBuilder().maximumSize(MAX_SIZE).build( new CacheLoader() { - @Override - public BaseRobotRules load(String s) throws Exception { - return null; - } - }); + CACHE = CacheBuilder.newBuilder().maximumSize(MAX_SIZE).build(); } public BaseRobotRules get(String key) { @@ -54,4 +52,23 @@ public void put(String key, BaseRobotRules rules) { CACHE.put(key, rules); } + /** + * Compose unique key to store and access robot rules in cache for given URL + */ + public String getCacheKey(URL url) { + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case + int port = url.getPort(); + if (port == -1) { + port = url.getDefaultPort(); + } + /* + * Robot rules apply only to host, 
protocol, and port where robots.txt + * is hosted (cf. NUTCH-1752). Consequently, the cache key combines all three. + */ + String cacheKey = protocol + ":" + host + ":" + port; + return cacheKey; + } + } diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java index 5789a72f7..39a70c186 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java @@ -19,6 +19,8 @@ import crawlercommons.robots.BaseRobotRules; +import java.net.URL; + /** * This interface defines the methods that must be implemented by a cache for Robots rules. */ @@ -37,4 +39,6 @@ public interface RobotsCache { * @param rules Robots rules to associate with the key */ public void put(String key, BaseRobotRules rules); + + public String getCacheKey(URL url); } diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java index 5694f0387..3452ec51f 100644 --- a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java +++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java @@ -63,25 +63,6 @@ public HttpRobotRulesParser(Config conf, RobotsCache cache) { setConf(conf); } - /** - * Compose unique key to store and access robot rules in cache for given URL - */ - protected static String getCacheKey(URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower - // case - String host = url.getHost().toLowerCase(); // normalize to lower case - int port = url.getPort(); - if (port == -1) { - port = url.getDefaultPort(); - } - /* - * Robot rules apply only to host, protocol, and port where robots.txt - * is hosted (cf. NUTCH-1752).
Consequently - */ - String cacheKey = protocol + ":" + host + ":" + port; - return cacheKey; - } - public void setConf(Config conf) { super.setConf(conf); allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow", @@ -101,7 +82,7 @@ public void setConf(Config conf) { */ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { - String cacheKey = getCacheKey(url); + String cacheKey = cache.getCacheKey(url); BaseRobotRules robotRules = cache.get(cacheKey); boolean cacheRule = true; @@ -168,7 +149,8 @@ else if (response.getStatusCode() >= 500) { if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { // cache also for the redirected host - cache.put(getCacheKey(redir), robotRules); + String redirKey = cache.getCacheKey(redir); + cache.put(redirKey, robotRules); } } }