From 3165c4d485d555f7729503dd7cae377f90fab949 Mon Sep 17 00:00:00 2001
From: Jake Dodd <jake@ontopic.io>
Date: Wed, 17 Dec 2014 08:45:50 -0800
Subject: [PATCH 1/2] Updated robots rules caching + a RobotsURLFilter

This commit adds an interface for robots caches, an in-memory,
thread-safe cache implementation, and a basic URLFilter for robots
rules.
---
 .../crawler/filtering/RobotsURLFilter.java    | 79 +++++++++++++++++++
 .../crawler/protocol/MemoryRobotsCache.java   | 57 +++++++++++++
 .../crawler/protocol/RobotRulesParser.java    |  2 +-
 .../storm/crawler/protocol/RobotsCache.java   | 40 ++++++++++
 .../protocol/http/HttpRobotRulesParser.java   | 65 ++++++++-------
 src/main/resources/urlfilters.json            |  5 ++
 6 files changed, 219 insertions(+), 29 deletions(-)
 create mode 100644 src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java
 create mode 100644 src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java
 create mode 100644 src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java

diff --git a/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java
new file mode 100644
index 000000000..cfdce02a8
--- /dev/null
+++ b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to DigitalPebble Ltd under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * DigitalPebble licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.digitalpebble.storm.crawler.filtering;
+
+import com.digitalpebble.storm.crawler.protocol.MemoryRobotsCache;
+import com.digitalpebble.storm.crawler.protocol.RobotsCache;
+import com.fasterxml.jackson.databind.JsonNode;
+import crawlercommons.robots.BaseRobotRules;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+
+/**
+ * This {@link com.digitalpebble.storm.crawler.filtering.URLFilter} filters outlinks
+ * using the robots rules for the domain. If the rules for the domain aren't found
+ * in the cache, the outlink will pass the filter.
+ */
+public class RobotsURLFilter implements URLFilter {
+
+    private RobotsCache cache;
+
+    public String filter(String URL) {
+        try {
+            URL url = new URL(URL);
+            String key = getCacheKey(url);
+            BaseRobotRules rules = cache.get(key);
+            // If we have a cache miss, return the URL
+            if (rules == null)
+                return URL;
+            if (rules.isAllowed(URL))
+                return URL;
+            else
+                return null;
+        } catch (MalformedURLException e) {
+            return null;
+        }
+    }
+
+    public void configure(JsonNode paramNode) {
+        //TODO Specify the cache in the config
+        this.cache = MemoryRobotsCache.getInstance();
+    }
+
+    /**
+     * Compose unique key to store and access robot rules in cache for given URL
+     */
+    private static String getCacheKey(URL url) {
+        // TODO This method is a direct port from HttpRobotsRulesParser. We should consolidate
+
+        String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+        // case
+        String host = url.getHost().toLowerCase(); // normalize to lower case
+        int port = url.getPort();
+        if (port == -1) {
+            port = url.getDefaultPort();
+        }
+        /*
+         * Robot rules apply only to host, protocol, and port where robots.txt
+         * is hosted (cf. NUTCH-1752). Consequently
+         */
+        String cacheKey = protocol + ":" + host + ":" + port;
+        return cacheKey;
+    }
+}
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java
new file mode 100644
index 000000000..7b19cbcd6
--- /dev/null
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to DigitalPebble Ltd under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * DigitalPebble licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.digitalpebble.storm.crawler.protocol;
+
+import org.apache.storm.guava.cache.CacheBuilder;
+import org.apache.storm.guava.cache.CacheLoader;
+import org.apache.storm.guava.cache.LoadingCache;
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * Provides an in-memory, singleton, thread-safe cache for robots rules.
+ */
+public class MemoryRobotsCache implements RobotsCache {
+
+    private static final long MAX_SIZE = 1000;
+
+    private static final MemoryRobotsCache INSTANCE = new MemoryRobotsCache();
+
+    private static LoadingCache<String, BaseRobotRules> CACHE;
+
+    public static MemoryRobotsCache getInstance() {
+        return INSTANCE;
+    }
+
+    private MemoryRobotsCache() {
+        CACHE = CacheBuilder.newBuilder().maximumSize(MAX_SIZE).build( new CacheLoader<String, BaseRobotRules>() {
+            @Override
+            public BaseRobotRules load(String s) throws Exception {
+                return null;
+            }
+        });
+    }
+
+    public BaseRobotRules get(String key) {
+        return CACHE.getIfPresent(key);
+    }
+
+    public void put(String key, BaseRobotRules rules) {
+        CACHE.put(key, rules);
+    }
+
+}
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java
index 2b35113e1..076835e45 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotRulesParser.java
@@ -46,7 +46,7 @@ public abstract class RobotRulesParser {
     public static final Logger LOG = LoggerFactory
             .getLogger(RobotRulesParser.class);
 
-    protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>();
+    protected RobotsCache cache;
 
     /**
      * A {@link BaseRobotRules} object appropriate for use when the
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java
new file mode 100644
index 000000000..5789a72f7
--- /dev/null
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to DigitalPebble Ltd under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * DigitalPebble licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.digitalpebble.storm.crawler.protocol;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This interface defines the methods that must be implemented by a cache for Robots rules.
+ */
+public interface RobotsCache {
+
+    /**
+     *
+     * @param key Cache key
+     * @return Returns the robots rules for the key, or null if there's a cache miss.
+     */
+    public BaseRobotRules get(String key);
+
+    /**
+     *
+     * @param key Cache key
+     * @param rules Robots rules to associate with the key
+     */
+    public void put(String key, BaseRobotRules rules);
+}
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java
index c8c587fde..5694f0387 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java
@@ -17,21 +17,16 @@
 
 package com.digitalpebble.storm.crawler.protocol.http;
 
-import java.net.URL;
-import java.util.Collections;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import backtype.storm.Config;
-
-import com.digitalpebble.storm.crawler.protocol.Protocol;
-import com.digitalpebble.storm.crawler.protocol.ProtocolResponse;
-import com.digitalpebble.storm.crawler.protocol.RobotRulesParser;
+import com.digitalpebble.storm.crawler.protocol.*;
 import com.digitalpebble.storm.crawler.util.ConfUtils;
 import com.digitalpebble.storm.crawler.util.KeyValues;
-
 import crawlercommons.robots.BaseRobotRules;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URL;
+import java.util.Collections;
 
 /**
  * This class is used for parsing robots for urls belonging to HTTP protocol. It
@@ -44,17 +39,28 @@ public class HttpRobotRulesParser extends RobotRulesParser {
             .getLogger(HttpRobotRulesParser.class);
     protected boolean allowForbidden = false;
 
-    HttpRobotRulesParser() {
-    }
+    private RobotsCache cache;
 
+    /**
+     * Creates a parser backed by the shared in-memory robots rules cache,
+     * {@link com.digitalpebble.storm.crawler.protocol.MemoryRobotsCache}.
+     * @param conf The topology {@link backtype.storm.Config}.
+     */
     public HttpRobotRulesParser(Config conf) {
+        this.cache = MemoryRobotsCache.getInstance();
         setConf(conf);
     }
 
-    public void setConf(Config conf) {
-        super.setConf(conf);
-        allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow",
-                true);
+    /**
+     *
+     * @param conf The topology {@link backtype.storm.Config}.
+     * @param cache The {@link com.digitalpebble.storm.crawler.protocol.RobotsCache}
+     * to use for the parser.
+     */
+
+    public HttpRobotRulesParser(Config conf, RobotsCache cache) {
+        this.cache = cache;
+        setConf(conf);
     }
 
     /**
@@ -62,7 +68,7 @@ public void setConf(Config conf) {
      */
     protected static String getCacheKey(URL url) {
         String protocol = url.getProtocol().toLowerCase(); // normalize to lower
-                                                           // case
+        // case
         String host = url.getHost().toLowerCase(); // normalize to lower case
         int port = url.getPort();
         if (port == -1) {
@@ -76,6 +82,12 @@ protected static String getCacheKey(URL url) {
         return cacheKey;
     }
 
+    public void setConf(Config conf) {
+        super.setConf(conf);
+        allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow",
+                true);
+    }
+
     /**
      * Get the rules from robots.txt which applies for the given {@code url}.
      * Robot rules are cached for a unique combination of host, protocol, and
@@ -83,17 +95,14 @@ protected static String getCacheKey(URL url) {
      * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and
      * the rules are cached to avoid re-fetching and re-parsing it again.
      *
-     * @param http
-     *            The {@link Protocol} object
-     * @param url
-     *            URL robots.txt applies to
-     *
+     * @param http The {@link Protocol} object
+     * @param url  URL robots.txt applies to
      * @return {@link BaseRobotRules} holding the rules from robots.txt
      */
     public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
 
         String cacheKey = getCacheKey(url);
-        BaseRobotRules robotRules = CACHE.get(cacheKey);
+        BaseRobotRules robotRules = cache.get(cacheKey);
 
         boolean cacheRule = true;
 
@@ -105,7 +114,7 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
             try {
                 ProtocolResponse response = http.getProtocolOutput(new URL(url,
                         "/robots.txt").toString(), Collections
-                        .<String, String[]> emptyMap());
+                        .<String, String[]>emptyMap());
 
                 // try one level of redirection ?
                 if (response.getStatusCode() == 301
@@ -127,7 +136,7 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
                             redir = new URL(redirection);
                         }
                         response = http.getProtocolOutput(redir.toString(),
-                                Collections.<String, String[]> emptyMap());
+                                Collections.<String, String[]>emptyMap());
                     }
                 }
 
@@ -155,11 +164,11 @@ else if (response.getStatusCode() >= 500) {
             }
 
             if (cacheRule) {
-                CACHE.put(cacheKey, robotRules); // cache rules for host
+                cache.put(cacheKey, robotRules); // cache rules for host
                 if (redir != null
                         && !redir.getHost().equalsIgnoreCase(url.getHost())) {
                     // cache also for the redirected host
-                    CACHE.put(getCacheKey(redir), robotRules);
+                    cache.put(getCacheKey(redir), robotRules);
                 }
             }
         }
diff --git a/src/main/resources/urlfilters.json b/src/main/resources/urlfilters.json
index 670dae37d..7a5e1fc7a 100644
--- a/src/main/resources/urlfilters.json
+++ b/src/main/resources/urlfilters.json
@@ -20,6 +20,11 @@
       "params": {
         "regexFilterFile": "default-regex-filters.txt"
       }
+    },
+    {
+      "class": "com.digitalpebble.storm.crawler.filtering.RobotsURLFilter",
+      "name": "RobotsURLFilter",
+      "params": {}
     }
 
   ]

From 06cfc082242ea9d10d49c41b2f78b3ad820aa479 Mon Sep 17 00:00:00 2001
From: Jake Dodd <jake@ontopic.io>
Date: Fri, 19 Dec 2014 08:09:34 -0800
Subject: [PATCH 2/2] ParserBolt conditionally fetches robots.txt

---
 .../storm/crawler/bolt/ParserBolt.java        | 19 ++++++++++++
 .../crawler/filtering/RobotsURLFilter.java    | 22 +------------
 .../crawler/protocol/MemoryRobotsCache.java   | 31 ++++++++++++++-----
 .../storm/crawler/protocol/RobotsCache.java   |  4 +++
 .../protocol/http/HttpRobotRulesParser.java   | 24 ++------------
 5 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java b/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java
index 5be36ba99..35348a3d6 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/bolt/ParserBolt.java
@@ -27,6 +27,9 @@
 import java.util.Map;
 import java.util.Set;
 
+import backtype.storm.Config;
+import com.digitalpebble.storm.crawler.protocol.Protocol;
+import com.digitalpebble.storm.crawler.protocol.ProtocolFactory;
 import org.apache.commons.lang.StringUtils;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.tika.Tika;
@@ -89,9 +92,14 @@ public class ParserBolt extends BaseRichBolt {
     private boolean upperCaseElementNames = true;
     private Class HTMLMapperClass = IdentityHtmlMapper.class;
 
+    private ProtocolFactory protocolFactory;
+
     public void prepare(Map conf, TopologyContext context,
             OutputCollector collector) {
 
+        Config config = new Config();
+        config.putAll(conf);
+
         String urlconfigfile = ConfUtils.getString(conf,
                 "urlfilters.config.file", "urlfilters.json");
 
@@ -150,6 +158,8 @@ public void prepare(Map conf, TopologyContext context,
 
         LOG.debug("Tika loaded in " + (end - start) + " msec");
 
+        this.protocolFactory = new ProtocolFactory(config);
+
         this.collector = collector;
 
         this.eventMeters = context.registerMetric("parser-meter",
@@ -264,6 +274,15 @@ public void execute(Tuple tuple) {
 
         List<Link> links = linkHandler.getLinks();
         Set<String> slinks = new HashSet<String>(links.size());
+
+        Protocol protocol = protocolFactory.getProtocol(url_);
+
+        // TODO This is a method call with non-explicit side effects...yuck
+        // Calling getRobotRules will seed the cache with the rules for this
+        // URL, if not already present, ensuring they'll be available downstream
+        // to the robots url filter. There's got to be a better way to do this.
+        protocol.getRobotRules(url);
+
         for (Link l : links) {
             if (StringUtils.isBlank(l.getUri()))
                 continue;
diff --git a/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java
index cfdce02a8..d79b782f6 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/filtering/RobotsURLFilter.java
@@ -37,7 +37,7 @@ public class RobotsURLFilter implements URLFilter {
     public String filter(String URL) {
         try {
             URL url = new URL(URL);
-            String key = getCacheKey(url);
+            String key = cache.getCacheKey(url);
             BaseRobotRules rules = cache.get(key);
             // If we have a cache miss, return the URL
             if (rules == null)
@@ -56,24 +56,4 @@ public void configure(JsonNode paramNode) {
         this.cache = MemoryRobotsCache.getInstance();
     }
 
-    /**
-     * Compose unique key to store and access robot rules in cache for given URL
-     */
-    private static String getCacheKey(URL url) {
-        // TODO This method is a direct port from HttpRobotsRulesParser. We should consolidate
-
-        String protocol = url.getProtocol().toLowerCase(); // normalize to lower
-        // case
-        String host = url.getHost().toLowerCase(); // normalize to lower case
-        int port = url.getPort();
-        if (port == -1) {
-            port = url.getDefaultPort();
-        }
-        /*
-         * Robot rules apply only to host, protocol, and port where robots.txt
-         * is hosted (cf. NUTCH-1752). Consequently
-         */
-        String cacheKey = protocol + ":" + host + ":" + port;
-        return cacheKey;
-    }
 }
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java
index 7b19cbcd6..32bd5de26 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/MemoryRobotsCache.java
@@ -17,11 +17,14 @@
 
 package com.digitalpebble.storm.crawler.protocol;
 
+import org.apache.storm.guava.cache.Cache;
 import org.apache.storm.guava.cache.CacheBuilder;
 import org.apache.storm.guava.cache.CacheLoader;
 import org.apache.storm.guava.cache.LoadingCache;
 import crawlercommons.robots.BaseRobotRules;
 
+import java.net.URL;
+
 /**
  * Provides an in-memory, singleton, thread-safe cache for robots rules.
  */
@@ -31,19 +34,14 @@ public class MemoryRobotsCache implements RobotsCache {
 
     private static final MemoryRobotsCache INSTANCE = new MemoryRobotsCache();
 
-    private static LoadingCache<String, BaseRobotRules> CACHE;
+    private static Cache<String, BaseRobotRules> CACHE;
 
     public static MemoryRobotsCache getInstance() {
         return INSTANCE;
     }
 
     private MemoryRobotsCache() {
-        CACHE = CacheBuilder.newBuilder().maximumSize(MAX_SIZE).build( new CacheLoader<String, BaseRobotRules>() {
-            @Override
-            public BaseRobotRules load(String s) throws Exception {
-                return null;
-            }
-        });
+        CACHE = CacheBuilder.newBuilder().maximumSize(MAX_SIZE).build();
     }
 
     public BaseRobotRules get(String key) {
@@ -54,4 +52,23 @@ public void put(String key, BaseRobotRules rules) {
         CACHE.put(key, rules);
     }
 
+    /**
+     * Compose unique key to store and access robot rules in cache for given URL
+     */
+    public String getCacheKey(URL url) {
+        String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+        // case
+        String host = url.getHost().toLowerCase(); // normalize to lower case
+        int port = url.getPort();
+        if (port == -1) {
+            port = url.getDefaultPort();
+        }
+        /*
+         * Robot rules apply only to host, protocol, and port where robots.txt
+         * is hosted (cf. NUTCH-1752), so the cache key combines all three.
+         */
+        String cacheKey = protocol + ":" + host + ":" + port;
+        return cacheKey;
+    }
+
 }
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java
index 5789a72f7..39a70c186 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/RobotsCache.java
@@ -19,6 +19,8 @@
 
 import crawlercommons.robots.BaseRobotRules;
 
+import java.net.URL;
+
 /**
  * This interface defines the methods that must be implemented by a cache for Robots rules.
  */
@@ -37,4 +39,6 @@ public interface RobotsCache {
      * @param rules Robots rules to associate with the key
      */
     public void put(String key, BaseRobotRules rules);
+
+    public String getCacheKey(URL url);
 }
diff --git a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java
index 5694f0387..3452ec51f 100644
--- a/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java
+++ b/src/main/java/com/digitalpebble/storm/crawler/protocol/http/HttpRobotRulesParser.java
@@ -63,25 +63,6 @@ public HttpRobotRulesParser(Config conf, RobotsCache cache) {
         setConf(conf);
     }
 
-    /**
-     * Compose unique key to store and access robot rules in cache for given URL
-     */
-    protected static String getCacheKey(URL url) {
-        String protocol = url.getProtocol().toLowerCase(); // normalize to lower
-        // case
-        String host = url.getHost().toLowerCase(); // normalize to lower case
-        int port = url.getPort();
-        if (port == -1) {
-            port = url.getDefaultPort();
-        }
-        /*
-         * Robot rules apply only to host, protocol, and port where robots.txt
-         * is hosted (cf. NUTCH-1752). Consequently
-         */
-        String cacheKey = protocol + ":" + host + ":" + port;
-        return cacheKey;
-    }
-
     public void setConf(Config conf) {
         super.setConf(conf);
         allowForbidden = ConfUtils.getBoolean(conf, "http.robots.403.allow",
@@ -101,7 +82,7 @@ public void setConf(Config conf) {
      */
     public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
 
-        String cacheKey = getCacheKey(url);
+        String cacheKey = cache.getCacheKey(url);
         BaseRobotRules robotRules = cache.get(cacheKey);
 
         boolean cacheRule = true;
@@ -168,7 +149,8 @@ else if (response.getStatusCode() >= 500) {
                 if (redir != null
                         && !redir.getHost().equalsIgnoreCase(url.getHost())) {
                     // cache also for the redirected host
-                    cache.put(getCacheKey(redir), robotRules);
+                    String redirKey = cache.getCacheKey(redir);
+                    cache.put(redirKey, robotRules);
                 }
             }
         }