NUTCH-1031 Delegate parsing of robots.txt to crawler-commons
git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1465159 13f79535-47bb-0310-9956-ffa450edef68
tejasapatil committed Apr 5, 2013
1 parent 2f1ca3e commit 9bade80
Showing 12 changed files with 504 additions and 951 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
@@ -2,6 +2,8 @@ Nutch Change Log

(trunk): Current Development

* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)

* NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)

* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
1 change: 1 addition & 0 deletions ivy/ivy.xml
@@ -74,6 +74,7 @@
<dependency org="oro" name="oro" rev="2.0.8" />

<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons" name="crawler-commons" rev="0.2" />

<!--Configuration: test -->

5 changes: 3 additions & 2 deletions src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -51,6 +51,7 @@
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;

import crawlercommons.robots.BaseRobotRules;

/**
* A queue-based fetcher.
@@ -671,8 +672,8 @@ public void run() {
}
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
if (!rules.isAllowed(fit.u)) {
BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
if (!rules.isAllowed(fit.u.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
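For context, the two crawler-commons calls this change relies on are SimpleRobotRulesParser.parseContent(url, content, contentType, robotNames) and BaseRobotRules.isAllowed(String url) — note the String argument where the old call above passed fit.u directly. A minimal standalone sketch of that call pattern (not part of the patch; the robots.txt content and agent name are invented for illustration):

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesDemo {
    public static void main(String[] args) {
        // Invented robots.txt body and agent name, purely for illustration.
        byte[] robotsTxt = "User-agent: *\nDisallow: /private/\n".getBytes();

        // Same signature that RobotRulesParser.parseRules() delegates to (see below).
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain", "nutch-demo");

        // isAllowed() takes the URL as a String, hence fit.u.toString() above.
        System.out.println(rules.isAllowed("http://example.com/private/page.html")); // not allowed
        System.out.println(rules.isAllowed("http://example.com/index.html"));        // allowed
    }
}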
41 changes: 0 additions & 41 deletions src/java/org/apache/nutch/protocol/EmptyRobotRules.java

This file was deleted.

5 changes: 4 additions & 1 deletion src/java/org/apache/nutch/protocol/Protocol.java
@@ -25,6 +25,8 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.plugin.Pluggable;

import crawlercommons.robots.BaseRobotRules;


/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol extends Pluggable, Configurable {
@@ -59,5 +61,6 @@ public interface Protocol extends Pluggable, Configurable {
* @param datum page datum
* @return robot rules (specific for this url or default), never null
*/
RobotRules getRobotRules(Text url, CrawlDatum datum);
BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
}

196 changes: 196 additions & 0 deletions src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -0,0 +1,196 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.protocol;

// JDK imports
import java.io.File;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.StringTokenizer;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;

import com.google.common.io.Files;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
import crawlercommons.robots.SimpleRobotRulesParser;

/**
* This class uses crawler-commons for handling the parsing of {@code robots.txt} files.
* It emits SimpleRobotRules objects, which describe the download permissions
* as described in SimpleRobotRulesParser.
*/
public abstract class RobotRulesParser implements Configurable {

public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);

protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules> ();

/**
* A {@link BaseRobotRules} object appropriate for use
* when the {@code robots.txt} file is empty or missing;
* all requests are allowed.
*/
public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);

/**
* A {@link BaseRobotRules} object appropriate for use when the
* {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
* response; all requests are disallowed.
*/
public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);

private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
private Configuration conf;
protected String agentNames;

public RobotRulesParser() { }

public RobotRulesParser(Configuration conf) {
setConf(conf);
}

/**
* Set the {@link Configuration} object
*/
public void setConf(Configuration conf) {
this.conf = conf;

// Grab the agent names we advertise to robots files.
String agentName = conf.get("http.agent.name");
if (null == agentName) {
throw new RuntimeException("Agent name not configured!");
}

String agentNames = conf.get("http.robots.agents");
StringTokenizer tok = new StringTokenizer(agentNames, ",");
ArrayList<String> agents = new ArrayList<String>();
while (tok.hasMoreTokens()) {
agents.add(tok.nextToken().trim());
}

/**
* If there are no agents for robots-parsing, use the
* default agent-string. If both are present, our agent-string
* should be the first one we advertise to robots-parsing.
*/
if (agents.size() == 0) {
if (LOG.isErrorEnabled()) {
LOG.error("No agents listed in 'http.robots.agents' property!");
}
} else {
StringBuffer combinedAgentsString = new StringBuffer(agentName);
int index = 0;

if ((agents.get(0)).equalsIgnoreCase(agentName))
index++;
else if (LOG.isErrorEnabled()) {
LOG.error("Agent we advertise (" + agentName
+ ") not listed first in 'http.robots.agents' property!");
}

// append all the agents from the http.robots.agents property
for(; index < agents.size(); index++) {
combinedAgentsString.append(", " + agents.get(index));
}

// always make sure "*" is included in the end
combinedAgentsString.append(", *");
this.agentNames = combinedAgentsString.toString();
}
}

/**
* Get the {@link Configuration} object
*/
public Configuration getConf() {
return conf;
}

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from crawler-commons
*
* @param url A string containing the URL of the robots.txt file
* @param content Contents of the robots file in a byte array
* @param contentType The content type of the robots file (e.g. {@code text/plain})
* @param robotName A string containing the agent name(s) to be matched against the robots rules
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
return robotParser.parseContent(url, content, contentType, robotName);
}

public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
URL u = null;
try {
u = new URL(url.toString());
} catch (Exception e) {
return EMPTY_RULES;
}
return getRobotRulesSet(protocol, u);
}

public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);

/** command-line main for testing */
public static void main(String[] argv) {

if (argv.length < 3) {
System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
System.err.println("\tThe <robots-file> will be parsed as a robots.txt file,");
System.err.println("\tusing the given <agent-name> to select rules. URLs ");
System.err.println("\twill be read (one per line) from <url-file>, and tested");
System.err.println("\tagainst the rules. Multiple agent names can be specified using spaces.");
System.exit(-1);
}

try {
StringBuilder agentNames = new StringBuilder();
for(int counter = 2; counter < argv.length; counter++)
agentNames.append(argv[counter]).append(",");

agentNames.deleteCharAt(agentNames.length()-1);

byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());

LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
String testPath = testsIn.readLine().trim();
while (testPath != null) {
System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") +
":\t" + testPath);
testPath = testsIn.readLine();
}
testsIn.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
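The class above is abstract; each protocol plugin supplies the fetch-and-cache logic via getRobotRulesSet(Protocol, URL). Below is an illustrative sketch of what such a subclass can look like — it is not the implementation added by this commit (that is HttpRobotRulesParser, used by HttpBase further down), and fetchRobotsTxt() is an invented placeholder rather than a Nutch API:

import java.net.URL;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.RobotRulesParser;

import crawlercommons.robots.BaseRobotRules;

public class ExampleRobotRulesParser extends RobotRulesParser {

    public ExampleRobotRulesParser(Configuration conf) {
        super(conf);
    }

    @Override
    public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
        // Cache parsed rules per protocol+host, mirroring the CACHE field above.
        String cacheKey = url.getProtocol() + ":" + url.getHost();
        BaseRobotRules rules = CACHE.get(cacheKey);
        if (rules != null) {
            return rules;
        }
        try {
            byte[] content = fetchRobotsTxt(protocol, new URL(url, "/robots.txt"));
            rules = (content == null)
                ? EMPTY_RULES // missing robots.txt: everything is allowed
                : parseRules(url.toString(), content, "text/plain", agentNames);
        } catch (Exception e) {
            rules = EMPTY_RULES;
        }
        CACHE.put(cacheKey, rules);
        return rules;
    }

    // Invented placeholder: a real implementation issues the request through the
    // protocol plugin and returns FORBID_ALL_RULES on a 403/Forbidden response.
    private byte[] fetchRobotsTxt(Protocol protocol, URL robotsUrl) {
        return null; // pretend the file is missing
    }
}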
@@ -32,15 +32,16 @@
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.DeflateUtils;


// Hadoop imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

// crawler-commons imports
import crawlercommons.robots.BaseRobotRules;

/**
* @author J&eacute;r&ocirc;me Charron
*/
@@ -51,7 +52,7 @@ public abstract class HttpBase implements Protocol {

private static final byte[] EMPTY_CONTENT = new byte[0];

private RobotRulesParser robots = null;
private HttpRobotRulesParser robots = null;

/** The proxy hostname. */
protected String proxyHost = null;
@@ -105,7 +106,7 @@ public HttpBase(Logger logger) {
if (logger != null) {
this.logger = logger;
}
robots = new RobotRulesParser();
robots = new HttpRobotRulesParser();
}

// Inherited Javadoc
@@ -138,7 +139,6 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
URL u = new URL(urlString);
String host = null;
Response response = getResponse(u, datum, false); // make a request

int code = response.getCode();
@@ -381,18 +381,16 @@ protected static void main(HttpBase http, String[] args) throws Exception {
System.out.println("Content:");
String text = new String(content.getContent());
System.out.println(text);
}

}
}


protected abstract Response getResponse(URL url,
CrawlDatum datum,
boolean followRedirects)
throws ProtocolException, IOException;

public RobotRules getRobotRules(Text url, CrawlDatum datum) {
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
return robots.getRobotRulesSet(this, url);
}

}

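Net effect for callers: Protocol.getRobotRules() now returns a crawler-commons BaseRobotRules instead of the removed Nutch RobotRules/EmptyRobotRules types. A small usage sketch follows (illustrative only, not part of the commit; it assumes an already-configured Protocol instance and that BaseRobotRules exposes getCrawlDelay() for the Crawl-Delay directive, as crawler-commons provides alongside isAllowed()):

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Protocol;

import crawlercommons.robots.BaseRobotRules;

public class RobotsCheck {
    /** Returns the delay to wait before fetching, or -1 if the URL is disallowed. */
    public static long delayBeforeFetch(Protocol protocol, Text url, CrawlDatum datum) {
        BaseRobotRules rules = protocol.getRobotRules(url, datum);
        if (!rules.isAllowed(url.toString())) {
            return -1;                       // robots.txt forbids this URL for our agent
        }
        long delay = rules.getCrawlDelay();  // Crawl-Delay directive, if any
        return delay > 0 ? delay : 0;
    }
}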