NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4 #766

Merged
merged 2 commits on Aug 22, 2023
34 changes: 21 additions & 13 deletions conf/nutch-default.xml
@@ -72,9 +72,18 @@
<property>
<name>http.agent.name</name>
<value></value>
<description>HTTP 'User-Agent' request header. MUST NOT be empty -
<description>'User-Agent' name: a single word uniquely identifying your crawler.

The value is used to select the group of robots.txt rules addressing your
crawler. It is also sent as part of the HTTP 'User-Agent' request header.

This property MUST NOT be empty -
please set this to a single word uniquely related to your organization.

Following RFC 9309 the 'User-Agent' name (aka. 'product token')
&quot;MUST contain only uppercase and lowercase letters ('a-z' and
'A-Z'), underscores ('_'), and hyphens ('-').&quot;

NOTE: You should also check other related properties:

http.robots.agents
@@ -84,7 +93,6 @@
http.agent.version

and set their values appropriately.

</description>
</property>

@@ -95,13 +103,13 @@
parser would look for in robots.txt. Multiple agents can be provided using
comma as a delimiter. eg. mybot,foo-spider,bar-crawler

The ordering of agents does NOT matter and the robots parser would make
decision based on the agent which matches first to the robots rules.
Also, there is NO need to add a wildcard (ie. "*") to this string as the
robots parser would smartly take care of a no-match situation.
The ordering of agents does NOT matter and the robots.txt parser combines
all rules to any of the agent names. Also, there is NO need to add
a wildcard (ie. "*") to this string as the robots parser would smartly
take care of a no-match situation.

If no value is specified, by default HTTP agent (ie. 'http.agent.name')
would be used for user agent matching by the robots parser.
is used for user-agent matching by the robots parser.
</description>
</property>
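
To illustrate how http.agent.name and http.robots.agents are combined, here is a minimal Java sketch (not part of this patch; the agent names "mybot" and "foo-spider" and the class name are made up for illustration) that wires the two properties into the parser through a Hadoop Configuration:

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.http.api.HttpRobotRulesParser;

public class AgentNamesSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // required: a single word identifying the crawler
    conf.set("http.agent.name", "mybot");
    // optional: additional names matched against robots.txt groups;
    // order does not matter, and "*" must not be listed
    conf.set("http.robots.agents", "mybot,foo-spider");

    HttpRobotRulesParser parser = new HttpRobotRulesParser();
    // setConf() lowercases the names and collects them into a set,
    // skipping the wildcard "*" and duplicates of http.agent.name
    parser.setConf(conf);
  }
}

With this configuration, robots.txt rules addressing either agent name apply to the crawler; if neither name matches, the wildcard group is used.
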

@@ -166,19 +174,19 @@
<property>
<name>http.agent.url</name>
<value></value>
<description>A URL to advertise in the User-Agent header. This will
<description>A URL to advertise in the User-Agent header. This will
appear in parenthesis after the agent name. Custom dictates that this
should be a URL of a page explaining the purpose and behavior of this
should be a URL to a page that explains the purpose and behavior of this
crawler.
</description>
</property>

<property>
<name>http.agent.email</name>
<value></value>
<description>An email address to advertise in the HTTP 'From' request
header and User-Agent header. A good practice is to mangle this
address (e.g. 'info at example dot com') to avoid spamming.
<description>An email address to advertise in the HTTP 'User-Agent' (and
'From') request headers. A good practice is to mangle this address
(e.g. 'info at example dot com') to avoid spamming.
</description>
</property>

@@ -202,7 +210,7 @@
<name>http.agent.rotate.file</name>
<value>agents.txt</value>
<description>
File containing alternative user agent names to be used instead of
File containing alternative user-agent names to be used instead of
http.agent.name on a rotating basis if http.agent.rotate is true.
Each line of the file should contain exactly one agent
specification including name, version, description, URL, etc.
71 changes: 51 additions & 20 deletions src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -24,12 +24,13 @@
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -96,7 +97,7 @@ public abstract class RobotRulesParser implements Tool {
}

protected Configuration conf;
protected String agentNames;
protected Set<String> agentNames;

/** set of host names or IPs to be explicitly excluded from robots.txt checking */
protected Set<String> allowList = new HashSet<>();
@@ -114,6 +115,7 @@ public RobotRulesParser(Configuration conf) {
/**
* Set the {@link Configuration} object
*/
@Override
public void setConf(Configuration conf) {
this.conf = conf;

@@ -122,26 +124,30 @@ public void setConf(Configuration conf) {
if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
throw new RuntimeException("Agent name not configured!");
}
agentNames = agentName;
agentNames = new LinkedHashSet<>();
if (!agentName.equals("*")) {
/*
* skip wildcard "*" - crawler-commons' SimpleRobotRulesParser expects an
* empty set of agent names to use the wildcard rules
*/
agentNames.add(agentName.toLowerCase());
}

// If there are any other agents specified, append those to the list of
// agents
String otherAgents = conf.get("http.robots.agents");
if (otherAgents != null && !otherAgents.trim().isEmpty()) {
StringTokenizer tok = new StringTokenizer(otherAgents, ",");
StringBuilder sb = new StringBuilder(agentNames);
while (tok.hasMoreTokens()) {
String str = tok.nextToken().trim();
if (str.equals("*") || str.equals(agentName)) {
// skip wildcard "*" or agent name itself
// (required for backward compatibility, cf. NUTCH-1715 and
// NUTCH-1718)
String[] otherAgents = conf.getStrings("http.robots.agents");
if (otherAgents != null && otherAgents.length > 0) {
for (String otherAgent : otherAgents) {
otherAgent = otherAgent.toLowerCase();
if (otherAgent.equals("*") || otherAgent.equalsIgnoreCase(agentName)) {
/*
* skip wildcard "*" or agent name itself (required for backward
* compatibility, cf. NUTCH-1715 and NUTCH-1718)
*/
} else {
sb.append(",").append(str);
agentNames.add(otherAgent);
}
}

agentNames = sb.toString();
}

String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
@@ -166,6 +172,7 @@ public void setConf(Configuration conf) {
/**
* Get the {@link Configuration} object
*/
@Override
public Configuration getConf() {
return conf;
}
@@ -188,10 +195,10 @@ public boolean isAllowListed(URL url) {

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from
* crawler commons
* crawler-commons
*
* @param url
* A string containing url
* The robots.txt URL
* @param content
* Contents of the robots file in a byte array
* @param contentType
@@ -201,11 +208,32 @@ public boolean isAllowListed(URL url) {
* matching
* @return BaseRobotRules object
*/
@Deprecated
public BaseRobotRules parseRules(String url, byte[] content,
String contentType, String robotName) {
return robotParser.parseContent(url, content, contentType, robotName);
}

/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from
* crawler-commons
*
* @param url
* The robots.txt URL
* @param content
* Contents of the robots file in a byte array
* @param contentType
* The content type of the robots file
* @param robotNames
* A collection containing all the robots agent names used by parser
* for matching
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules(String url, byte[] content,
String contentType, Collection<String> robotNames) {
return robotParser.parseContent(url, content, contentType, robotNames);
}
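
As a usage illustration of the new entry point (not part of this patch; the robots.txt content, URLs, and class name are made-up examples), the underlying crawler-commons 1.4 call can be exercised directly like this:

import java.nio.charset.StandardCharsets;
import java.util.Set;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class ParseRulesSketch {
  public static void main(String[] args) {
    byte[] robotsTxt = ("User-agent: mybot\n"
        + "Disallow: /private/\n"
        + "Crawl-delay: 5\n").getBytes(StandardCharsets.UTF_8);

    SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
    // agent names are passed as a Collection; as in this patch they are
    // lowercased, and an empty collection selects the wildcard rules
    BaseRobotRules rules = parser.parseContent(
        "https://www.example.org/robots.txt", robotsTxt, "text/plain",
        Set.of("mybot"));

    System.out.println(rules.isAllowed("https://www.example.org/private/page.html")); // false
    System.out.println(rules.getCrawlDelay()); // 5000 (milliseconds)
  }
}

Passing a collection of agent names instead of a single comma-separated string is the essential difference from the deprecated method above.
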

/**
* Fetch robots.txt (or its protocol-specific equivalent) which applies to
* the given URL, parse it and return the set of robot rules applicable for
@@ -274,8 +302,9 @@ public int run(String[] args) {
"\tit is allowed by the robots.txt rules. Other parts of the URLs",
"\t(mainly the host) are ignored.",
"",
"<agent-names>\tcomma-separated list of agent names",
"<agent-names>\tuser-agent name (aka. \"product token\")",
"\tused to select rules from the robots.txt file.",
"\tMultiple agent names can be passed as comma-separated string.",
"\tIf no agent name is given the properties http.agent.name",
"\tand http.robots.agents are used.",
"\tIf also http.agent.name and http.robots.agents are empty,",
@@ -353,7 +382,8 @@ public int run(String[] args) {
}
}

System.out.println("Testing robots.txt for agent names: " + agentNames);
System.out.println("Testing robots.txt for agent names: "
+ (agentNames.isEmpty() ? "* (any other agent)" : agentNames));

LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
String testPath;
@@ -393,6 +423,7 @@ public int run(String[] args) {
*/
private static class TestRobotRulesParser extends RobotRulesParser {

@Override
public void setConf(Configuration conf) {
/*
* Make sure that agent name is not empty so that
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.protocol.http.api;

import java.util.Set;

import org.junit.Assert;
import org.junit.Test;

@@ -94,6 +96,64 @@ public TestRobotRulesParser() {
parser = new HttpRobotRulesParser();
}

private void testRulesOnPaths(String agent, String[] paths,
boolean[] results) {
for (int counter = 0; counter < paths.length; counter++) {
boolean res = rules.isAllowed(paths[counter]);
Assert.assertTrue(
"testing on agent (" + agent + "), and " + "path " + paths[counter]
+ " got " + res + ", expected " + results[counter],
res == results[counter]);
}
}

/**
* Test that the robots rules are interpreted correctly by the robots rules
* parser.
*/
@Test
public void testRobotsAgent() {
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);

rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);

rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, Set.of(MULTIPLE_AGENTS.toLowerCase().split("\\s*,\\s*")));
testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
}

/**
* Test that the crawl delay is extracted from the robots file for respective
* agent. If it's not specified for a given agent, the default value must be
* returned.
*/
@Test
public void testCrawlDelay() {
// for SINGLE_AGENT1, the crawl delay of 10 seconds, i.e. 10000 msec must be
// returned by the parser
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT1 + " : ",
(rules.getCrawlDelay() == 10000));

// for SINGLE_AGENT2, the crawl delay of 20 seconds, i.e. 20000 msec must be
// returned by the parser
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT2 + " : ",
(rules.getCrawlDelay() == 20000));

// for UNKNOWN_AGENT, the default crawl delay must be returned.
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, Set.of(UNKNOWN_AGENT.toLowerCase()));
Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
(rules.getCrawlDelay() == Long.MIN_VALUE));
}

/**
* Test that the robots rules are interpreted correctly by the robots rules
* parser.
@@ -103,36 +163,15 @@
public void testRobotsAgentDeprecatedAPIMethod() {
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT1);

for (int counter = 0; counter < TEST_PATHS.length; counter++) {
Assert.assertTrue(
"testing on agent (" + SINGLE_AGENT1 + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1[counter]);
}
testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);

rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT2);

for (int counter = 0; counter < TEST_PATHS.length; counter++) {
Assert.assertTrue(
"testing on agent (" + SINGLE_AGENT2 + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT2[counter]);
}
testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);

rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, MULTIPLE_AGENTS);

for (int counter = 0; counter < TEST_PATHS.length; counter++) {
Assert.assertTrue(
"testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1_AND_AGENT2[counter]);
}
testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
}

/**