Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/net/protocols/Response.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ public static enum TruncatedContentReason {
};

/**
 * Get the URL the protocol actually used when requesting this response,
 * i.e. the URL used to retrieve it.
 *
 * @return {@link java.net.URL}
 */
public URL getUrl();
Expand Down
13 changes: 13 additions & 0 deletions src/java/org/apache/nutch/protocol/Protocol.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.nutch.protocol;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

Expand All @@ -40,6 +41,18 @@ public interface Protocol extends Pluggable, Configurable {
*/
ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);

/**
 * Turn a (possibly relative) URL string into an absolute URL by resolving
 * it against a base URL. This default implementation delegates to
 * {@link URL#URL(URL, String)}; protocol implementations may override it to
 * use their own URL library.
 *
 * @param base
 *          the base URL the relative URL is resolved against
 * @param relative
 *          the relative URL string (typically a Location: header value)
 * @return the resolved absolute URL
 * @throws MalformedURLException
 *           if the URL is malformed
 */
default URL resolveUrl(URL base, String relative) throws MalformedURLException {
  final URL absolute = new URL(base, relative);
  return absolute;
}
/**
* Retrieve robot rules applicable for this URL.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.DeflateUtils;
import org.apache.hadoop.util.StringUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir,
redirectionLocation);
try {
robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
robotsUrlRedir = ((HttpBase) http).resolveUrl(robotsUrlRedir,
redirectionLocation);
} catch (MalformedURLException e) {
LOG.info(
"Failed to resolve redirect location for robots.txt: {} -> {} ({})",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,17 @@ public class HttpResponse implements Response {
client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
code = client.executeMethod(get);

// When followRedirects=true HC3 walks the redirect chain internally;
// getURI() returns the final URI. Capture it so getUrl() honors the
// contract — without this, robots.txt redirects via this plugin
// report the original URL even though a different URL was fetched.
try {
this.url = new URL(get.getURI().toString());
} catch (org.apache.commons.httpclient.URIException
| java.net.MalformedURLException e) {
// Keep the input URL or try to normalize it?
}

Header[] heads = get.getResponseHeaders();

for (int i = 0; i < heads.length; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.lang.invoke.MethodHandles;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.ProxySelector;
import java.net.SocketAddress;
Expand Down Expand Up @@ -58,6 +59,7 @@
import okhttp3.Gzip;
import okhttp3.Handshake;
import okhttp3.Headers;
import okhttp3.HttpUrl;
import okhttp3.Interceptor;
import okhttp3.OkHttpClient;
import okhttp3.Protocol;
Expand Down Expand Up @@ -436,6 +438,22 @@ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
return new OkHttpResponse(this, url, datum);
}

/**
 * Resolve a relative URL using OkHttp's {@link HttpUrl} parser, which is
 * more lenient than Java's {@link URL} (handles malformed protocol-relative
 * slashes such as {@code https:////host/path}, IDN→punycode, host case
 * normalization, etc.). Falls back to Java URL resolution if the base URL
 * cannot be represented as an {@link HttpUrl} or if {@link HttpUrl} cannot
 * resolve the relative URL.
 *
 * @param base
 *          the base URL the relative URL is resolved against
 * @param relative
 *          the relative URL string (typically a Location: header value)
 * @return the resolved absolute URL
 * @throws MalformedURLException
 *           if the fallback Java URL resolution fails
 */
@Override
public URL resolveUrl(URL base, String relative) throws MalformedURLException {
  // HttpUrl.get(URL) yields null when the base is not a well-formed
  // http/https URL; guard against it so the Java fallback is reached
  // instead of failing with a NullPointerException.
  HttpUrl baseHttpUrl = HttpUrl.get(base);
  if (baseHttpUrl != null) {
    // resolve() yields null when the link cannot be parsed as an HTTP(S) URL
    HttpUrl resolved = baseHttpUrl.resolve(relative);
    if (resolved != null) {
      return resolved.url();
    }
  }
  return super.resolveUrl(base, relative);
}

public static void main(String[] args) throws Exception {
OkHttp okhttp = new OkHttp();
okhttp.setConf(NutchConfiguration.create());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public boolean booleanValue() {
public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
throws ProtocolException, IOException {

this.url = url;
this.url = url; // provisional; overwritten below with the normalized form

Request.Builder rb = new Request.Builder().url(url);

Expand Down Expand Up @@ -102,7 +102,14 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
}

Request request = rb.build();
okhttp3.Call call = okhttp.getClient(url).newCall(request);

// OkHttp parsed the URL via HttpUrl; that's the form actually going on
// the wire (IDN→punycode, repeated-slash repair, host lowercasing).
this.url = request.url().url();
if (LOG.isDebugEnabled() && !this.url.toString().equals(url.toString())) {
LOG.debug("The normalized URL different from the requested URL: {} -> {}", url, this.url);
}
okhttp3.Call call = okhttp.getClient(this.url).newCall(request);

// ensure that Response and underlying ResponseBody are closed
try (okhttp3.Response response = call.execute()) {
Expand Down