diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 3fbe932667..2f52784373 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -85,7 +85,7 @@ public static enum TruncatedContentReason {
   };
 
   /**
-   * Get the URL used to retrieve this response.
+   * Get the URL the protocol actually used when requesting the Response.
    * @return {@link java.net.URL}
    */
   public URL getUrl();
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index 2514eae33e..34805febb7 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.protocol;
 
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.List;
 
@@ -40,6 +41,22 @@ public interface Protocol extends Pluggable, Configurable {
    */
   ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
 
+  /**
+   * Resolve a relative URL against a base URL using the protocol's URL
+   * library. The default implementation delegates to
+   * {@link URL#URL(URL, String)}; protocol plugins may override it.
+   *
+   * @param base the base URL the relative URL is resolved against
+   * @param relative the relative URL string (typically a Location: header
+   *          value)
+   * @return resolved absolute URL
+   * @throws MalformedURLException if the URL is malformed
+   */
+  default URL resolveUrl(URL base, String relative)
+      throws MalformedURLException {
+    return new URL(base, relative);
+  }
+
   /**
    * Retrieve robot rules applicable for this URL.
*
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index caa3f861ea..c6c6bf7f99 100755
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -49,7 +49,6 @@
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.DeflateUtils;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.IntWritable;
 
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 9da92698fd..7c5d8aa0aa 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -170,7 +170,8 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
             LOG.debug("Following robots.txt redirect: {} -> {}",
                 robotsUrlRedir, redirectionLocation);
             try {
-              robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
+              robotsUrlRedir = http.resolveUrl(robotsUrlRedir,
+                  redirectionLocation);
             } catch (MalformedURLException e) {
               LOG.info(
                   "Failed to resolve redirect location for robots.txt: {} -> {} ({})",
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index 87ee0bb8ac..a7109bce4a 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -121,6 +121,17 @@ public class 
HttpResponse implements Response {
       client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
       code = client.executeMethod(get);
 
+      // When followRedirects=true HC3 walks the redirect chain internally;
+      // getURI() returns the final URI. Capture it so getUrl() honors the
+      // contract — without this, robots.txt redirects via this plugin
+      // report the original URL even though a different URL was fetched.
+      try {
+        this.url = new URL(get.getURI().toString());
+      } catch (org.apache.commons.httpclient.URIException
+          | java.net.MalformedURLException ignored) {
+        // Best effort: if the final URI cannot be converted, keep the input URL.
+      }
+
       Header[] heads = get.getResponseHeaders();
 
       for (int i = 0; i < heads.length; i++) {
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index a9d2b14d42..8c898a0e27 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -20,6 +20,7 @@
 import java.lang.invoke.MethodHandles;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
+import java.net.MalformedURLException;
 import java.net.Proxy;
 import java.net.ProxySelector;
 import java.net.SocketAddress;
@@ -58,6 +59,7 @@
 import okhttp3.Gzip;
 import okhttp3.Handshake;
 import okhttp3.Headers;
+import okhttp3.HttpUrl;
 import okhttp3.Interceptor;
 import okhttp3.OkHttpClient;
 import okhttp3.Protocol;
@@ -436,6 +438,22 @@ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
     return new OkHttpResponse(this, url, datum);
   }
 
+  /**
+   * Resolve a relative URL using OkHttp's {@link HttpUrl} parser, which is
+   * more lenient than Java's {@link URL} (handles malformed protocol-relative
+   * slashes such as {@code https:////host/path}, IDN→punycode, host case
+   * normalization, etc.). 
Falls back to Java URL if HttpUrl cannot parse.
+   */
+  @Override
+  public URL resolveUrl(URL base, String relative) throws MalformedURLException {
+    HttpUrl baseHttpUrl = base == null ? null : HttpUrl.get(base);
+    HttpUrl resolved = baseHttpUrl == null ? null : baseHttpUrl.resolve(relative);
+    if (resolved != null) {
+      return resolved.url();
+    }
+    return super.resolveUrl(base, relative);
+  }
+
   public static void main(String[] args) throws Exception {
     OkHttp okhttp = new OkHttp();
     okhttp.setConf(NutchConfiguration.create());
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index 605c03390f..7616bc886e 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -71,7 +71,7 @@ public boolean booleanValue() {
   public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
       throws ProtocolException, IOException {
 
-    this.url = url;
+    this.url = url; // provisional; overwritten below with the normalized form
 
     Request.Builder rb = new Request.Builder().url(url);
 
@@ -102,7 +102,14 @@ public OkHttpResponse(OkHttp okhttp, URL url, CrawlDatum datum)
     }
 
     Request request = rb.build();
-    okhttp3.Call call = okhttp.getClient(url).newCall(request);
+
+    // OkHttp parsed the URL via HttpUrl; that's the form actually going on
+    // the wire (IDN→punycode, repeated-slash repair, host lowercasing).
+    this.url = request.url().url();
+    if (LOG.isDebugEnabled() && !this.url.toString().equals(url.toString())) {
+      LOG.debug("The normalized URL differs from the requested URL: {} -> {}", url, this.url);
+    }
+    okhttp3.Call call = okhttp.getClient(this.url).newCall(request);
 
     // ensure that Response and underlying ResponseBody are closed
     try (okhttp3.Response response = call.execute()) {