From ba86ddfcc2431ae41accd8f56cc4ffcba16d1cc3 Mon Sep 17 00:00:00 2001 From: Julien Nioche Date: Wed, 10 Feb 2016 15:38:26 +0000 Subject: [PATCH] NUTCH-2213 : do not store the headers verbatim if the response was compressed --- .../org/apache/nutch/protocol/http/HttpResponse.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 77772f0063..f6d7e4dacd 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -238,10 +238,6 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum) haveSeenNonContinueStatus = code != 100; // 100 is "Continue" } - if (httpHeaders != null) { - headers.add("_response.headers_", httpHeaders.toString()); - } - String transferEncoding = getHeader(Response.TRANSFER_ENCODING); if (transferEncoding != null && "chunked" .equalsIgnoreCase(transferEncoding.trim())) { @@ -256,6 +252,11 @@ public HttpResponse(HttpBase http, URL url, CrawlDatum datum) } else if ("deflate".equals(contentEncoding)) { content = http.processDeflateEncoded(content, url); } else { + // store the headers verbatim only if the response was not compressed + // as the content length reported will not match otherwise + if (httpHeaders != null) { + headers.add("_response.headers_", httpHeaders.toString()); + } if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetched " + content.length + " bytes from " + url); }