Skip to content

Commit

Permalink
HTTP protocol: avoid pulling content for redirections, fixes #455
Browse files Browse the repository at this point in the history
  • Loading branch information
jnioche committed Apr 19, 2017
1 parent 49cedaf commit 3860d07
Showing 1 changed file with 9 additions and 5 deletions.
@@ -51,6 +51,7 @@
 import org.slf4j.LoggerFactory;

 import com.digitalpebble.stormcrawler.Metadata;
+import com.digitalpebble.stormcrawler.persistence.Status;
 import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol;
 import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
 import com.digitalpebble.stormcrawler.util.ConfUtils;
@@ -221,12 +222,15 @@ public ProtocolResponse handleResponse(HttpResponse response)

 MutableBoolean trimmed = new MutableBoolean();

-byte[] bytes = HttpProtocol.toByteArray(response.getEntity(),
-maxContent, trimmed);
+byte[] bytes = new byte[] {};

-if (trimmed.booleanValue()) {
-metadata.setValue("http.trimmed", "true");
-LOG.warn("HTTP content trimmed to {}", bytes.length);
+if (!Status.REDIRECTION.equals(Status.fromHTTPCode(status))) {
+bytes = HttpProtocol.toByteArray(response.getEntity(), maxContent,
+trimmed);
+if (trimmed.booleanValue()) {
+metadata.setValue("http.trimmed", "true");
+LOG.warn("HTTP content trimmed to {}", bytes.length);
+}
 }

 if (storeHTTPHeaders) {
Expand Down

0 comments on commit 3860d07

Please sign in to comment.