From 0c3837cade31b4904462af223ddc64f52e31c479 Mon Sep 17 00:00:00 2001 From: omer Date: Thu, 30 Mar 2017 08:59:04 +0300 Subject: [PATCH] Add handling for http cookies - issue #32 --- .../protocol/AbstractHttpProtocol.java | 345 +++++------ .../protocol/httpclient/HttpProtocol.java | 535 +++++++++--------- .../stormcrawler/util/CookieConverter.java | 168 ++++++ .../util/CookieConverterTest.java | 320 +++++++++++ 4 files changed, 942 insertions(+), 426 deletions(-) create mode 100644 core/src/main/java/com/digitalpebble/stormcrawler/util/CookieConverter.java create mode 100644 core/src/test/java/com/digitalpebble/stormcrawler/util/CookieConverterTest.java diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java index dff8e82aa..1c96e5d5b 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/AbstractHttpProtocol.java @@ -1,171 +1,174 @@ -/** - * Licensed to DigitalPebble Ltd under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * DigitalPebble licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.digitalpebble.stormcrawler.protocol; - -import java.util.HashSet; -import java.util.Set; - -import org.apache.commons.lang.StringUtils; -import org.apache.storm.Config; - -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.util.ConfUtils; - -import crawlercommons.robots.BaseRobotRules; - -public abstract class AbstractHttpProtocol implements Protocol { - - private com.digitalpebble.stormcrawler.protocol.HttpRobotRulesParser robots; - - protected boolean skipRobots = false; - - protected boolean storeHTTPHeaders = false; - - @Override - public void configure(Config conf) { - this.skipRobots = ConfUtils.getBoolean(conf, "http.skip.robots", false); - this.storeHTTPHeaders = ConfUtils.getBoolean(conf, - "http.store.headers", false); - robots = new HttpRobotRulesParser(conf); - } - - @Override - public BaseRobotRules getRobotRules(String url) { - if (this.skipRobots) - return RobotRulesParser.EMPTY_RULES; - return robots.getRobotRulesSet(this, url); - } - - @Override - public void cleanup() { - } - - public static String getAgentString(Config conf) { - return getAgentString(ConfUtils.getString(conf, "http.agent.name"), - ConfUtils.getString(conf, "http.agent.version"), - ConfUtils.getString(conf, "http.agent.description"), - ConfUtils.getString(conf, "http.agent.url"), - ConfUtils.getString(conf, "http.agent.email")); - } - - protected static String getAgentString(String agentName, - String agentVersion, String agentDesc, String agentURL, - String agentEmail) { - - StringBuilder buf = new StringBuilder(); - - buf.append(agentName); - - if (StringUtils.isNotBlank(agentVersion)) { - buf.append("/"); - buf.append(agentVersion); - } - - boolean hasAgentDesc = StringUtils.isNotBlank(agentDesc); - boolean hasAgentURL = StringUtils.isNotBlank(agentURL); - boolean hasAgentEmail = StringUtils.isNotBlank(agentEmail); - - if (hasAgentDesc || hasAgentEmail || hasAgentURL) { - buf.append(" ("); - - if (hasAgentDesc) { - 
buf.append(agentDesc); - if (hasAgentURL || hasAgentEmail) - buf.append("; "); - } - - if (hasAgentURL) { - buf.append(agentURL); - if (hasAgentEmail) - buf.append("; "); - } - - if (hasAgentEmail) { - buf.append(agentEmail); - } - - buf.append(")"); - } - - return buf.toString(); - } - - /** Called by extensions of this class **/ - protected static void main(AbstractHttpProtocol protocol, String args[]) - throws Exception { - Config conf = new Config(); - - ConfUtils.loadConf(args[0], conf); - protocol.configure(conf); - - Set threads = new HashSet<>(); - - class Fetchable implements Runnable { - String url; - - Fetchable(String url) { - this.url = url; - } - - public void run() { - - StringBuilder stringB = new StringBuilder(); - stringB.append(url).append("\n"); - - if (!protocol.skipRobots) { - BaseRobotRules rules = protocol.getRobotRules(url); - stringB.append("is allowed : ") - .append(rules.isAllowed(url)); - } - - Metadata md = new Metadata(); - long start = System.currentTimeMillis(); - ProtocolResponse response; - try { - response = protocol.getProtocolOutput(url, md); - stringB.append(response.getMetadata()).append("\n"); - stringB.append("status code: " + response.getStatusCode()) - .append("\n"); - stringB.append( - "content length: " + response.getContent().length) - .append("\n"); - long timeFetching = System.currentTimeMillis() - start; - stringB.append("fetched in : " + timeFetching + " msec"); - System.out.println(stringB); - } catch (Exception e) { - e.printStackTrace(); - } finally { - threads.remove(this); - } - } - } - - for (int i = 1; i < args.length; i++) { - Fetchable p = new Fetchable(args[i]); - threads.add(p); - new Thread(p).start(); - } - - while (threads.size() > 0) { - Thread.sleep(1000); - } - - protocol.cleanup(); - System.exit(0); - } - -} +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * DigitalPebble licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.digitalpebble.stormcrawler.protocol; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.lang.StringUtils; +import org.apache.storm.Config; + +import com.digitalpebble.stormcrawler.Metadata; +import com.digitalpebble.stormcrawler.util.ConfUtils; + +import crawlercommons.robots.BaseRobotRules; + +public abstract class AbstractHttpProtocol implements Protocol { + + private com.digitalpebble.stormcrawler.protocol.HttpRobotRulesParser robots; + + protected boolean skipRobots = false; + + protected boolean storeHTTPHeaders = false; + + protected boolean useCookies = false; + + @Override + public void configure(Config conf) { + this.skipRobots = ConfUtils.getBoolean(conf, "http.skip.robots", false); + this.storeHTTPHeaders = ConfUtils.getBoolean(conf, + "http.store.headers", false); + this.useCookies = ConfUtils.getBoolean(conf, "http.use.cookies", false); + robots = new HttpRobotRulesParser(conf); + } + + @Override + public BaseRobotRules getRobotRules(String url) { + if (this.skipRobots) + return RobotRulesParser.EMPTY_RULES; + return robots.getRobotRulesSet(this, url); + } + + @Override + public void cleanup() { + } + + public static String getAgentString(Config conf) { + return getAgentString(ConfUtils.getString(conf, 
"http.agent.name"), + ConfUtils.getString(conf, "http.agent.version"), + ConfUtils.getString(conf, "http.agent.description"), + ConfUtils.getString(conf, "http.agent.url"), + ConfUtils.getString(conf, "http.agent.email")); + } + + protected static String getAgentString(String agentName, + String agentVersion, String agentDesc, String agentURL, + String agentEmail) { + + StringBuilder buf = new StringBuilder(); + + buf.append(agentName); + + if (StringUtils.isNotBlank(agentVersion)) { + buf.append("/"); + buf.append(agentVersion); + } + + boolean hasAgentDesc = StringUtils.isNotBlank(agentDesc); + boolean hasAgentURL = StringUtils.isNotBlank(agentURL); + boolean hasAgentEmail = StringUtils.isNotBlank(agentEmail); + + if (hasAgentDesc || hasAgentEmail || hasAgentURL) { + buf.append(" ("); + + if (hasAgentDesc) { + buf.append(agentDesc); + if (hasAgentURL || hasAgentEmail) + buf.append("; "); + } + + if (hasAgentURL) { + buf.append(agentURL); + if (hasAgentEmail) + buf.append("; "); + } + + if (hasAgentEmail) { + buf.append(agentEmail); + } + + buf.append(")"); + } + + return buf.toString(); + } + + /** Called by extensions of this class **/ + protected static void main(AbstractHttpProtocol protocol, String args[]) + throws Exception { + Config conf = new Config(); + + ConfUtils.loadConf(args[0], conf); + protocol.configure(conf); + + Set threads = new HashSet<>(); + + class Fetchable implements Runnable { + String url; + + Fetchable(String url) { + this.url = url; + } + + public void run() { + + StringBuilder stringB = new StringBuilder(); + stringB.append(url).append("\n"); + + if (!protocol.skipRobots) { + BaseRobotRules rules = protocol.getRobotRules(url); + stringB.append("is allowed : ") + .append(rules.isAllowed(url)); + } + + Metadata md = new Metadata(); + long start = System.currentTimeMillis(); + ProtocolResponse response; + try { + response = protocol.getProtocolOutput(url, md); + stringB.append(response.getMetadata()).append("\n"); + 
stringB.append("status code: " + response.getStatusCode()) + .append("\n"); + stringB.append( + "content length: " + response.getContent().length) + .append("\n"); + long timeFetching = System.currentTimeMillis() - start; + stringB.append("fetched in : " + timeFetching + " msec"); + System.out.println(stringB); + } catch (Exception e) { + e.printStackTrace(); + } finally { + threads.remove(this); + } + } + } + + for (int i = 1; i < args.length; i++) { + Fetchable p = new Fetchable(args[i]); + threads.add(p); + new Thread(p).start(); + } + + while (threads.size() > 0) { + Thread.sleep(1000); + } + + protocol.cleanup(); + System.exit(0); + } + +} diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/httpclient/HttpProtocol.java b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/httpclient/HttpProtocol.java index fd12da053..578f30d51 100644 --- a/core/src/main/java/com/digitalpebble/stormcrawler/protocol/httpclient/HttpProtocol.java +++ b/core/src/main/java/com/digitalpebble/stormcrawler/protocol/httpclient/HttpProtocol.java @@ -1,256 +1,281 @@ -/** - * Licensed to DigitalPebble Ltd under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * DigitalPebble licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.digitalpebble.stormcrawler.protocol.httpclient; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.mutable.MutableBoolean; -import org.apache.http.Header; -import org.apache.http.HeaderIterator; -import org.apache.http.HttpEntity; -import org.apache.http.HttpHost; -import org.apache.http.HttpResponse; -import org.apache.http.StatusLine; -import org.apache.http.auth.AuthScope; -import org.apache.http.auth.UsernamePasswordCredentials; -import org.apache.http.client.ResponseHandler; -import org.apache.http.client.config.AuthSchemes; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.impl.client.BasicCredentialsProvider; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.impl.conn.DefaultProxyRoutePlanner; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; -import org.apache.http.util.Args; -import org.apache.http.util.ByteArrayBuffer; -import org.apache.storm.Config; -import org.slf4j.LoggerFactory; - -import com.digitalpebble.stormcrawler.Metadata; -import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol; -import com.digitalpebble.stormcrawler.protocol.ProtocolResponse; -import com.digitalpebble.stormcrawler.util.ConfUtils; - -/** - * Uses Apache httpclient to handle http and https - **/ - -public class HttpProtocol extends AbstractHttpProtocol implements - ResponseHandler { - - private static final org.slf4j.Logger LOG = LoggerFactory - .getLogger(HttpProtocol.class); - - private static final PoolingHttpClientConnectionManager CONNECTION_MANAGER = new PoolingHttpClientConnectionManager(); - - 
private int maxContent; - - private HttpClientBuilder builder; - - private RequestConfig requestConfig; - - @Override - public void configure(final Config conf) { - - super.configure(conf); - - // allow up to 200 connections or same as the number of threads used for - // fetching - int maxFetchThreads = ConfUtils.getInt(conf, "fetcher.threads.number", - 200); - CONNECTION_MANAGER.setMaxTotal(maxFetchThreads); - - CONNECTION_MANAGER.setDefaultMaxPerRoute(20); - - this.maxContent = ConfUtils.getInt(conf, "http.content.limit", -1); - - String userAgent = getAgentString( - ConfUtils.getString(conf, "http.agent.name"), - ConfUtils.getString(conf, "http.agent.version"), - ConfUtils.getString(conf, "http.agent.description"), - ConfUtils.getString(conf, "http.agent.url"), - ConfUtils.getString(conf, "http.agent.email")); - - builder = HttpClients.custom().setUserAgent(userAgent) - .setConnectionManager(CONNECTION_MANAGER) - .setConnectionManagerShared(true).disableRedirectHandling() - .disableAutomaticRetries(); - - int timeout = ConfUtils.getInt(conf, "http.timeout", 10000); - - RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() - .setSocketTimeout(timeout).setConnectTimeout(timeout) - .setConnectionRequestTimeout(timeout) - .setCookieSpec(CookieSpecs.STANDARD); - - String proxyHost = ConfUtils.getString(conf, "http.proxy.host", null); - int proxyPort = ConfUtils.getInt(conf, "http.proxy.port", 8080); - - boolean useProxy = proxyHost != null && proxyHost.length() > 0; - - // use a proxy? 
- if (useProxy) { - - String proxyUser = ConfUtils.getString(conf, "http.proxy.user", - null); - String proxyPass = ConfUtils.getString(conf, "http.proxy.pass", - null); - - if (StringUtils.isNotBlank(proxyUser) - && StringUtils.isNotBlank(proxyPass)) { - List authSchemes = new ArrayList<>(); - // Can make configurable and add more in future - authSchemes.add(AuthSchemes.BASIC); - requestConfigBuilder.setProxyPreferredAuthSchemes(authSchemes); - - BasicCredentialsProvider basicAuthCreds = new BasicCredentialsProvider(); - basicAuthCreds.setCredentials(new AuthScope(proxyHost, - proxyPort), new UsernamePasswordCredentials(proxyUser, - proxyPass)); - builder.setDefaultCredentialsProvider(basicAuthCreds); - } - - HttpHost proxy = new HttpHost(proxyHost, proxyPort); - DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner( - proxy); - builder.setRoutePlanner(routePlanner); - } - - requestConfig = requestConfigBuilder.build(); - } - - @Override - public ProtocolResponse getProtocolOutput(String url, Metadata md) - throws Exception { - - LOG.debug("HTTP connection manager stats {}", - CONNECTION_MANAGER.getTotalStats()); - - HttpGet httpget = new HttpGet(url); - httpget.setConfig(requestConfig); - - if (md != null) { - String lastModified = md.getFirstValue("last-modified"); - if (StringUtils.isNotBlank(lastModified)) { - httpget.addHeader("If-Modified-Since", lastModified); - } - - String ifNoneMatch = md.getFirstValue("etag"); - if (StringUtils.isNotBlank(ifNoneMatch)) { - httpget.addHeader("If-None-Match", ifNoneMatch); - } - } - - // no need to release the connection explicitly as this is handled - // automatically. The client itself must be closed though. 
- try (CloseableHttpClient httpclient = builder.build()) { - return httpclient.execute(httpget, this); - } - } - - @Override - public ProtocolResponse handleResponse(HttpResponse response) - throws IOException { - - StatusLine statusLine = response.getStatusLine(); - int status = statusLine.getStatusCode(); - - StringBuilder verbatim = new StringBuilder(); - if (storeHTTPHeaders) { - verbatim.append(statusLine.toString()).append("\r\n"); - } - - Metadata metadata = new Metadata(); - HeaderIterator iter = response.headerIterator(); - while (iter.hasNext()) { - Header header = iter.nextHeader(); - if (storeHTTPHeaders) { - verbatim.append(header.toString()).append("\r\n"); - } - metadata.addValue(header.getName().toLowerCase(Locale.ROOT), - header.getValue()); - } - - MutableBoolean trimmed = new MutableBoolean(); - - byte[] bytes = HttpProtocol.toByteArray(response.getEntity(), - maxContent, trimmed); - - if (trimmed.booleanValue()) { - metadata.setValue("http.trimmed", "true"); - LOG.warn("HTTP content trimmed to {}", bytes.length); - } - - if (storeHTTPHeaders) { - verbatim.append("\r\n"); - metadata.setValue("_response.headers_", verbatim.toString()); - } - - return new ProtocolResponse(bytes, status, metadata); - } - - private static final byte[] toByteArray(final HttpEntity entity, - int maxContent, MutableBoolean trimmed) throws IOException { - - if (entity == null) - return new byte[] {}; - - final InputStream instream = entity.getContent(); - if (instream == null) { - return null; - } - try { - Args.check(entity.getContentLength() <= Integer.MAX_VALUE, - "HTTP entity too large to be buffered in memory"); - int i = (int) entity.getContentLength(); - if (i < 0) { - i = 4096; - } - final ByteArrayBuffer buffer = new ByteArrayBuffer(i); - final byte[] tmp = new byte[4096]; - int l; - int total = 0; - while ((l = instream.read(tmp)) != -1) { - // check whether we need to trim - if (maxContent != -1 && total + l > maxContent) { - buffer.append(tmp, 0, maxContent - 
total); - trimmed.setValue(true); - break; - } - buffer.append(tmp, 0, l); - total += l; - } - return buffer.toByteArray(); - } finally { - instream.close(); - } - } - - public static void main(String args[]) throws Exception { - HttpProtocol.main(new HttpProtocol(), args); - } - +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * DigitalPebble licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.digitalpebble.stormcrawler.protocol.httpclient; + +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang.mutable.MutableBoolean; +import org.apache.http.Header; +import org.apache.http.HeaderIterator; +import org.apache.http.HttpEntity; +import org.apache.http.HttpHost; +import org.apache.http.HttpResponse; +import org.apache.http.StatusLine; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.ResponseHandler; +import org.apache.http.client.config.AuthSchemes; +import org.apache.http.client.config.CookieSpecs; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.conn.DefaultProxyRoutePlanner; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.http.util.Args; +import org.apache.http.util.ByteArrayBuffer; +import org.apache.storm.Config; +import org.slf4j.LoggerFactory; + +import com.digitalpebble.stormcrawler.Metadata; +import com.digitalpebble.stormcrawler.protocol.AbstractHttpProtocol; +import com.digitalpebble.stormcrawler.protocol.ProtocolResponse; +import com.digitalpebble.stormcrawler.util.ConfUtils; +import com.digitalpebble.stormcrawler.util.CookieConverter; +import org.apache.http.cookie.Cookie; + +/** + * Uses Apache httpclient to handle http and https + **/ + +public class HttpProtocol extends AbstractHttpProtocol implements + ResponseHandler { + + private static final org.slf4j.Logger LOG = LoggerFactory + 
.getLogger(HttpProtocol.class); + + private static final PoolingHttpClientConnectionManager CONNECTION_MANAGER = new PoolingHttpClientConnectionManager(); + + private int maxContent; + + private HttpClientBuilder builder; + + private RequestConfig requestConfig; + + public static final String RESPONSE_COOKIES_HEADER = "set-cookie"; + + @Override + public void configure(final Config conf) { + + super.configure(conf); + + // allow up to 200 connections or same as the number of threads used for + // fetching + int maxFetchThreads = ConfUtils.getInt(conf, "fetcher.threads.number", + 200); + CONNECTION_MANAGER.setMaxTotal(maxFetchThreads); + + CONNECTION_MANAGER.setDefaultMaxPerRoute(20); + + this.maxContent = ConfUtils.getInt(conf, "http.content.limit", -1); + + String userAgent = getAgentString( + ConfUtils.getString(conf, "http.agent.name"), + ConfUtils.getString(conf, "http.agent.version"), + ConfUtils.getString(conf, "http.agent.description"), + ConfUtils.getString(conf, "http.agent.url"), + ConfUtils.getString(conf, "http.agent.email")); + + builder = HttpClients.custom().setUserAgent(userAgent) + .setConnectionManager(CONNECTION_MANAGER) + .setConnectionManagerShared(true).disableRedirectHandling() + .disableAutomaticRetries(); + + int timeout = ConfUtils.getInt(conf, "http.timeout", 10000); + + RequestConfig.Builder requestConfigBuilder = RequestConfig.custom() + .setSocketTimeout(timeout).setConnectTimeout(timeout) + .setConnectionRequestTimeout(timeout) + .setCookieSpec(CookieSpecs.STANDARD); + + String proxyHost = ConfUtils.getString(conf, "http.proxy.host", null); + int proxyPort = ConfUtils.getInt(conf, "http.proxy.port", 8080); + + boolean useProxy = proxyHost != null && proxyHost.length() > 0; + + // use a proxy? 
+ if (useProxy) { + + String proxyUser = ConfUtils.getString(conf, "http.proxy.user", + null); + String proxyPass = ConfUtils.getString(conf, "http.proxy.pass", + null); + + if (StringUtils.isNotBlank(proxyUser) + && StringUtils.isNotBlank(proxyPass)) { + List authSchemes = new ArrayList<>(); + // Can make configurable and add more in future + authSchemes.add(AuthSchemes.BASIC); + requestConfigBuilder.setProxyPreferredAuthSchemes(authSchemes); + + BasicCredentialsProvider basicAuthCreds = new BasicCredentialsProvider(); + basicAuthCreds.setCredentials(new AuthScope(proxyHost, + proxyPort), new UsernamePasswordCredentials(proxyUser, + proxyPass)); + builder.setDefaultCredentialsProvider(basicAuthCreds); + } + + HttpHost proxy = new HttpHost(proxyHost, proxyPort); + DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner( + proxy); + builder.setRoutePlanner(routePlanner); + } + + requestConfig = requestConfigBuilder.build(); + } + + @Override + public ProtocolResponse getProtocolOutput(String url, Metadata md) + throws Exception { + + LOG.debug("HTTP connection manager stats {}", + CONNECTION_MANAGER.getTotalStats()); + + HttpGet httpget = new HttpGet(url); + httpget.setConfig(requestConfig); + + if (md != null) { + String lastModified = md.getFirstValue("last-modified"); + if (StringUtils.isNotBlank(lastModified)) { + httpget.addHeader("If-Modified-Since", lastModified); + } + + String ifNoneMatch = md.getFirstValue("etag"); + if (StringUtils.isNotBlank(ifNoneMatch)) { + httpget.addHeader("If-None-Match", ifNoneMatch); + } + + if (useCookies) { + addCookiesToRequest(httpget, md); + } + } + + // no need to release the connection explicitly as this is handled + // automatically. The client itself must be closed though. 
+ try (CloseableHttpClient httpclient = builder.build()) { + return httpclient.execute(httpget, this); + } + } + + private void addCookiesToRequest(HttpGet httpget, Metadata md) { + String[] cookieStrings = md.getValues(RESPONSE_COOKIES_HEADER); + if (cookieStrings != null && cookieStrings.length > 0) { + List cookies; + try { + cookies = CookieConverter.getCookies(cookieStrings, httpget + .getURI().toURL()); + for (Cookie c : cookies) { + httpget.addHeader("Cookie", + c.getName() + "=" + c.getValue()); + } + } catch (MalformedURLException e) { // Bad url , nothing to do + } + } + } + + @Override + public ProtocolResponse handleResponse(HttpResponse response) + throws IOException { + + StatusLine statusLine = response.getStatusLine(); + int status = statusLine.getStatusCode(); + + StringBuilder verbatim = new StringBuilder(); + if (storeHTTPHeaders) { + verbatim.append(statusLine.toString()).append("\r\n"); + } + + Metadata metadata = new Metadata(); + HeaderIterator iter = response.headerIterator(); + while (iter.hasNext()) { + Header header = iter.nextHeader(); + if (storeHTTPHeaders) { + verbatim.append(header.toString()).append("\r\n"); + } + metadata.addValue(header.getName().toLowerCase(Locale.ROOT), + header.getValue()); + } + + MutableBoolean trimmed = new MutableBoolean(); + + byte[] bytes = HttpProtocol.toByteArray(response.getEntity(), + maxContent, trimmed); + + if (trimmed.booleanValue()) { + metadata.setValue("http.trimmed", "true"); + LOG.warn("HTTP content trimmed to {}", bytes.length); + } + + if (storeHTTPHeaders) { + verbatim.append("\r\n"); + metadata.setValue("_response.headers_", verbatim.toString()); + } + + return new ProtocolResponse(bytes, status, metadata); + } + + private static final byte[] toByteArray(final HttpEntity entity, + int maxContent, MutableBoolean trimmed) throws IOException { + + if (entity == null) + return new byte[] {}; + + final InputStream instream = entity.getContent(); + if (instream == null) { + return null; + } + 
try { + Args.check(entity.getContentLength() <= Integer.MAX_VALUE, + "HTTP entity too large to be buffered in memory"); + int i = (int) entity.getContentLength(); + if (i < 0) { + i = 4096; + } + final ByteArrayBuffer buffer = new ByteArrayBuffer(i); + final byte[] tmp = new byte[4096]; + int l; + int total = 0; + while ((l = instream.read(tmp)) != -1) { + // check whether we need to trim + if (maxContent != -1 && total + l > maxContent) { + buffer.append(tmp, 0, maxContent - total); + trimmed.setValue(true); + break; + } + buffer.append(tmp, 0, l); + total += l; + } + return buffer.toByteArray(); + } finally { + instream.close(); + } + } + + public static void main(String args[]) throws Exception { + HttpProtocol.main(new HttpProtocol(), args); + } + } \ No newline at end of file diff --git a/core/src/main/java/com/digitalpebble/stormcrawler/util/CookieConverter.java b/core/src/main/java/com/digitalpebble/stormcrawler/util/CookieConverter.java new file mode 100644 index 000000000..ce71423ac --- /dev/null +++ b/core/src/main/java/com/digitalpebble/stormcrawler/util/CookieConverter.java @@ -0,0 +1,168 @@ +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * DigitalPebble licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.digitalpebble.stormcrawler.util; + +import java.net.URL; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import org.apache.http.cookie.Cookie; +import org.apache.http.impl.cookie.BasicClientCookie; + +/** + * Helper to extract cookies from cookies string. + * + */ +public class CookieConverter { + + private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat( + "EEE, dd MMM yyyy HH:mm:ss zzz"); + + /** + * Get a list of cookies based on the cookies string taken from response + * header and the target url. + * + * @param cookiesString + * the value of the http header for "Cookie" in the http + * response. + * @param targetURL + * the url for which we wish to pass the cookies in the request. + * @return List off cookies to add to the request. + */ + public static List getCookies(String[] cookiesStrings, URL targetURL) { + ArrayList list = new ArrayList(); + + for (String cs : cookiesStrings) { + String name = null; + String value = null; + + String expires = null; + String domain = null; + String path = null; + + boolean secure = false; + + String[] tokens = cs.split(";"); + + int equals = tokens[0].indexOf("="); + name = tokens[0].substring(0, equals); + value = tokens[0].substring(equals + 1); + + for (int i = 1; i < tokens.length; i++) { + String ti = tokens[i].trim(); + if (ti.equalsIgnoreCase("secure")) + secure = true; + if (ti.toLowerCase().startsWith("path=")) { + path = ti.substring(5); + } + if (ti.toLowerCase().startsWith("domain=")) { + domain = ti.substring(7); + } + if (ti.toLowerCase().startsWith("expires=")) { + expires = ti.substring(8); + } + } + + BasicClientCookie cookie = new BasicClientCookie(name, value); + + // check domain + if (domain != null) { + cookie.setDomain(domain); + + if (!checkDomainMatchToUrl(domain, targetURL.getHost())) + continue; + } + + // check path + if (path != null) { + cookie.setPath(path); 
+ + if (!path.equals("") && !path.equals("/") + && !targetURL.getPath().startsWith(path)) + continue; + } + + // check secure + if (secure) { + cookie.setSecure(secure); + + if (!targetURL.getProtocol().equalsIgnoreCase("https")) + continue; + } + + // check expiration + if (expires != null) { + try { + Date expirationDate = DATE_FORMAT.parse(expires); + cookie.setExpiryDate(expirationDate); + + // check that it hasn't expired? + if (cookie.isExpired(new Date())) + continue; + + cookie.setExpiryDate(expirationDate); + } catch (ParseException e) { + // ignore exceptions + } + } + + // attach additional infos to cookie + list.add(cookie); + } + + return list; + } + + /** + * Helper method to check if url matches a cookie domain. + * + * @param cookieDomain + * the domain in the cookie + * @param urlHostName + * the host name of the url + * @return does the cookie match the host name + */ + public static boolean checkDomainMatchToUrl(String cookieDomain, + String urlHostName) { + try { + if (cookieDomain.startsWith(".")) { + cookieDomain = cookieDomain.substring(1); + } + String[] domainTokens = cookieDomain.split("\\."); + String[] hostTokens = urlHostName.split("\\."); + + int tokenDif = hostTokens.length - domainTokens.length; + if (tokenDif < 0) { + return false; + } + + for (int i = domainTokens.length - 1; i >= 0; i--) { + if (!domainTokens[i].equalsIgnoreCase(hostTokens[i + tokenDif])) { + return false; + } + + } + return true; + } catch (Exception e) { + return true; + } + } + +} diff --git a/core/src/test/java/com/digitalpebble/stormcrawler/util/CookieConverterTest.java b/core/src/test/java/com/digitalpebble/stormcrawler/util/CookieConverterTest.java new file mode 100644 index 000000000..e99f670dd --- /dev/null +++ b/core/src/test/java/com/digitalpebble/stormcrawler/util/CookieConverterTest.java @@ -0,0 +1,320 @@ +/** + * Licensed to DigitalPebble Ltd under one or more + * contributor license agreements. 
+ * See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * DigitalPebble licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.digitalpebble.stormcrawler.util;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import org.apache.http.cookie.Cookie;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for {@link CookieConverter}: parsing of cookie strings and filtering
+ * of the cookies against the target url (domain, path, secure flag, expiry).
+ */
+public class CookieConverterTest {
+
+    private static final String SECURED_URL = "https://someurl.com";
+    private static final String UNSECURED_URL = "http://someurl.com";
+    private static final String DUMMY_COOKIE_HEADER = "nice tasty test cookie header!";
+    private static final String DUMMY_COOKIE_VALUE = "nice tasty test cookie value!";
+
+    /** An expiry date far in the future, with the weekday that matches it. */
+    private static final String FUTURE_EXPIRES = "Sun, 11 Apr 2117 07:13:39 -0000";
+
+    /** An expiry date in the past, with the weekday that matches it. */
+    private static final String PAST_EXPIRES = "Mon, 11 Apr 2016 07:13:39 -0000";
+
+    @Test
+    public void testSimpleCookieAndUrl() {
+        List<Cookie> result = convert(UNSECURED_URL,
+                dummyCookie(null, null, null, null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testNotExpiredCookie() {
+        List<Cookie> result = convert(UNSECURED_URL,
+                dummyCookie(null, FUTURE_EXPIRES, null, null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testExpiredCookie() {
+        List<Cookie> result = convert(UNSECURED_URL,
+                dummyCookie(null, PAST_EXPIRES, null, null));
+        Assert.assertEquals("Should have 0 cookies, since cookie was expired",
+                0, result.size());
+    }
+
+    @Test
+    public void testValidPath() {
+        List<Cookie> result = convert(UNSECURED_URL + "/somepage",
+                dummyCookie(null, null, "/", null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testValidPath2() {
+        List<Cookie> result = convert(UNSECURED_URL,
+                dummyCookie(null, null, "/", null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testValidPath3() {
+        List<Cookie> result = convert(UNSECURED_URL + "/someFolder",
+                dummyCookie(null, null, "/someFolder", null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testValidPath4() {
+        List<Cookie> result = convert(UNSECURED_URL
+                + "/someFolder/SomeOtherFolder",
+                dummyCookie(null, null, "/someFolder", null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testInvalidPath() {
+        List<Cookie> result = convert(UNSECURED_URL
+                + "/someOtherFolder/SomeFolder",
+                dummyCookie(null, null, "/someFolder", null));
+        Assert.assertEquals("path mismatch, should have 0 cookies", 0,
+                result.size());
+    }
+
+    @Test
+    public void testValidDomain() {
+        List<Cookie> result = convert(UNSECURED_URL
+                + "/someFolder/SomeOtherFolder",
+                dummyCookie("someurl.com", null, null, null));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testInvalidDomain() {
+        List<Cookie> result = convert(UNSECURED_URL
+                + "/someFolder/SomeOtherFolder",
+                dummyCookie("someOtherUrl.com", null, null, null));
+        Assert.assertEquals("Domain is not valid - Should have 0 cookies", 0,
+                result.size());
+    }
+
+    @Test
+    public void testSecurFlagHttp() {
+        List<Cookie> result = convert(UNSECURED_URL
+                + "/someFolder/SomeOtherFolder",
+                dummyCookie(null, null, null, Boolean.TRUE));
+        Assert.assertEquals(
+                "Target url is not secured - Should have 0 cookies", 0,
+                result.size());
+    }
+
+    @Test
+    public void testSecurFlagHttpS() {
+        List<Cookie> result = convert(SECURED_URL
+                + "/someFolder/SomeOtherFolder",
+                dummyCookie(null, null, null, Boolean.TRUE));
+        assertSingleDummyCookie(
+                "Target url is secured - Should have 1 cookie", result);
+    }
+
+    @Test
+    public void testFullCookie() {
+        List<Cookie> result = convert(SECURED_URL
+                + "/someFolder/SomeOtherFolder",
+                dummyCookie("someurl.com", FUTURE_EXPIRES, "/", true));
+        assertSingleDummyCookie("Should have 1 cookie", result);
+    }
+
+    @Test
+    public void test2Cookies() {
+        String first = buildCookieString(DUMMY_COOKIE_HEADER,
+                DUMMY_COOKIE_VALUE, "someurl.com", FUTURE_EXPIRES, "/", true);
+        String second = buildCookieString(DUMMY_COOKIE_HEADER + "2",
+                DUMMY_COOKIE_VALUE + "2", "someurl.com", FUTURE_EXPIRES, "/",
+                true);
+
+        List<Cookie> result = convert(SECURED_URL
+                + "/someFolder/SomeOtherFolder", first, second);
+
+        Assert.assertEquals("Should have 2 cookies", 2, result.size());
+        Assert.assertEquals("Cookie header should be as defined",
+                DUMMY_COOKIE_HEADER, result.get(0).getName());
+        Assert.assertEquals("Cookie value should be as defined",
+                DUMMY_COOKIE_VALUE, result.get(0).getValue());
+
+        Assert.assertEquals("Cookie header should be as defined",
+                DUMMY_COOKIE_HEADER + "2", result.get(1).getName());
+        Assert.assertEquals("Cookie value should be as defined",
+                DUMMY_COOKIE_VALUE + "2", result.get(1).getValue());
+    }
+
+    @Test
+    public void testDomainsChecker() {
+        Assert.assertTrue("domain is valid", CookieConverter
+                .checkDomainMatchToUrl(".example.com", "www.example.com"));
+    }
+
+    @Test
+    public void testDomainsChecker2() {
+        Assert.assertTrue("domain is valid", CookieConverter
+                .checkDomainMatchToUrl(".example.com", "example.com"));
+    }
+
+    @Test
+    public void testDomainsChecker3() {
+        Assert.assertTrue("domain is valid", CookieConverter
+                .checkDomainMatchToUrl("example.com", "www.example.com"));
+    }
+
+    @Test
+    public void testDomainsChecker4() {
+        Assert.assertFalse("domain is not valid", CookieConverter
+                .checkDomainMatchToUrl("example.com", "anotherexample.com"));
+    }
+
+    /** Runs the converter on the given cookie strings for the given url. */
+    private static List<Cookie> convert(String url, String... cookieStrings) {
+        return CookieConverter.getCookies(cookieStrings, getUrl(url));
+    }
+
+    /** Checks that the result holds exactly the dummy cookie. */
+    private static void assertSingleDummyCookie(String countMessage,
+            List<Cookie> result) {
+        Assert.assertEquals(countMessage, 1, result.size());
+        Assert.assertEquals("Cookie header should be as defined",
+                DUMMY_COOKIE_HEADER, result.get(0).getName());
+        Assert.assertEquals("Cookie value should be as defined",
+                DUMMY_COOKIE_VALUE, result.get(0).getValue());
+    }
+
+    /**
+     * Parses a test url. A malformed url is a mistake in the test itself, so
+     * it is reported right away instead of surfacing later as a
+     * NullPointerException.
+     */
+    private static URL getUrl(String urlString) {
+        try {
+            return new URL(urlString);
+        } catch (MalformedURLException e) {
+            throw new IllegalArgumentException("Invalid test url: "
+                    + urlString, e);
+        }
+    }
+
+    /** Builds a cookie string for the dummy cookie name and value. */
+    private static String dummyCookie(String domain, String expires,
+            String path, Boolean secure) {
+        return buildCookieString(DUMMY_COOKIE_HEADER, DUMMY_COOKIE_VALUE,
+                domain, expires, path, secure);
+    }
+
+    /**
+     * Builds a cookie string of the form
+     * {@code name=value;domain=...;expires=...;path=...;secure;}. Attributes
+     * passed as {@code null} are left out; the secure flag is only written
+     * when it is {@code TRUE}.
+     */
+    private static String buildCookieString(String header, String value,
+            String domain, String expires, String path, Boolean secure) {
+        StringBuilder builder = new StringBuilder(
+                buildCookiePart(header, value));
+        if (domain != null) {
+            builder.append(buildCookiePart("domain", domain));
+        }
+
+        if (expires != null) {
+            builder.append(buildCookiePart("expires", expires));
+        }
+
+        if (path != null) {
+            builder.append(buildCookiePart("path", path));
+        }
+
+        // Boolean.FALSE must not produce a secure cookie
+        if (Boolean.TRUE.equals(secure)) {
+            builder.append("secure;");
+        }
+
+        return builder.toString();
+    }
+
+    private static String buildCookiePart(String partName, String partValue) {
+        return partName + "=" + partValue + ";";
+    }
+
+}