From dfc6214f281d9f39fbe911ff062a6a27b9ef932e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Mon, 24 Aug 2015 15:29:18 +0200 Subject: [PATCH 01/14] add cleaned up version of momer's protocol-selenium plugin --- ivy/ivy.xml | 7 ++ src/plugin/build.xml | 1 + src/plugin/protocol-selenium/build.xml | 35 ++++++++ src/plugin/protocol-selenium/ivy.xml | 41 +++++++++ src/plugin/protocol-selenium/plugin.xml | 51 +++++++++++ .../apache/nutch/protocol/selenium/Http.java | 58 +++++++++++++ .../nutch/protocol/selenium/HttpResponse.java | 86 +++++++++++++++++++ .../nutch/protocol/selenium/package.html | 5 ++ src/plugin/protocol-selenium/src/pom.xml | 12 +++ .../nutch/protocol/htmlunit/package.html | 5 ++ 10 files changed, 301 insertions(+) create mode 100644 src/plugin/protocol-selenium/build.xml create mode 100644 src/plugin/protocol-selenium/ivy.xml create mode 100644 src/plugin/protocol-selenium/plugin.xml create mode 100644 src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java create mode 100644 src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java create mode 100644 src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html create mode 100644 src/plugin/protocol-selenium/src/pom.xml create mode 100644 src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html diff --git a/ivy/ivy.xml b/ivy/ivy.xml index d05dcf4635..b1efcb6a63 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -159,6 +159,13 @@ + + + + + + + diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 3c8df80398..25508276a5 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -62,6 +62,7 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml new file mode 100644 index 0000000000..dc7a2c8e20 --- /dev/null +++ b/src/plugin/protocol-selenium/ivy.xml @@ -0,0 +1,41 @@ + + + + + + + + + + Apache Nutch + + + + + + + + + + + + + + + + diff --git a/src/plugin/protocol-selenium/plugin.xml b/src/plugin/protocol-selenium/plugin.xml new file mode 100644 index 0000000000..313e89fe9c --- /dev/null +++ b/src/plugin/protocol-selenium/plugin.xml @@ -0,0 +1,51 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java new file mode 100644 index 0000000000..eba5779298 --- /dev/null +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java @@ -0,0 +1,58 @@ +package org.apache.nutch.protocol.selenium; + +// JDK imports +import java.io.IOException; +import java.net.URL; +import java.util.Collection; +import java.util.HashSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.storage.WebPage.Field; + +import org.apache.nutch.protocol.selenium.HttpResponse; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class Http extends HttpBase { + + public static final Logger LOG = LoggerFactory.getLogger(Http.class); + + private static final Collection FIELDS = new HashSet(); + + static { + FIELDS.add(WebPage.Field.MODIFIED_TIME); + FIELDS.add(WebPage.Field.HEADERS); + } + + public Http() { + super(LOG); + } + + @Override + public void setConf(Configuration conf) { + super.setConf(conf); + } + + public static void main(String[] args) throws Exception { + Http http = new Http(); + http.setConf(NutchConfiguration.create()); + main(http, args); + } + + @Override + protected Response getResponse(URL url, WebPage page, boolean redirect) + throws ProtocolException, IOException { + return new HttpResponse(this, url, page, getConf()); + } + + @Override + public Collection getFields() { + return FIELDS; + } +} diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java new file mode 100644 index 0000000000..37969e64f4 --- /dev/null +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -0,0 +1,86 @@ +package org.apache.nutch.protocol.selenium; + +// JDK imports + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpException; +import org.apache.nutch.storage.WebPage; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.support.ui.WebDriverWait; + +import java.io.EOFException; +import java.io.IOException; +import java.io.PushbackInputStream; +import java.net.URL; + +// import org.apache.nutch.crawl.CrawlDatum; + +/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */ + +public class HttpResponse implements Response { + + private Http http; + private URL url; + private String orig; + private String base; + private byte[] content; + private int code; + private Metadata headers = new SpellCheckedMetadata(); + + /** + * The nutch configuration + */ + private Configuration conf = null; + + public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws ProtocolException, IOException { + + this.conf = conf; + this.http = http; + this.url = url; + this.orig = url.toString(); + this.base = url.toString(); + + FirefoxDriver driver = new FirefoxDriver(); + try { + driver.get(url.toString()); + // Wait for the page to load, timeout after 3 seconds + new WebDriverWait(driver, 3); + + String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); + code = 200; + content = innerHtml.getBytes("UTF-8"); + } finally { + driver.close(); + } + } + + public URL getUrl() { + return url; + } + + public int getCode() { + return code; + } + + public String getHeader(String name) { + return headers.get(name); + } + + /* ------------------------- * + * * + * ------------------------- */ + + public Metadata getHeaders() { + return headers; + } + + public byte[] getContent() { + return content; + } +} diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html new file mode 100644 index 0000000000..75cd5b5bca --- /dev/null +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html @@ -0,0 +1,5 @@ + + +

Protocol plugin which supports retrieving documents via selenium.

+ + diff --git a/src/plugin/protocol-selenium/src/pom.xml b/src/plugin/protocol-selenium/src/pom.xml new file mode 100644 index 0000000000..8067007c3e --- /dev/null +++ b/src/plugin/protocol-selenium/src/pom.xml @@ -0,0 +1,12 @@ + + + 4.0.0 + + groupId + protocol-selenium + 1.0-SNAPSHOT + + + \ No newline at end of file diff --git a/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html new file mode 100644 index 0000000000..bb2a98eb7a --- /dev/null +++ b/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html @@ -0,0 +1,5 @@ + + +

Protocol plugin which supports retrieving documents via the htmlunit.

+ + From 87c875f10c5565e9b13a35bef33f60fcbf9b95d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Mon, 24 Aug 2015 15:39:59 +0200 Subject: [PATCH 02/14] remove pom file --- src/plugin/protocol-selenium/src/pom.xml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 src/plugin/protocol-selenium/src/pom.xml diff --git a/src/plugin/protocol-selenium/src/pom.xml b/src/plugin/protocol-selenium/src/pom.xml deleted file mode 100644 index 8067007c3e..0000000000 --- a/src/plugin/protocol-selenium/src/pom.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - 4.0.0 - - groupId - protocol-selenium - 1.0-SNAPSHOT - - - \ No newline at end of file From 12a42d086052c3398c7823230da4786d13742b07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 09:59:35 +0200 Subject: [PATCH 03/14] remove package.html files --- .../src/java/org/apache/nutch/protocol/selenium/package.html | 5 ----- .../classes/org/apache/nutch/protocol/htmlunit/package.html | 5 ----- 2 files changed, 10 deletions(-) delete mode 100644 src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html delete mode 100644 src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html deleted file mode 100644 index 75cd5b5bca..0000000000 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html +++ /dev/null @@ -1,5 +0,0 @@ - - -

Protocol plugin which supports retrieving documents via selenium.

- - diff --git a/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html deleted file mode 100644 index bb2a98eb7a..0000000000 --- a/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html +++ /dev/null @@ -1,5 +0,0 @@ - - -

Protocol plugin which supports retrieving documents via the htmlunit.

- - From 7b309cc98e5dfdf9fd99d35261dd2aa2fc052144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:01:07 +0200 Subject: [PATCH 04/14] add license headers --- .../apache/nutch/protocol/selenium/Http.java | 17 +++++++++++++++++ .../nutch/protocol/selenium/HttpResponse.java | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java index eba5779298..59ddfbaa20 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.nutch.protocol.selenium; // JDK imports diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 37969e64f4..9714503b97 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.nutch.protocol.selenium; // JDK imports From 36313709959a57159ef164064da526ed38500540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:01:29 +0200 Subject: [PATCH 05/14] streamline imports --- .../org/apache/nutch/protocol/selenium/HttpResponse.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 9714503b97..b500eed927 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -17,8 +17,6 @@ package org.apache.nutch.protocol.selenium; -// JDK imports - import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.SpellCheckedMetadata; @@ -31,9 +29,7 @@ import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.support.ui.WebDriverWait; -import java.io.EOFException; import java.io.IOException; -import java.io.PushbackInputStream; import java.net.URL; // import org.apache.nutch.crawl.CrawlDatum; @@ -63,7 +59,7 @@ public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws this.orig = url.toString(); this.base = url.toString(); - FirefoxDriver driver = new FirefoxDriver(); + WebDriver driver = new FirefoxDriver(); try { driver.get(url.toString()); // Wait for the page to load, timeout after 3 seconds From 2b5b4128f4f39ac156e0bb1aece6ccb315738185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:02:32 +0200 Subject: [PATCH 06/14] clean up throws and imports --- .../org/apache/nutch/protocol/selenium/HttpResponse.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index b500eed927..25b382afcd 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -21,8 +21,6 @@ import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.SpellCheckedMetadata; import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.protocol.http.api.HttpException; import org.apache.nutch.storage.WebPage; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; @@ -30,6 +28,7 @@ import org.openqa.selenium.support.ui.WebDriverWait; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.URL; // import org.apache.nutch.crawl.CrawlDatum; @@ -51,7 +50,7 @@ public class HttpResponse implements Response { */ private Configuration conf = null; - public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws ProtocolException, IOException { + public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws UnsupportedEncodingException { this.conf = conf; this.http = http; From 52b9a39e44533024696a7bf03305771d9a3e0616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:02:54 +0200 Subject: [PATCH 07/14] remove same-package-import --- .../src/java/org/apache/nutch/protocol/selenium/Http.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java index 59ddfbaa20..b086811b17 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java @@ -31,8 +31,6 @@ import org.apache.nutch.storage.WebPage; import org.apache.nutch.storage.WebPage.Field; -import org.apache.nutch.protocol.selenium.HttpResponse; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; From 7ccf089433348772b1dfdfa399082134cb00396f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:05:04 +0200 Subject: [PATCH 08/14] add clean target to build.xml --- src/plugin/build.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 25508276a5..1a7a4cdc01 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -153,5 +153,6 @@ + From e207a717a94ad2b3cc734d3fa8e0c53bcbf0964f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:05:47 +0200 Subject: [PATCH 09/14] move dependencies from nutch core to plugin --- ivy/ivy.xml | 7 ------- src/plugin/protocol-selenium/ivy.xml | 6 ++++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index b1efcb6a63..d05dcf4635 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -159,13 +159,6 @@ - - - - - - - diff --git a/src/plugin/protocol-selenium/ivy.xml b/src/plugin/protocol-selenium/ivy.xml index dc7a2c8e20..f1d63c1f83 100644 --- a/src/plugin/protocol-selenium/ivy.xml +++ b/src/plugin/protocol-selenium/ivy.xml @@ -36,6 +36,12 @@ + + + + + + From 4677ec03d8c21aef5f4b553839d71497561df3f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:52:32 +0200 Subject: [PATCH 10/14] add override annotations --- .../org/apache/nutch/protocol/selenium/HttpResponse.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 25b382afcd..6f04f6ab45 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -72,26 +72,27 @@ public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws } } + @Override public URL getUrl() { return url; } + @Override public int getCode() { return code; } + @Override public String getHeader(String name) { return headers.get(name); } - /* ------------------------- * - * * - * ------------------------- */ - + @Override public Metadata getHeaders() { return headers; } + @Override public byte[] getContent() { return content; } From 5b8810a7667ed554cdb8ad949cfa1209357a343f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 10:59:04 +0200 Subject: [PATCH 11/14] add supposedly working webdriverwait with timeout from configuration's http.timeout field --- .../apache/nutch/protocol/selenium/HttpResponse.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 6f04f6ab45..257971e363 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -17,6 +17,7 @@ package org.apache.nutch.protocol.selenium; +import com.google.common.base.Predicate; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.SpellCheckedMetadata; @@ -62,8 +63,13 @@ public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws try { driver.get(url.toString()); // Wait for the page to load, timeout after 3 seconds - new WebDriverWait(driver, 3); - + WebDriverWait webDriverWait = new WebDriverWait(driver, conf.getInt("http.timeout", 10000)); + webDriverWait.until(new Predicate() { + @Override + public boolean apply(WebDriver webDriver) { + return webDriver.findElement(By.tagName("body")) != null; + } + }); String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); code = 200; content = innerHtml.getBytes("UTF-8"); From b9c3fb1c3fa3430988b11013bae6287783d99574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 11:16:40 +0200 Subject: [PATCH 12/14] add configurable minimum wait period(described as render time) before attempting to fetch document body. --- conf/nutch-default.xml | 8 ++++++++ .../nutch/protocol/selenium/HttpResponse.java | 14 ++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 576a080935..8df58c68e4 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -196,6 +196,14 @@ The default network timeout, in milliseconds. + + http.min.render + 1500 + The default minimum amount of time a render-supporting fetcher should wait before returning page content, + in milliseconds. Should be higher than http.timeout, or unpredictable behaviour might occur. + + + http.max.delays 100 diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 257971e363..14bf693425 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -27,6 +27,8 @@ import org.openqa.selenium.WebDriver; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.support.ui.WebDriverWait; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.UnsupportedEncodingException; @@ -38,6 +40,7 @@ public class HttpResponse implements Response { + private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class); private Http http; private URL url; private String orig; @@ -56,14 +59,19 @@ public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws this.conf = conf; this.http = http; this.url = url; - this.orig = url.toString(); this.base = url.toString(); WebDriver driver = new FirefoxDriver(); try { + int timeout = http.getTimeout(); + + // This should be extracted to a HTTPRenderBase class or similar + int sleep = conf.getInt("http.min.render", 1500); + driver.get(url.toString()); // Wait for the page to load, timeout after 3 seconds - WebDriverWait webDriverWait = new WebDriverWait(driver, conf.getInt("http.timeout", 10000)); + WebDriverWait webDriverWait = new WebDriverWait(driver, timeout); + Thread.sleep(Math.min(sleep, timeout)); webDriverWait.until(new Predicate() { @Override public boolean apply(WebDriver webDriver) { @@ -73,6 +81,8 @@ public boolean apply(WebDriver webDriver) { String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); code = 200; content = innerHtml.getBytes("UTF-8"); + } catch (InterruptedException e) { + LOG.warn("WebDriver was interrupted before trying to fetch response", e); } finally { driver.close(); } From 480804786288e00602f1d4f746f3c20656423deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 11:17:22 +0200 Subject: [PATCH 13/14] cleanup unused code --- .../org/apache/nutch/protocol/selenium/HttpResponse.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java index 14bf693425..b290487857 100644 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java @@ -30,21 +30,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URL; -// import org.apache.nutch.crawl.CrawlDatum; - -/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */ - public class HttpResponse implements Response { private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class); private Http http; private URL url; - private String orig; - private String base; private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); @@ -59,7 +52,6 @@ public HttpResponse(Http http, URL url, WebPage page, Configuration conf) throws this.conf = conf; this.http = http; this.url = url; - this.base = url.toString(); WebDriver driver = new FirefoxDriver(); try { From ab2ba618d3d75e43a710613f026b2a3abeadbf08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eivind=20Vegsundv=C3=A5g?= Date: Tue, 25 Aug 2015 12:14:06 +0200 Subject: [PATCH 14/14] correct typo --- conf/nutch-default.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 8df58c68e4..85c5222b10 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -200,7 +200,7 @@ http.min.render 1500 The default minimum amount of time a render-supporting fetcher should wait before returning page content, - in milliseconds. Should be higher than http.timeout, or unpredictable behaviour might occur. + in milliseconds. Should be lower than http.timeout, or unpredictable behaviour might occur.