From d9486a5567ceb9a6c77e6fe3994350f37a433510 Mon Sep 17 00:00:00 2001 From: Balaji Date: Wed, 14 Oct 2015 20:10:16 -0700 Subject: [PATCH] fix for NUTCH-2141 contributed by Balaji Gurumurthy --- .../nutch/protocol/interactiveselenium/HttpResponse.java | 3 +-- .../handlers/DefalultMultiInteractionHandler.java | 8 +++++--- .../handlers/DefaultClickAllAjaxLinksHandler.java | 8 +++++--- .../interactiveselenium/handlers/DefaultHandler.java | 4 +++- .../handlers/InteractiveSeleniumHandler.java | 2 +- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java index 548153af4b..a1ccf29dcf 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java @@ -277,8 +277,7 @@ private void readPlainContent(URL url) throws IOException { WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf); - handler.processDriver(driver); - processedPage += HttpWebClient.getHTMLContent(driver, conf); + processedPage += handler.processDriver(driver); HttpWebClient.cleanUpDriver(driver); } diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java index d27b474fdb..f3c0f6fea8 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java +++ 
b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java @@ -32,10 +32,11 @@ public class DefalultMultiInteractionHandler implements private static final Logger LOG = LoggerFactory .getLogger(DefalultMultiInteractionHandler.class); - public void processDriver(WebDriver driver) { + public String processDriver(WebDriver driver) { + // loop and get multiple pages in this string + String accumulatedData = ""; try { - // loop and get multiple pages in this string - String accumulatedData = ""; + // append the string to the last page's driver JavascriptExecutor jsx = (JavascriptExecutor) driver; jsx.executeScript("document.body.innerHTML=document.body.innerHTML " @@ -43,6 +44,7 @@ public void processDriver(WebDriver driver) { } catch (Exception e) { LOG.info(StringUtils.stringifyException(e)); } + return accumulatedData; } public boolean shouldProcessURL(String URL) { diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java index 4d97e17d75..e3423d5401 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java @@ -38,10 +38,11 @@ public class DefaultClickAllAjaxLinksHandler implements InteractiveSeleniumHandl private static final Logger LOG = LoggerFactory .getLogger(DefaultClickAllAjaxLinksHandler.class); - public void processDriver(WebDriver driver) { - + public String processDriver(WebDriver driver) { + + String accumulatedData = ""; try { - String accumulatedData = ""; + 
driver.findElement(By.tagName("body")).getAttribute("innerHTML"); Configuration conf = NutchConfiguration.create(); @@ -78,6 +79,7 @@ public void processDriver(WebDriver driver) { } catch (Exception e) { LOG.info(StringUtils.stringifyException(e)); } + return accumulatedData; } public boolean shouldProcessURL(String URL) { diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java index 70f9245a18..ae7b97e9b6 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java @@ -20,7 +20,9 @@ import org.openqa.selenium.WebDriver; public class DefaultHandler implements InteractiveSeleniumHandler { - public void processDriver(WebDriver driver) {} + public String processDriver(WebDriver driver) { + return driver.getPageSource(); /* never null: HttpResponse appends this value with +=, so null would become the text "null" */ + } public boolean shouldProcessURL(String URL) { return true; diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java index 81877a7476..9ce1e26302 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java @@ -20,6 +20,6 @@ import org.openqa.selenium.WebDriver; public interface InteractiveSeleniumHandler { - public void processDriver(WebDriver driver); + public String processDriver(WebDriver driver); public
boolean shouldProcessURL(String URL); }