From cd742353283410cc2f750af16a3ca6e286525193 Mon Sep 17 00:00:00 2001 From: Asitang Mishra Date: Mon, 21 Sep 2015 14:32:41 -0700 Subject: [PATCH] made changes for NUTCH-2108 and formatted the previously unformatted code for this plugin --- .../protocol/selenium/HttpWebClient.java | 188 ++++++++++-------- .../interactiveselenium/HttpResponse.java | 117 ++++++----- .../handlers/DefaultHandler.java | 16 +- .../handlers/InteractiveSeleniumHandler.java | 9 +- 4 files changed, 195 insertions(+), 135 deletions(-) diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java index 8cd670186a..23fe3005f4 100644 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -46,106 +46,129 @@ public class HttpWebClient { - private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class); + private static final Logger LOG = LoggerFactory + .getLogger(HttpWebClient.class); public static ThreadLocal threadWebDriver = new ThreadLocal() { @Override - protected WebDriver initialValue() - { + protected WebDriver initialValue() { FirefoxProfile profile = new FirefoxProfile(); profile.setPreference("permissions.default.stylesheet", 2); profile.setPreference("permissions.default.image", 2); - profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false"); + profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", + "false"); WebDriver driver = new FirefoxDriver(profile); return driver; }; }; public static WebDriver getDriverForPage(String url, Configuration conf) { - WebDriver driver = null; - DesiredCapabilities capabilities = null; - long pageLoadWait = conf.getLong("libselenium.page.load.delay", 3); + WebDriver driver = null; + DesiredCapabilities capabilities = null; + long pageLoadWait = conf.getLong("libselenium.page.load.delay", 3); - try { - String driverType = conf.get("selenium.driver", "firefox"); - switch (driverType) { - case "firefox": - driver = new FirefoxDriver(); - break; - case "chrome": - driver = new ChromeDriver(); - break; - case "safari": - driver = new SafariDriver(); - break; - case "opera": - driver = new OperaDriver(); - break; - case "remote": - String seleniumHubHost = conf.get("selenium.hub.host", "localhost"); - int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444")); - String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub"); - String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http"); - String seleniumGridDriver = conf.get("selenium.grid.driver","firefox"); - String seleniumGridBinary = conf.get("selenium.grid.binary"); - - switch (seleniumGridDriver){ - case "firefox": - capabilities = DesiredCapabilities.firefox(); - capabilities.setBrowserName("firefox"); - capabilities.setJavascriptEnabled(true); - capabilities.setCapability("firefox_binary",seleniumGridBinary); - driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities); - break; - default: - LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType); - driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox()); - break; - } - default: - LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType); - driver = new FirefoxDriver(); - break; + try { + String driverType = conf.get("selenium.driver", "firefox"); + switch (driverType) { + case "firefox": + driver = new FirefoxDriver(); + break; + case "chrome": + driver = new ChromeDriver(); + break; + case "safari": + driver = new SafariDriver(); + break; + case "opera": + driver = new OperaDriver(); + break; + case "remote": + String seleniumHubHost = conf.get("selenium.hub.host", "localhost"); + int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", + "4444")); + String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub"); + String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http"); + String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox"); + String seleniumGridBinary = conf.get("selenium.grid.binary"); + + switch (seleniumGridDriver) { + case "firefox": + capabilities = DesiredCapabilities.firefox(); + capabilities.setBrowserName("firefox"); + capabilities.setJavascriptEnabled(true); + capabilities.setCapability("firefox_binary", seleniumGridBinary); + driver = new RemoteWebDriver(new URL(seleniumHubProtocol, + seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities); + break; + default: + LOG.error( + "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", + driverType); + driver = new RemoteWebDriver(new URL(seleniumHubProtocol, + seleniumHubHost, seleniumHubPort, seleniumHubPath), + DesiredCapabilities.firefox()); + break; } - LOG.debug("Selenium {} WebDriver selected.", driverType); - - driver.get(url); - new WebDriverWait(driver, pageLoadWait); - } catch (Exception e) { - throw new RuntimeException(e); + default: + LOG.error( + "The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", + driverType); + driver = new FirefoxDriver(); + break; } + LOG.debug("Selenium {} WebDriver selected.", driverType); - return driver; + driver.get(url); + new WebDriverWait(driver, pageLoadWait); + } catch (Exception e) { + throw new RuntimeException(e); + } + + return driver; } public static String getHTMLContent(WebDriver driver, Configuration conf) { - if (conf.getBoolean("selenium.take.screenshot", false)) { - takeScreenshot(driver, conf); - } + if (conf.getBoolean("selenium.take.screenshot", false)) { + takeScreenshot(driver, conf); + } + + return driver.findElement(By.tagName("body")).getAttribute("innerHTML"); + } + + public static String getHTMLContent(String multiProcessedData, + WebDriver driver, Configuration conf) { + if (conf.getBoolean("selenium.take.screenshot", false)) { + takeScreenshot(driver, conf); + } + if (multiProcessedData == null) return driver.findElement(By.tagName("body")).getAttribute("innerHTML"); + else + return multiProcessedData; } public static void cleanUpDriver(WebDriver driver) { - if (driver != null) { - try { - driver.quit(); - } catch (Exception e) { - throw new RuntimeException(e); - } + if (driver != null) { + try { + driver.quit(); + } catch (Exception e) { + throw new RuntimeException(e); } + } } /** * Function for obtaining the HTML BODY using the selected - * {@link org.openqa.selenium.WebDriver}. - * There are a number of configuration properties within - * nutch-site.xml which determine whether to - * take screenshots of the rendered pages and persist them - * as timestamped .png's into HDFS. - * @param url the URL to fetch and render - * @param conf the {@link org.apache.hadoop.conf.Configuration} + * {@link org.openqa.selenium.WebDriver}. There are a number of configuration + * properties within nutch-site.xml which determine whether to + * take screenshots of the rendered pages and persist them as timestamped + * .png's into HDFS. + * + * @param url + * the URL to fetch and render + * @param conf + * the {@link org.apache.hadoop.conf.Configuration} * @return the rendered inner HTML page */ public static String getHtmlPage(String url, Configuration conf) { @@ -156,10 +179,12 @@ public static String getHtmlPage(String url, Configuration conf) { takeScreenshot(driver, conf); } - String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); + String innerHtml = driver.findElement(By.tagName("body")).getAttribute( + "innerHTML"); return innerHtml; - // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit + // I'm sure this catch statement is a code smell ; borrowing it from + // lib-htmlunit } catch (Exception e) { throw new RuntimeException(e); } finally { @@ -174,22 +199,29 @@ public static String getHtmlPage(String url) { private static void takeScreenshot(WebDriver driver, Configuration conf) { try { String url = driver.getCurrentUrl(); - File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); + File srcFile = ((TakesScreenshot) driver) + .getScreenshotAs(OutputType.FILE); LOG.debug("In-memory screenshot taken of: {}", url); FileSystem fs = FileSystem.get(conf); - Path screenshotPath = new Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName()); + Path screenshotPath = new Path(conf.get("selenium.screenshot.location") + + "/" + srcFile.getName()); if (screenshotPath != null) { OutputStream os = null; if (!fs.exists(screenshotPath)) { - LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName()); + LOG.debug( + "No existing screenshot already exists... creating new file at {} {}.", + screenshotPath, srcFile.getName()); os = fs.create(screenshotPath); } InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); IOUtils.copyBytes(is, os, conf); - LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); + LOG.debug("Screenshot for {} successfully saved to: {} {}", url, + screenshotPath, srcFile.getName()); } else { - LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " - + "'selenium.screenshot.location' is absent from nutch-site.xml.", url); + LOG.warn( + "Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " + + "'selenium.screenshot.location' is absent from nutch-site.xml.", + url); } } catch (Exception e) { throw new RuntimeException(e); diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java index 548153af4b..ce6f546b2c 100644 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java @@ -36,8 +36,8 @@ import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.http.api.HttpException; import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.protocol.interactiveselenium.handlers.InteractiveSeleniumHandler; import org.openqa.selenium.WebDriver; - import org.apache.nutch.protocol.selenium.HttpWebClient; /* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */ @@ -56,7 +56,8 @@ public class HttpResponse implements Response { /** The nutch configuration */ private Configuration conf = null; - public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException { + public HttpResponse(Http http, URL url, CrawlDatum datum) + throws ProtocolException, IOException { this.conf = http.getConf(); this.http = http; @@ -138,7 +139,8 @@ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolExcepti reqStr.append("\r\n"); if (datum.getModifiedTime() > 0) { - reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); + reqStr.append("If-Modified-Since: " + + HttpDateFormat.toString(datum.getModifiedTime())); reqStr.append("\r\n"); } reqStr.append("\r\n"); @@ -149,8 +151,8 @@ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolExcepti req.flush(); PushbackInputStream in = // process response - new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), - Http.BUFFER_SIZE); + new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), + Http.BUFFER_SIZE), Http.BUFFER_SIZE); StringBuffer line = new StringBuffer(); @@ -166,9 +168,10 @@ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolExcepti // Get Content type header String contentType = getHeader(Response.CONTENT_TYPE); - // handle with Selenium only if content type in HTML or XHTML + // handle with Selenium only if content type in HTML or XHTML if (contentType != null) { - if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { + if (contentType.contains("text/html") + || contentType.contains("application/xhtml")) { readPlainContent(url); } else { try { @@ -178,11 +181,13 @@ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolExcepti try { contentLength = Integer.parseInt(contentLengthString.trim()); } catch (NumberFormatException ex) { - throw new HttpException("bad content length: " + contentLengthString); + throw new HttpException("bad content length: " + + contentLengthString); } } - if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { + if (http.getMaxContent() >= 0 + && contentLength > http.getMaxContent()) { contentLength = http.getMaxContent(); } @@ -208,7 +213,7 @@ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolExcepti } } } - } + } } finally { if (socket != null) @@ -216,9 +221,10 @@ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolExcepti } } - /* ------------------------- * - * * - * ------------------------- */ + /* + * ------------------------- * * + * ------------------------- + */ public URL getUrl() { return url; @@ -240,53 +246,62 @@ public byte[] getContent() { return content; } - /* ------------------------- * - * * - * ------------------------- */ + /* + * ------------------------- * * + * ------------------------- + */ private void loadSeleniumHandlers() { - if (handlers != null) return; + if (handlers != null) + return; - String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler"); + String handlerConfig = this.conf.get("interactiveselenium.handlers", + "DefaultHandler"); String[] handlerNames = handlerConfig.split(","); handlers = new InteractiveSeleniumHandler[handlerNames.length]; for (int i = 0; i < handlerNames.length; i++) { - try { - String classToLoad = this.getClass().getPackage().getName() + "." + handlerNames[i]; - handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).newInstance()); - Http.LOG.info("Successfully loaded " + classToLoad); - } catch (ClassNotFoundException e) { - Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]); - } catch (InstantiationException e) { - Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]); - } catch (IllegalAccessException e) { - Http.LOG.info("Illegal access with Handler: " + handlerNames[i]); - } + try { + String classToLoad = this.getClass().getPackage().getName() + "." + + handlerNames[i]; + handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName( + classToLoad).newInstance()); + Http.LOG.info("Successfully loaded " + classToLoad); + } catch (ClassNotFoundException e) { + Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]); + } catch (InstantiationException e) { + Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]); + } catch (IllegalAccessException e) { + Http.LOG.info("Illegal access with Handler: " + handlerNames[i]); + } } } private void readPlainContent(URL url) throws IOException { if (handlers == null) - loadSeleniumHandlers(); + loadSeleniumHandlers(); String processedPage = ""; for (InteractiveSeleniumHandler handler : this.handlers) { - if (! handler.shouldProcessURL(url.toString())) { - continue; - } + if (!handler.shouldProcessURL(url.toString())) { + continue; + } + + WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf); + handler.processDriver(driver); - WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf); + String multiProcessedData = handler.multiProcessDriver(driver); - handler.processDriver(driver); - processedPage += HttpWebClient.getHTMLContent(driver, conf); + processedPage += HttpWebClient.getHTMLContent(multiProcessedData, driver, + conf); - HttpWebClient.cleanUpDriver(driver); + HttpWebClient.cleanUpDriver(driver); } content = processedPage.getBytes("UTF-8"); } - private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + private int parseStatusLine(PushbackInputStream in, StringBuffer line) + throws IOException, HttpException { readLine(in, line, false); int codeStart = line.indexOf(" "); @@ -301,13 +316,15 @@ private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IO try { code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); } catch (NumberFormatException e) { - throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e); + throw new HttpException("bad status line '" + line + "': " + + e.getMessage(), e); } return code; } - private void processHeaderLine(StringBuffer line) throws IOException, HttpException { + private void processHeaderLine(StringBuffer line) throws IOException, + HttpException { int colonIndex = line.indexOf(":"); // key is up to colon if (colonIndex == -1) { @@ -333,24 +350,26 @@ private void processHeaderLine(StringBuffer line) throws IOException, HttpExcept } // Adds headers to our headers Metadata - private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + private void parseHeaders(PushbackInputStream in, StringBuffer line) + throws IOException, HttpException { while (readLine(in, line, true) != 0) { // handle HTTP responses with missing blank line after headers int pos; - if (((pos = line.indexOf("