Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NUTCH-3036 Upgrade org.seleniumhq.selenium:selenium-java dependency i… #807

Merged
merged 2 commits into from Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -68,6 +68,7 @@ On the "Import Project" screen select the "Import project from external model" r
Click "Create". On the next screen the "Eclipse projects directory" should be already set to the nutch folder.
Leave the "Create module files near .classpath files" radio button selected.
Click "Next" on the next screens. On the project SDK screen select Java 11 and click "Create".
**N.B.** For anyone on a Mac with a homebrew-installed openjdk, you need to use the directory under _libexec_: `<openjdk11_directory>/libexec/openjdk.jdk/Contents/Home`.

Once the project is imported, you will see a popup saying "Ant build scripts found", "Frameworks detected - IvyIDEA Framework detected". Click "Import".
If you don't get the pop-up, I'd suggest going through the steps again as this happens from time to time. There is another
Expand Down
Expand Up @@ -16,31 +16,21 @@
*/
package org.apache.nutch.protocol.htmlunit;

import java.lang.invoke.MethodHandles;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.concurrent.TimeUnit;

import com.gargoylesoftware.htmlunit.WebClient;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.*;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.io.TemporaryFilesystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gargoylesoftware.htmlunit.WebClient;
import java.io.*;
import java.lang.invoke.MethodHandles;
import java.time.Duration;
import java.time.temporal.ChronoUnit;

public class HtmlUnitWebDriver extends HtmlUnitDriver {

Expand Down Expand Up @@ -75,14 +65,15 @@ public static WebDriver getDriverForPage(String url, Configuration conf) {
enableCss = conf.getBoolean("htmlunit.enable.css", false);
javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
enableRedirect = redirects <= 0 ? false : true;
enableRedirect = redirects > 0;
maxRedirects = redirects;

WebDriver driver = null;

try {
driver = new HtmlUnitWebDriver();
driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
driver.manage().timeouts().pageLoadTimeout(Duration.of(pageLoadTimout,
ChronoUnit.SECONDS));
driver.get(url);
} catch(Exception e) {
if(e instanceof TimeoutException) {
Expand Down
Expand Up @@ -301,7 +301,7 @@ public void setConf(Configuration conf) {
if (parts.length == 2) {
this.hostCookies.put(parts[0], parts[1]);
} else {
LOG.warn("Unable to parse cookie file correctly at: " + word);
LOG.warn("Unable to parse cookie file correctly at: {}", word);
}
}
}
Expand Down Expand Up @@ -332,8 +332,8 @@ public void setConf(Configuration conf) {
ciphers = ((SSLSocketFactory) SSLSocketFactory.getDefault()).getDefaultCipherSuites();
}

this.tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
this.tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
this.tlsPreferredProtocols = new HashSet<>(Arrays.asList(protocols));
this.tlsPreferredCipherSuites = new HashSet<>(Arrays.asList(ciphers));

logConf();
}
Expand Down Expand Up @@ -402,7 +402,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
} else if (code == 400) { // bad request, mark as GONE
if (this.logger.isTraceEnabled()) {
this.logger.trace("400 Bad request: " + u);
this.logger.trace("400 Bad request: {}", u);
}
return new ProtocolOutput(c,
new ProtocolStatus(ProtocolStatus.GONE, u));
Expand Down Expand Up @@ -435,11 +435,6 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
}
}

/*
* -------------------------- * </implementation:Protocol> *
* --------------------------
*/

public String getProxyHost() {
return this.proxyHost;
}
Expand Down Expand Up @@ -569,37 +564,35 @@ public Set<String> getTlsPreferredProtocols() {
private static String getAgentString(String agentName, String agentVersion,
String agentDesc, String agentURL, String agentEmail) {

if ((agentName == null) || (agentName.trim().length() == 0)) {
if (LOG.isErrorEnabled()) {
if (((agentName == null) || (agentName.trim().isEmpty())) && LOG.isErrorEnabled()) {
LOG.error("No User-Agent string set (http.agent.name)!");
}
}

StringBuffer buf = new StringBuffer();
StringBuilder buf = new StringBuilder();

buf.append(agentName);
if (agentVersion != null && !agentVersion.trim().isEmpty()) {
buf.append("/");
buf.append(agentVersion);
}
if (((agentDesc != null) && (agentDesc.length() != 0))
|| ((agentEmail != null) && (agentEmail.length() != 0))
|| ((agentURL != null) && (agentURL.length() != 0))) {
if (((agentDesc != null) && (!agentDesc.isEmpty()))
|| ((agentEmail != null) && (!agentEmail.isEmpty()))
|| ((agentURL != null) && (!agentURL.isEmpty()))) {
buf.append(" (");

if ((agentDesc != null) && (agentDesc.length() != 0)) {
if ((agentDesc != null) && (!agentDesc.isEmpty())) {
buf.append(agentDesc);
if ((agentURL != null) || (agentEmail != null))
buf.append("; ");
}

if ((agentURL != null) && (agentURL.length() != 0)) {
if ((agentURL != null) && (!agentURL.isEmpty())) {
buf.append(agentURL);
if (agentEmail != null)
buf.append("; ");
}

if ((agentEmail != null) && (agentEmail.length() != 0))
if ((agentEmail != null) && (!agentEmail.isEmpty()))
buf.append(agentEmail);

buf.append(")");
Expand All @@ -609,15 +602,15 @@ private static String getAgentString(String agentName, String agentVersion,

protected void logConf() {
if (this.logger.isInfoEnabled()) {
this.logger.info("http.proxy.host = " + this.proxyHost);
this.logger.info("http.proxy.port = " + this.proxyPort);
this.logger.info("http.proxy.exception.list = " + this.useProxy);
this.logger.info("http.timeout = " + this.timeout);
this.logger.info("http.content.limit = " + this.maxContent);
this.logger.info("http.agent = " + this.userAgent);
this.logger.info("http.accept.language = " + this.acceptLanguage);
this.logger.info("http.accept = " + this.accept);
this.logger.info("http.enable.cookie.header = " + isCookieEnabled());
this.logger.info("http.proxy.host = {}", this.proxyHost);
this.logger.info("http.proxy.port = {}", this.proxyPort);
this.logger.info("http.proxy.exception.list = {}", this.useProxy);
this.logger.info("http.timeout = {}", this.timeout);
this.logger.info("http.content.limit = {}", this.maxContent);
this.logger.info("http.agent = {}", this.userAgent);
this.logger.info("http.accept.language = {}", this.acceptLanguage);
this.logger.info("http.accept = {}", this.accept);
this.logger.info("http.enable.cookie.header = {}", isCookieEnabled());
}
}

Expand All @@ -644,9 +637,8 @@ public byte[] processGzipEncoded(byte[] compressed, URL url)
throw new IOException("unzipBestEffort returned null");

if (LOG.isTraceEnabled()) {
LOG.trace("fetched " + compressed.length
+ " bytes of compressed content (expanded to " + content.length
+ " bytes) from " + url);
LOG.trace("fetched {} bytes of compressed content (expanded to {} " +
"bytes) from {}", compressed.length, content.length, url);
}
return content;
}
Expand Down Expand Up @@ -674,9 +666,8 @@ public byte[] processDeflateEncoded(byte[] compressed, URL url)
throw new IOException("inflateBestEffort returned null");

if (LOG.isTraceEnabled()) {
LOG.trace("fetched " + compressed.length
+ " bytes of compressed content (expanded to " + content.length
+ " bytes) from " + url);
LOG.trace("fetched {} bytes of compressed content (expanded to {} " +
"bytes) from {}", compressed.length, content.length, url);
}
return content;
}
Expand Down Expand Up @@ -736,11 +727,11 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
*/
private static HashMap<String, String> arrayToMap(String[] input) {
if (input == null || input.length == 0) {
return new HashMap<String, String>();
return new HashMap<>();
}
HashMap<String, String> hm = new HashMap<>();
for (int i = 0; i < input.length; i++) {
if (!"".equals(input[i].trim())) {
if (!input[i].trim().isEmpty()) {
hm.put(input[i], input[i]);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/plugin/lib-selenium/README.md
Expand Up @@ -23,7 +23,7 @@


You can run Nutch in Docker. Check some examples at https://github.com/sbatururimi/nutch-test.
Don't forget to update Dockefile to point to the original Nutch repository when updated.
Don't forget to update Dockerfile to point to the original Nutch repository when updated.

# Contributors
Stas Batururimi [s.batururimi@gmail.com]
Expand Down
34 changes: 26 additions & 8 deletions src/plugin/lib-selenium/howto_upgrade_selenium.md
Expand Up @@ -15,18 +15,36 @@
limitations under the License.
-->

1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
1. Upgrade various driver versions dependency in `src/plugin/lib-selenium/ivy.xml`

2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
2. Upgrade Selenium's own dependencies in `src/plugin/lib-selenium/plugin.xml`

To get a list of dependencies and their versions execute:
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
```
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
```
Note that all dependent libraries are exported for a "library" plugin `lib-selenium`.

Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows

N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows

$ brew install gnu-sed --with-default-names
`$ brew install gnu-sed --with-default-names`

You can then restart your terminal and the Regex + Sed command should work just fine!

3. In the `src/plugin/lib-selenium/plugin.xml` replace all lines between
`<!-- Begin dependencies -->`
and
`<!-- End of dependencies -->`
with the output of the command above.

4. Remove the locally "installed" dependencies in `src/plugin/lib-selenium/lib/`:

`$ rm -rf lib/`

5. Build Nutch and run all unit tests:

```
$ cd ../../../
$ ant clean runtime test
```
2 changes: 1 addition & 1 deletion src/plugin/lib-selenium/ivy.xml
Expand Up @@ -38,7 +38,7 @@

<dependencies>
<!-- begin selenium dependencies -->
<dependency org="org.seleniumhq.selenium" name="selenium-java" rev="4.7.2" />
<dependency org="org.seleniumhq.selenium" name="selenium-java" rev="4.18.1" />
<!-- end selenium dependencies -->
</dependencies>

Expand Down