Skip to content

Commit

Permalink
NUTCH-1678 Remove dependency on org.apache.oro
Browse files Browse the repository at this point in the history
- replace oro with java.util.regex classes in OutlinkExtractor, plugins parse-js and index-more
- extend unit tests of parse-js
- correct pointer to Java regex syntax (instead of "Perl5")
  • Loading branch information
sebastian-nagel committed Oct 10, 2018
1 parent 69e9e92 commit b2d9058
Show file tree
Hide file tree
Showing 12 changed files with 169 additions and 178 deletions.
56 changes: 0 additions & 56 deletions LICENSE.txt
Expand Up @@ -1079,62 +1079,6 @@ http://www.python.org. Full license is here:

http://www.python.org/download/releases/2.4.2/license/

lib/jakarta-oro-2.0.8.jar

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2000-2002 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
* must not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
* name, without prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

lib/jetty-ext/commons-el.jar

/*
Expand Down
4 changes: 4 additions & 0 deletions conf/parse-plugins.xml
Expand Up @@ -51,6 +51,10 @@
<plugin id="parse-zip" />
</mimeType>

<mimeType name="application/javascript">
<plugin id="parse-js" />
</mimeType>

<mimeType name="application/x-javascript">
<plugin id="parse-js" />
</mimeType>
Expand Down
3 changes: 2 additions & 1 deletion conf/regex-normalize.xml.template
Expand Up @@ -17,7 +17,8 @@
-->
<!-- This is the configuration file for the RegexUrlNormalize Class.
This is intended so that users can specify substitutions to be
done on URLs. The regex engine that is used is Perl5 compatible.
done on URLs using the Java regex syntax, see
https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
The rules are applied to URLs in the order they occur in this file. -->

<!-- WATCH OUT: an XML parser reads this file and ampersands must be
Expand Down
3 changes: 1 addition & 2 deletions ivy/ivy.xml
Expand Up @@ -69,7 +69,6 @@
<dependency org="xerces" name="xercesImpl" rev="2.9.1" />
<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
<dependency org="xalan" name="serializer" rev="2.7.1" />
<dependency org="oro" name="oro" rev="2.0.8" />

<dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default" />

Expand Down Expand Up @@ -137,7 +136,7 @@
<!-- Uncomment this to use MongoDB as Gora backend. -->
<!--
<dependency org="org.apache.gora" name="gora-mongodb" rev="0.8" conf="*->default" />
-->
-->
<!-- Uncomment this to use OrientDB as Gora backend. -->
<!--
<dependency org="org.apache.gora" name="gora-orientdb" rev="0.8" conf="*->default" />
Expand Down
45 changes: 20 additions & 25 deletions src/java/org/apache/nutch/parse/OutlinkExtractor.java
Expand Up @@ -21,19 +21,14 @@
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.PatternMatcherInput;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;

/**
* Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
* plain text using Regular Expressions.
Expand All @@ -60,7 +55,8 @@ public class OutlinkExtractor {
* </a>
*/
private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
private static final Pattern URL_PATTERN = Pattern.compile(
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)");

/**
* Extracts <code>Outlink</code> from given plain text. Applying this method
Expand All @@ -72,7 +68,8 @@ public class OutlinkExtractor {
*
* @return Array of <code>Outlink</code>s found within plainText
*/
public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
public static Outlink[] getOutlinks(final String plainText,
Configuration conf) {
return OutlinkExtractor.getOutlinks(plainText, "", conf);
}

Expand All @@ -89,23 +86,20 @@ public static Outlink[] getOutlinks(final String plainText, Configuration conf)
*/
public static Outlink[] getOutlinks(final String plainText, String anchor,
Configuration conf) {
long start = System.currentTimeMillis();
final List<Outlink> outlinks = new ArrayList<Outlink>();

try {
final PatternCompiler cp = new Perl5Compiler();
final Pattern pattern = cp.compile(URL_PATTERN,
Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
| Perl5Compiler.MULTILINE_MASK);
final PatternMatcher matcher = new Perl5Matcher();
if (plainText == null) {
return new Outlink[0];
}

final PatternMatcherInput input = new PatternMatcherInput(plainText);
long start = System.currentTimeMillis();
final List<Outlink> outlinks = new ArrayList<>();

MatchResult result;
try {
Matcher matcher = URL_PATTERN.matcher(plainText);
String url;

// loop the matches
while (matcher.contains(input, pattern)) {
// Check for stuff!
while (matcher.find()) {
// if this is taking too long, stop matching
// (SHOULD really check cpu time used so that heavily loaded systems
// do not unnecessarily hit this limit.)
Expand All @@ -115,8 +109,9 @@ public static Outlink[] getOutlinks(final String plainText, String anchor,
}
break;
}
result = matcher.getMatch();
url = result.group(0);

url = matcher.group().trim();

try {
outlinks.add(new Outlink(url, anchor));
} catch (MalformedURLException mue) {
Expand Down
Expand Up @@ -5,6 +5,9 @@
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.avro.util.Utf8;
import org.apache.commons.lang.time.DateUtils;
Expand All @@ -17,12 +20,6 @@
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.MimeUtil;
import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.apache.oro.text.regex.Perl5Pattern;
import org.apache.solr.common.util.DateUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -224,18 +221,16 @@ static String[] getParts(String mimeType) {
// Patterns used to extract filename from possible non-standard
// HTTP header "Content-Disposition". Typically it looks like:
// Content-Disposition: inline; filename="foo.ppt"
private PatternMatcher matcher = new Perl5Matcher();

private Configuration conf;
static Perl5Pattern patterns[] = { null, null };

static Pattern patterns[] = { null, null };

static {
Perl5Compiler compiler = new Perl5Compiler();
try {
// order here is important
patterns[0] = (Perl5Pattern) compiler
.compile("\\bfilename=['\"](.+)['\"]");
patterns[1] = (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
} catch (MalformedPatternException e) {
patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
} catch (PatternSyntaxException e) {
// just ignore
}
}
Expand All @@ -246,12 +241,10 @@ private NutchDocument resetTitle(NutchDocument doc, WebPage page, String url) {
if (contentDisposition == null)
return doc;

MatchResult result;
for (int i = 0; i < patterns.length; i++) {
if (matcher.contains(contentDisposition.toString(), patterns[i])) {
result = matcher.getMatch();
doc.removeField("title");
doc.add("title", result.group(1));
Matcher matcher = patterns[i].matcher(contentDisposition);
if (matcher.find()) {
doc.add("title", matcher.group(1));
break;
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/plugin/parse-js/plugin.xml
Expand Up @@ -36,7 +36,7 @@
point="org.apache.nutch.parse.Parser">
<implementation id="JSParser"
class="org.apache.nutch.parse.js.JSParseFilter">
<parameter name="contentType" value="application/x-javascript"/>
<parameter name="contentType" value="application/x-javascript|application/javascript"/>
<parameter name="pathSuffix" value="js"/>
</implementation>
</extension>
Expand All @@ -45,7 +45,7 @@
point="org.apache.nutch.parse.ParseFilter">
<implementation id="JSParseFilter"
class="org.apache.nutch.parse.js.JSParseFilter">
<parameter name="contentType" value="application/x-javascript"/>
<parameter name="contentType" value="application/x-javascript|application/javascript"/>
<parameter name="pathSuffix" value=""/>
</implementation>
</extension>
Expand Down
24 changes: 24 additions & 0 deletions src/plugin/parse-js/sample/parse_pure_js_test.js
@@ -0,0 +1,24 @@
// test data for link extraction from "pure" JavaScript

// Chooses a search provider for the given form, points the form action at
// the matching search URL and stores the choice in a cookie.
//
// form: object exposing elements (indexed by the field name searchProvider,
//       whose value is read) and action (which is written).
//
// NOTE: provider, days, date and expires are assigned without a declaration,
// so they are not local to this function.
function selectProvider(form) {
provider = form.elements['searchProvider'].value;
// the value any means: pick one of the two known providers at random
if (provider == "any") {
if (Math.random() > 0.5) {
provider = "lucid";
} else {
provider = "sl";
}
}

// set the form target for the chosen provider; for any other value the
// form action is left untouched
if (provider == "lucid") {
form.action = "http://search.lucidimagination.com/p:nutch";
} else if (provider == "sl") {
form.action = "http://search-lucene.com/nutch";
}

// store the choice in a cookie with an expiry date computed from days
days = 90; // cookie will be valid for 90 days
date = new Date();
// days * 24 * 60 * 60 * 1000 converts the lifetime from days to milliseconds
date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
expires = "; expires=" + date.toGMTString();
document.cookie = "searchProvider=" + provider + expires + "; path=/";
}

0 comments on commit b2d9058

Please sign in to comment.