NUTCH-1678 Remove dependency on org.apache.oro

- replace oro with java.util.regex classes in OutlinkExtractor, plugins parse-js and index-more
- extend unit tests of parse-js
- correct pointer to Java regex syntax (instead of "Perl5")
apache · Oct 10, 2018 · b2d9058 · b2d9058
1 parent 69e9e92
commit b2d9058
Show file tree

Hide file tree

Showing 12 changed files with 169 additions and 178 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -1079,62 +1079,6 @@ http://www.python.org. Full license is here:
 
   http://www.python.org/download/releases/2.4.2/license/
 
-lib/jakarta-oro-2.0.8.jar
-
-/* ====================================================================
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 2000-2002 The Apache Software Foundation.  All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- *    if any, must include the following acknowledgment:
- *       "This product includes software developed by the
- *        Apache Software Foundation (http://www.apache.org/)."
- *    Alternately, this acknowledgment may appear in the software itself,
- *    if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
- *    must not be used to endorse or promote products derived from this
- *    software without prior written permission. For written
- *    permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache" 
- *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
- *    name, without prior written permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation.  For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
 lib/jetty-ext/commons-el.jar
 
 /*

diff --git a/conf/parse-plugins.xml b/conf/parse-plugins.xml
@@ -51,6 +51,10 @@
 		<plugin id="parse-zip" />
 	</mimeType>
 
+	<mimeType name="application/javascript">
+		<plugin id="parse-js" />
+	</mimeType>
+
 	<mimeType name="application/x-javascript">
 		<plugin id="parse-js" />
 	</mimeType>

diff --git a/conf/regex-normalize.xml.template b/conf/regex-normalize.xml.template
@@ -17,7 +17,8 @@
 -->
 <!-- This is the configuration file for the RegexUrlNormalize Class.
      This is intended so that users can specify substitutions to be
-     done on URLs. The regex engine that is used is Perl5 compatible.
+     done on URLs using the Java regex syntax, see
+     https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
      The rules are applied to URLs in the order they occur in this file.  -->
 
 <!-- WATCH OUT: an xml parser reads this file an ampersands must be

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
@@ -69,7 +69,6 @@
     <dependency org="xerces" name="xercesImpl" rev="2.9.1" />
     <dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
     <dependency org="xalan" name="serializer" rev="2.7.1" />
-    <dependency org="oro" name="oro" rev="2.0.8" />
 
     <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default" />
 
@@ -137,7 +136,7 @@
     <!-- Uncomment this to use MongoDB as Gora backend. -->
     <!--
     <dependency org="org.apache.gora" name="gora-mongodb" rev="0.8" conf="*->default" />
-    -->   
+    -->
     <!-- Uncomment this to use OrientDB as Gora backend. -->
     <!--
     <dependency org="org.apache.gora" name="gora-orientdb" rev="0.8" conf="*->default" />

diff --git a/src/java/org/apache/nutch/parse/OutlinkExtractor.java b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
@@ -21,19 +21,14 @@
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-
 /**
  * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
  * plain text using Regular Expressions.
@@ -60,7 +55,8 @@ public class OutlinkExtractor {
 
    *      </a>
    */
-  private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+  private static final Pattern URL_PATTERN = Pattern.compile(
+      "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)");
 
   /**
    * Extracts <code>Outlink</code> from given plain text. Applying this method
@@ -72,7 +68,8 @@ public class OutlinkExtractor {
    * 
    * @return Array of <code>Outlink</code>s within found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
+  public static Outlink[] getOutlinks(final String plainText,
+      Configuration conf) {
     return OutlinkExtractor.getOutlinks(plainText, "", conf);
   }
 
@@ -89,23 +86,20 @@ public static Outlink[] getOutlinks(final String plainText, Configuration conf)
    */
   public static Outlink[] getOutlinks(final String plainText, String anchor,
       Configuration conf) {
-    long start = System.currentTimeMillis();
-    final List<Outlink> outlinks = new ArrayList<Outlink>();
 
-    try {
-      final PatternCompiler cp = new Perl5Compiler();
-      final Pattern pattern = cp.compile(URL_PATTERN,
-          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
-      final PatternMatcher matcher = new Perl5Matcher();
+    if (plainText == null) {
+      return new Outlink[0];
+    }
 
-      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+    long start = System.currentTimeMillis();
+    final List<Outlink> outlinks = new ArrayList<>();
 
-      MatchResult result;
+    try {
+      Matcher matcher = URL_PATTERN.matcher(plainText);
       String url;
 
-      // loop the matches
-      while (matcher.contains(input, pattern)) {
+      // loop over all URL matches found in the plain text
+      while (matcher.find()) {
         // if this is taking too long, stop matching
         // (SHOULD really check cpu time used so that heavily loaded systems
         // do not unnecessarily hit this limit.)
@@ -115,8 +109,9 @@ public static Outlink[] getOutlinks(final String plainText, String anchor,
           }
           break;
         }
-        result = matcher.getMatch();
-        url = result.group(0);
+
+        url = matcher.group().trim();
+
         try {
           outlinks.add(new Outlink(url, anchor));
         } catch (MalformedURLException mue) {

diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -5,6 +5,9 @@
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 
 import org.apache.avro.util.Utf8;
 import org.apache.commons.lang.time.DateUtils;
@@ -17,12 +20,6 @@
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.MimeUtil;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
 import org.apache.solr.common.util.DateUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -224,18 +221,16 @@ static String[] getParts(String mimeType) {
   // Patterns used to extract filename from possible non-standard
   // HTTP header "Content-Disposition". Typically it looks like:
   // Content-Disposition: inline; filename="foo.ppt"
-  private PatternMatcher matcher = new Perl5Matcher();
-
   private Configuration conf;
-  static Perl5Pattern patterns[] = { null, null };
+
+  static Pattern patterns[] = { null, null };
+
   static {
-    Perl5Compiler compiler = new Perl5Compiler();
     try {
       // order here is important
-      patterns[0] = (Perl5Pattern) compiler
-          .compile("\\bfilename=['\"](.+)['\"]");
-      patterns[1] = (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
-    } catch (MalformedPatternException e) {
+      patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
+    } catch (PatternSyntaxException e) {
       // just ignore
     }
   }
@@ -246,12 +241,10 @@ private NutchDocument resetTitle(NutchDocument doc, WebPage page, String url) {
     if (contentDisposition == null)
       return doc;
 
-    MatchResult result;
     for (int i = 0; i < patterns.length; i++) {
-      if (matcher.contains(contentDisposition.toString(), patterns[i])) {
-        result = matcher.getMatch();
-        doc.removeField("title");
-        doc.add("title", result.group(1));
+      Matcher matcher = patterns[i].matcher(contentDisposition);
+      if (matcher.find()) {
+        doc.add("title", matcher.group(1));
         break;
       }
     }

diff --git a/src/plugin/parse-js/plugin.xml b/src/plugin/parse-js/plugin.xml
@@ -36,7 +36,7 @@
               point="org.apache.nutch.parse.Parser">
       <implementation id="JSParser"
          class="org.apache.nutch.parse.js.JSParseFilter">
-        <parameter name="contentType" value="application/x-javascript"/>
+        <parameter name="contentType" value="application/x-javascript|application/javascript"/>
         <parameter name="pathSuffix"  value="js"/>
       </implementation>
    </extension>
@@ -45,7 +45,7 @@
               point="org.apache.nutch.parse.ParseFilter">
       <implementation id="JSParseFilter"
          class="org.apache.nutch.parse.js.JSParseFilter">
-        <parameter name="contentType" value="application/x-javascript"/>
+        <parameter name="contentType" value="application/x-javascript|application/javascript"/>
         <parameter name="pathSuffix"  value=""/>
       </implementation>
    </extension>

diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js
@@ -0,0 +1,24 @@
+// test data for link extraction from "pure" JavaScript
+
+function selectProvider(form) {
+    provider = form.elements['searchProvider'].value;
+    if (provider == "any") {
+        if (Math.random() > 0.5) {
+            provider = "lucid";
+        } else {
+            provider = "sl";
+        }
+    }
+
+    if (provider == "lucid") {
+        form.action = "http://search.lucidimagination.com/p:nutch";
+    } else if (provider == "sl") {
+        form.action = "http://search-lucene.com/nutch";
+    }
+
+    days = 90; // cookie will be valid for 90 days
+    date = new Date();
+    date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
+    expires = "; expires=" + date.toGMTString();
+    document.cookie = "searchProvider=" + provider + expires + "; path=/";
+}