NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved chars

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1713615 13f79535-47bb-0310-9956-ffa450edef68
apache · Nov 10, 2015 · ed76030 · ed76030
1 parent 28d9fda
commit ed76030
Show file tree

Hide file tree

Showing 3 changed files with 192 additions and 0 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved chars (markus, snagel)
+
 * NUTCH-2159 Ensure that all WebApp files are copied into generated artifacts for 1.X Webapp (lewismc)
 
 * NUTCH-2154 Nutch REST API (DB) suffering NullPointerException (Aron Ahmadia, Sujen Shah via mattmann)

diff --git a/...ormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/...ormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -23,6 +23,9 @@
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Locale;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configured;
@@ -37,6 +40,9 @@
  * <ul>
  * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
  * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
+ * <li>normalize <a href=
+ * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
+ * percent-encoding</a> in URL paths</li>
  * </ul>
  */
 public class BasicURLNormalizer extends Configured implements URLNormalizer {
@@ -50,8 +56,41 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
   private final static Pattern hasNormalizablePathPattern = Pattern
       .compile("/[./]|[.]/");
 
+  /**
+   * NUTCH-1098: matches percent-encoded octets (<code>%XX</code>) in a URL
+   */
+  private final static Pattern unescapeRulePattern = Pattern
+      .compile("%([0-9A-Fa-f]{2})");
+
+  // charset used for encoding URLs before escaping
+  private final static Charset utf8 = Charset.forName("UTF-8");
+
+  /**
+   * Look-up table, indexed by ASCII code (0-127), for characters which should
+   * not be escaped in URL paths: <code>true</code> marks an unreserved
+   * character (letter, digit, hyphen, period, underscore or tilde).
+   */
+  private final static boolean[] unescapedCharacters = new boolean[128];
+  static {
+    /* https://tools.ietf.org/html/rfc3986#section-2.3
+     * For consistency, percent-encoded octets in the ranges of ALPHA
+     * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
+     * underscore (%5F), or tilde (%7E) should not be created by URI
+     * producers and, when found in a URI, should be decoded to their
+     * corresponding unreserved characters by URI normalizers.
+     */
+    // array elements default to false, only unreserved characters are marked
+    for (char c = 'A'; c <= 'Z'; c++) {
+      unescapedCharacters[c] = true;
+    }
+    for (char c = 'a'; c <= 'z'; c++) {
+      unescapedCharacters[c] = true;
+    }
+    for (char c = '0'; c <= '9'; c++) {
+      unescapedCharacters[c] = true;
+    }
+    for (char c : new char[] { '-', '.', '_', '~' }) {
+      unescapedCharacters[c] = true;
+    }
+  }
+
   public String normalize(String urlString, String scope)
       throws MalformedURLException {
+
     if ("".equals(urlString)) // permit empty
       return urlString;
 
@@ -100,7 +139,14 @@ public String normalize(String urlString, String scope)
         changed = true;
         file = file2;
       }
+    }
 
+    // properly encode characters in path/file using percent-encoding
+    String file2 = unescapePath(file);
+    file2 = escapePath(file2);
+    if (!file.equals(file2)) {
+      changed = true;
+      file = file2;
     }
 
     if (changed)
@@ -141,6 +187,84 @@ private String getFileWithNormalizedPath(URL url)
 
     return file;
   }
+
+  /**
+   * Remove % encoding from path segment in URL for characters which should be
+   * unescaped according to <a
+   * href="https://tools.ietf.org/html/rfc3986#section-2.3">RFC3986</a>.
+   * Percent-encoded octets which must stay encoded are kept but their
+   * hexadecimal digits are converted to upper case.
+   *
+   * @param path
+   *          path (file part) of the URL, possibly containing
+   *          <code>%XX</code> sequences
+   * @return the path with unreserved characters decoded and all remaining
+   *         escape sequences in upper case
+   */
+  private String unescapePath(String path) {
+    Matcher matcher = unescapeRulePattern.matcher(path);
+
+    if (!matcher.find()) {
+      // nothing is percent-encoded, the path needs no change
+      return path;
+    }
+
+    StringBuilder sb = new StringBuilder(path.length());
+
+    // index of the first character not yet copied to the result
+    int copied = 0;
+
+    // Traverse over all encoded groups
+    do {
+      // Append everything between the previous group and this one
+      sb.append(path, copied, matcher.start());
+
+      // Integer value of the two hexadecimal digits following '%'
+      int octet = Integer.parseInt(matcher.group(1), 16);
+
+      if (octet < 128 && unescapedCharacters[octet]) {
+        // character should be unescaped in URLs
+        sb.append((char) octet);
+      } else {
+        // keep the escape sequence but normalize it to upper case
+        sb.append(matcher.group().toUpperCase(Locale.ROOT));
+      }
+
+      copied = matcher.end();
+    } while (matcher.find());
+
+    // Append the rest if there's anything
+    sb.append(path, copied, path.length());
+
+    return sb.toString();
+  }
+
+  /**
+   * Convert path segment of URL from Unicode to UTF-8 and percent-encode the
+   * bytes which must not appear literally in a URL path, see <a
+   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
+   * Escaped are: every byte of a non-ASCII character, control characters
+   * (0x00-0x1F and 0x7F), space, and the brackets <code>[</code> and
+   * <code>]</code>. All other ASCII characters, including <code>%</code>, are
+   * copied unchanged. Escape sequences always have two upper-case hexadecimal
+   * digits.
+   *
+   * @param path
+   *          path (file part) of the URL
+   * @return the path with the bytes listed above percent-encoded
+   */
+  private String escapePath(String path) {
+    final String hexDigits = "0123456789ABCDEF";
+    StringBuilder sb = new StringBuilder(path.length());
+
+    // Traverse over all bytes in this URL
+    for (byte b : path.getBytes(utf8)) {
+      // bytes are signed: a negative value is part of a multi-byte (non-ASCII)
+      // UTF-8 sequence
+      boolean nonAscii = b < 0;
+      // control characters, space and DEL
+      boolean controlOrSpace = !nonAscii && (b <= 0x20 || b == 0x7F);
+      boolean bracket = b == '[' || b == ']';
+
+      if (nonAscii || controlOrSpace || bracket) {
+        // write escape sequence: '%' followed by exactly two hex digits
+        int octet = b & 0xFF;
+        sb.append('%');
+        sb.append(hexDigits.charAt(octet >>> 4));
+        sb.append(hexDigits.charAt(octet & 0x0F));
+      } else {
+        // No, just append this character as-is
+        sb.append((char) b);
+      }
+    }
+
+    return sb.toString();
+  }
 
   public static void main(String args[]) throws IOException {
     BasicURLNormalizer normalizer = new BasicURLNormalizer();

diff --git a/...lizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/...lizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -34,6 +34,72 @@ public TestBasicURLNormalizer() {
     conf = NutchConfiguration.create();
     normalizer.setConf(conf);
   }
+
+  @Test
+  public void testNUTCH1098() throws Exception {
+    // each row: { URL to normalize, expected normalized URL }
+    String[][] cases = {
+        // % encoding of an unreserved character is removed
+        { "http://foo.com/%66oo.html", "http://foo.com/foo.html" },
+        // ... also at the very end of the URL
+        { "http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html" },
+        { "http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html" },
+        // adjacent escape sequences are decoded independently
+        { "http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html" },
+        // high bit characters stay encoded
+        { "http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0" },
+        // control characters stay encoded
+        { "http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A" },
+        // hex digits of kept escape sequences become upper case
+        { "http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0" },
+        // encoded spaces stay encoded
+        { "http://foo.com/you%20too.html", "http://foo.com/you%20too.html" },
+        // literal spaces are encoded into %20
+        { "http://foo.com/you too.html", "http://foo.com/you%20too.html" },
+        // encoded # is not decoded
+        { "http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz" },
+        // encoded / is not decoded
+        { "http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz" },
+        // literal control characters are encoded
+        { "http://foo.com/\u001a!", "http://foo.com/%1A!" },
+        // ... and always into 2 digits
+        { "http://foo.com/\u0001!", "http://foo.com/%01!" },
+        // spanish characters are encoded as UTF-8
+        { "http://mydomain.com/en Espa\u00F1ol.aspx",
+            "http://mydomain.com/en%20Espa%C3%B1ol.aspx" } };
+
+    for (String[] testCase : cases) {
+      normalizeTest(testCase[0], testCase[1]);
+    }
+  }
+
+  @Test
+  public void testNUTCH2064() throws Exception {
+    // each row: { URL to normalize, expected normalized URL }
+    String[][] cases = {
+        // ampersand, colon and other punctuation must stay escaped
+        { "http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10" },
+        { "http://x.com/show?http%3A%2F%2Fx.com%2Fb",
+            "http://x.com/show?http%3A%2F%2Fx.com%2Fb" },
+        { "http://google.com/search?q=c%2B%2B",
+            "http://google.com/search?q=c%2B%2B" },
+        // the query part is application/x-www-form-urlencoded: keep "+"
+        { "http://x.com/s?q=a+b", "http://x.com/s?q=a+b" },
+        // Internationalized domain names are kept:
+        // http://bücher.de/ may be http://xn--bcher-kva.de/
+        // but definitely not http://b%C3%BCcher.de/
+        { "http://b\u00fccher.de/", "http://b\u00fccher.de/" },
+        // percent-encoding combined with removal of dot segments
+        { "http://x.com/./a/../%66.html", "http://x.com/f.html" },
+        // [ and ] need escaping as well
+        { "http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1" },
+        // boundary: first character outside the ASCII range (U+0080)
+        { "http://x.com/foo\u0080", "http://x.com/foo%C2%80" },
+        { "http://x.com/foo%c2%80", "http://x.com/foo%C2%80" } };
+
+    for (String[] testCase : cases) {
+      normalizeTest(testCase[0], testCase[1]);
+    }
+  }
 
   @Test
   public void testNormalizer() throws Exception {