Skip to content

Commit

Permalink
NUTCH-2064 URLNormalizer basic to encode reserved chars and decode no…
Browse files Browse the repository at this point in the history
…n-reserved chars

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1713615 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jnioche committed Nov 10, 2015
1 parent 28d9fda commit ed76030
Show file tree
Hide file tree
Showing 3 changed files with 192 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ Nutch Change Log
Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11

* NUTCH-2064 URLNormalizer basic to encode reserved chars and decode non-reserved chars (markus, snagel)

* NUTCH-2159 Ensure that all WebApp files are copied into generated artifacts for 1.X Webapp (lewismc)

* NUTCH-2154 Nutch REST API (DB) suffering NullPointerException (Aron Ahmadia, Sujen Shah via mattmann)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configured;
Expand All @@ -37,6 +40,9 @@
* <ul>
* <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
* <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
* <li>normalize <a href=
* "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
* percent-encoding</a> in URL paths</li>
* </ul>
*/
public class BasicURLNormalizer extends Configured implements URLNormalizer {
Expand All @@ -50,8 +56,41 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
private final static Pattern hasNormalizablePathPattern = Pattern
.compile("/[./]|[.]/");

/**
* NUTCH-1098: matches a single percent-encoded octet, i.e. a <code>%</code>
* followed by exactly two hexadecimal digits in either case. The two digits
* are captured as group 1.
*/
private final static Pattern unescapeRulePattern = Pattern
.compile("%([0-9A-Fa-f]{2})");

// charset used to convert URL strings into bytes before percent-encoding them
private final static Charset utf8 = Charset.forName("UTF-8");

/**
 * Look-up table indexed by ASCII code (0-127): <code>true</code> marks the
 * characters which must not be percent-encoded in URL paths.
 */
private final static boolean[] unescapedCharacters = new boolean[128];
static {
  /*
   * https://tools.ietf.org/html/rfc3986#section-2.3
   * Unreserved characters are ALPHA, DIGIT, hyphen, period, underscore and
   * tilde. Percent-encoded octets of these characters should not be created
   * by URI producers and should be decoded by URI normalizers. Every other
   * entry keeps the array default (false).
   */
  for (char upper = 'A'; upper <= 'Z'; upper++) {
    unescapedCharacters[upper] = true;
  }
  for (char lower = 'a'; lower <= 'z'; lower++) {
    unescapedCharacters[lower] = true;
  }
  for (char digit = '0'; digit <= '9'; digit++) {
    unescapedCharacters[digit] = true;
  }
  for (char mark : new char[] { '-', '.', '_', '~' }) {
    unescapedCharacters[mark] = true;
  }
}

public String normalize(String urlString, String scope)
throws MalformedURLException {

if ("".equals(urlString)) // permit empty
return urlString;

Expand Down Expand Up @@ -100,7 +139,14 @@ public String normalize(String urlString, String scope)
changed = true;
file = file2;
}
}

// properly encode characters in path/file using percent-encoding
String file2 = unescapePath(file);
file2 = escapePath(file2);
if (!file.equals(file2)) {
changed = true;
file = file2;
}

if (changed)
Expand Down Expand Up @@ -141,6 +187,84 @@ private String getFileWithNormalizedPath(URL url)

return file;
}

/**
 * Normalize the percent-encoded octets found in the path (file) part of a
 * URL.
 * <p>
 * Every <code>%XX</code> sequence matched by <code>unescapeRulePattern</code>
 * is handled in one of two ways:
 * </p>
 * <ul>
 * <li>if it encodes an ASCII character flagged in
 * <code>unescapedCharacters</code> (the unreserved characters of <a
 * href="https://tools.ietf.org/html/rfc3986#section-2.3">RFC 3986, section
 * 2.3</a>), it is replaced by that character;</li>
 * <li>otherwise it stays percent-encoded, with its hexadecimal digits
 * converted to upper case.</li>
 * </ul>
 * Text outside of percent-encoded sequences is copied unchanged.
 *
 * @param path
 *          path (file) part of a URL, must not be <code>null</code>
 * @return the path with normalized percent-encoding
 */
private String unescapePath(String path) {
  StringBuilder sb = new StringBuilder(path.length());
  Matcher matcher = unescapeRulePattern.matcher(path);

  // index of the first character of path not yet copied to the output
  int copiedUpTo = 0;

  while (matcher.find()) {
    // copy the literal text between the previous match and this one
    sb.append(path, copiedUpTo, matcher.start());

    // group 1 holds the two hex digits, i.e. the value of the encoded octet
    int octet = Integer.parseInt(matcher.group(1), 16);

    if (octet < unescapedCharacters.length && unescapedCharacters[octet]) {
      // unreserved character: must not stay percent-encoded
      sb.append((char) octet);
    } else {
      // keep the escape sequence but normalize its hex digits to upper case
      sb.append(matcher.group().toUpperCase(Locale.ROOT));
    }

    copiedUpTo = matcher.end();
  }

  // copy whatever follows the last match (the whole path if none matched)
  sb.append(path, copiedUpTo, path.length());

  return sb.toString();
}

/**
 * Percent-encode the characters in the path (file) part of a URL which must
 * not appear literally in a URI, see <a
 * href="https://tools.ietf.org/html/rfc3986#section-2">RFC 3986, section
 * 2</a>.
 * <p>
 * The path is converted to UTF-8 and processed byte by byte. The following
 * bytes are written as <code>%XX</code> with two upper-case hexadecimal
 * digits:
 * </p>
 * <ul>
 * <li>all bytes outside of the ASCII range (0x80-0xFF), i.e. the bytes of
 * multi-byte UTF-8 sequences,</li>
 * <li>the control characters 0x00-0x1F and DEL (0x7F),</li>
 * <li>the space character (0x20),</li>
 * <li>the square brackets <code>[</code> and <code>]</code>.</li>
 * </ul>
 * All other bytes are copied as they are. This includes <code>%</code>, so
 * percent-encoded sequences already present in the path are not encoded a
 * second time.
 *
 * @param path
 *          path (file) part of a URL, must not be <code>null</code>
 * @return the path with the characters listed above percent-encoded
 */
private String escapePath(String path) {
  // fixed table instead of Integer.toHexString(..).toUpperCase(): always two
  // digits, always upper case, independent of the default locale
  final String hexDigits = "0123456789ABCDEF";

  StringBuilder sb = new StringBuilder(path.length());

  for (byte b : path.getBytes(utf8)) {
    // bytes are signed in Java: work on the unsigned value 0..255
    int octet = b & 0xFF;

    boolean nonAscii = octet >= 0x80;
    boolean controlOrSpace = octet <= 0x20 || octet == 0x7F;
    boolean squareBracket = octet == '[' || octet == ']';

    if (nonAscii || controlOrSpace || squareBracket) {
      sb.append('%');
      sb.append(hexDigits.charAt(octet >>> 4));
      sb.append(hexDigits.charAt(octet & 0x0F));
    } else {
      // printable ASCII character which is kept as-is
      sb.append((char) octet);
    }
  }

  return sb.toString();
}

public static void main(String args[]) throws IOException {
BasicURLNormalizer normalizer = new BasicURLNormalizer();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,72 @@ public TestBasicURLNormalizer() {
conf = NutchConfiguration.create();
normalizer.setConf(conf);
}

@Test
public void testNUTCH1098() throws Exception {
  // each row: { URL to normalize, expected normalized URL }
  String[][] cases = {
      // percent-encoded unreserved characters are decoded
      { "http://foo.com/%66oo.html", "http://foo.com/foo.html" },

      // decoding also works at (or right before) the end of the URL
      { "http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html" },
      { "http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html" },

      // adjacent encoded characters are decoded independently
      { "http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html" },

      // encoded bytes with the high bit set stay encoded
      { "http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0" },

      // encoded control characters stay encoded
      { "http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A" },

      // hex digits of remaining escape sequences become upper case
      { "http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0" },

      // an encoded space stays encoded
      { "http://foo.com/you%20too.html", "http://foo.com/you%20too.html" },

      // a literal space is encoded as %20
      { "http://foo.com/you too.html", "http://foo.com/you%20too.html" },

      // an encoded '#' is not decoded
      { "http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz" },

      // an encoded '/' is not decoded
      { "http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz" },

      // literal control characters are encoded
      { "http://foo.com/\u001a!", "http://foo.com/%1A!" },

      // ... and always with two hex digits
      { "http://foo.com/\u0001!", "http://foo.com/%01!" },

      // non-ASCII (Spanish) characters are encoded as UTF-8 bytes
      { "http://mydomain.com/en Espa\u00F1ol.aspx",
          "http://mydomain.com/en%20Espa%C3%B1ol.aspx" } };

  for (String[] testCase : cases) {
    normalizeTest(testCase[0], testCase[1]);
  }
}

@Test
public void testNUTCH2064() throws Exception {
  // each row: { URL to normalize, expected normalized URL }
  String[][] cases = {
      // encoded ampersand, colon, slash, plus and other punctuation
      // characters must stay encoded
      { "http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10" },
      { "http://x.com/show?http%3A%2F%2Fx.com%2Fb",
          "http://x.com/show?http%3A%2F%2Fx.com%2Fb" },
      { "http://google.com/search?q=c%2B%2B",
          "http://google.com/search?q=c%2B%2B" },

      // a '+' in the query part (application/x-www-form-urlencoded) is kept
      { "http://x.com/s?q=a+b", "http://x.com/s?q=a+b" },

      // internationalized domain names are kept: the host may be written
      // in punycode (xn--bcher-kva.de) but must never be percent-encoded
      { "http://b\u00fccher.de/", "http://b\u00fccher.de/" },

      // percent-encoding works together with the other normalizations
      { "http://x.com/./a/../%66.html", "http://x.com/f.html" },

      // '[' and ']' are escaped as well
      { "http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1" },

      // boundary: first character outside of the ASCII range (U+0080)
      { "http://x.com/foo\u0080", "http://x.com/foo%C2%80" },
      { "http://x.com/foo%c2%80", "http://x.com/foo%C2%80" } };

  for (String[] testCase : cases) {
    normalizeTest(testCase[0], testCase[1]);
  }
}

@Test
public void testNormalizer() throws Exception {
Expand Down

0 comments on commit ed76030

Please sign in to comment.