From 6ed99b97164d1bdb463b3bdbfc0507e0e603949e Mon Sep 17 00:00:00 2001 From: Dominic Evans Date: Wed, 4 Mar 2015 16:21:46 +0000 Subject: [PATCH] PROTON-576: modified UTF-8 encoder fixes Commit 5069bb6 applied a modified version of a patch I submitted, to ensure that the UTF-8 encoder (and UTF-8 byte length calculator) would cope with surrogate pairs. This commit fixes an issue with three byte characters in the <= 0xFFFF range being incorrectly detected as invalid four byte surrogates. --- .../java/org/apache/qpid/proton/codec/EncoderImpl.java | 7 ++----- .../java/org/apache/qpid/proton/codec/StringType.java | 8 ++++++-- .../java/org/apache/qpid/proton/codec/StringTypeTest.java | 3 ++- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java b/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java index d681ffe10f..77f0efc07f 100644 --- a/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java +++ b/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java @@ -788,7 +788,7 @@ else if ((c & 0xF800) == 0) /* U+0080..U+07FF */ _buffer.put((byte)(0xC0 | ((c >> 6) & 0x1F))); _buffer.put((byte)(0x80 | (c & 0x3F))); } - else if ((c & 0xD800) != 0xD800) /* U+0800..U+FFFF - excluding surrogate pairs */ + else if ((c & 0xD800) != 0xD800 || (c & 0xDC00) == 0xDC00) /* U+0800..U+FFFF - excluding surrogate pairs */ { _buffer.put((byte)(0xE0 | ((c >> 12) & 0x0F))); _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F))); @@ -798,7 +798,7 @@ else if ((c & 0xD800) != 0xD800) /* U+0800..U+FFFF - excluding surrogate pai { int low; - if(((c & 0xDC00) == 0xDC00) || (++i == length) || ((low = string.charAt(i)) & 0xDC00) != 0xDC00) + if((++i == length) || ((low = string.charAt(i)) & 0xDC00) != 0xDC00) { throw new IllegalArgumentException("String contains invalid Unicode code points"); } @@ -812,7 +812,4 @@ else if ((c & 0xD800) != 0xD800) /* U+0800..U+FFFF - excluding surrogate pai } } } - - - } diff --git a/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java b/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java index aa988f9065..092894d892 100644 --- a/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java +++ b/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java @@ -93,10 +93,14 @@ static int calculateUTF8Length(final String s) if ((c & 0xFF80) != 0) /* U+0080.. */ { len++; - // surrogate pairs should always combine to create a code point with a 4 octet representation - if(((c & 0xF800) != 0) && ((c & 0xD800) != 0xD800)) /* U+0800.. excluding surrogate pairs */ + if(((c & 0xF800) != 0)) /* U+0800.. */ { len++; + // surrogate pairs should always combine to create a code point with a 4 octet representation + if ((c & 0xD800) == 0xD800) + { + i++; + } } } } diff --git a/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java b/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java index 550386a849..7d78f655af 100644 --- a/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java +++ b/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java @@ -29,8 +29,8 @@ import java.util.HashSet; import java.util.Set; -import org.junit.Test; import org.apache.qpid.proton.amqp.messaging.AmqpValue; +import org.junit.Test; /** * Test the encoding and decoding of {@link StringType} values. @@ -140,6 +140,7 @@ private Set generateTestData() UnicodeBlock.MUSICAL_SYMBOLS, /*UnicodeBlock.EMOTICONS,*/ /*UnicodeBlock.PLAYING_CARDS,*/ + UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS, UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A, UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B)); }