From 8d81c48a6925e35f8b19b18237da6e6452f5717c Mon Sep 17 00:00:00 2001 From: zentol Date: Tue, 10 Jun 2014 21:28:34 +0200 Subject: [PATCH 1/8] Serialized String comparison, Unicode support --- .../typeutils/base/StringComparator.java | 4 +- .../typeutils/base/StringSerializer.java | 6 +- .../eu/stratosphere/types/StringValue.java | 191 +++++++++++++++++- .../typeutils/base/StringSerializerTest.java | 6 +- 4 files changed, 199 insertions(+), 8 deletions(-) diff --git a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java index b14a262ee8bb0..c44c034254b3d 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java @@ -39,9 +39,7 @@ public StringComparator(boolean ascending) { @Override public int compare(DataInputView firstSource, DataInputView secondSource) throws IOException { - String s1 = StringValue.readString(firstSource); - String s2 = StringValue.readString(secondSource); - int comp = s1.compareTo(s2); + int comp = StringValue.compareUnicode(firstSource, secondSource); return ascendingComparison ? comp : -comp; } diff --git a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java index 8cf8b4788d931..4af1bfbd5e855 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java @@ -57,16 +57,16 @@ public int getLength() { @Override public void serialize(String record, DataOutputView target) throws IOException { - StringValue.writeString(record, target); + StringValue.writeUnicode(record, target); } @Override public String deserialize(String record, DataInputView source) throws IOException { - return StringValue.readString(source); + return StringValue.readUnicode(source); } @Override public void copy(DataInputView source, DataOutputView target) throws IOException { - StringValue.copyString(source, target); + StringValue.copyUnicode(source, target); } } diff --git a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java index cc970e84852c6..b21def2d071d8 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java @@ -50,7 +50,6 @@ public class StringValue implements NormalizableKey, CharSequence, private static final int HIGH_BIT2_MASK = 0x3 << 6; - private char[] value; // character value of the string value, not necessarily completely filled private int len; // length of the string value @@ -835,4 +834,194 @@ public static final void copyString(DataInput in, DataOutput out) throws IOExcep } } } + + /** + Writes a CharSequence as a variable-length encoded Unicode String. + @param cs CharSequence to write + @param out output channel + @throws IOException + */ + public static final void writeUnicode(CharSequence cs, DataOutput out) throws IOException { + if (cs == null) { + writeLength(0, out); + } else { + writeLength(Character.codePointCount(cs, 0, cs.length()) + 1, out); + for (int i = 0; i < Character.codePointCount(cs, 0, cs.length()); i++) { + int c = Character.codePointAt(cs, i); + if (c >= 65536) { + //Non-BMP Unicode character, two characters are treated as one + i++; + } + + int shift = 0; + int count = 0; + //determine number of bytes needed + while (c >= (HIGH_BIT << (shift - count))) { + shift += 8; + count++; + } + //write bytes + while (shift >= 0) { + if (shift == 0) { + out.write(c & 0x7F); + } else { + out.write((c >>> (shift - count)) | 0x80); + } + shift -= 8; + count--; + } + } + } + + } + + private static void writeLength(int lenToWrite, DataOutput out) throws IOException { + // the length we write is offset by one, because a length of zero indicates a null value + if (lenToWrite < 0) { + throw new IllegalArgumentException("CharSequence is too long."); + } + + // write the length, variable-length encoded + while (lenToWrite >= HIGH_BIT) { + out.write(lenToWrite | HIGH_BIT); + lenToWrite >>>= 7; + } + out.write(lenToWrite); + } + + /** + Reads and returns a variable-length encoded Unicode String. + @param in input channel + @return Unicode String + @throws IOException + */ + public static final String readUnicode(DataInput in) throws IOException { + int len = readLength(in); + + if(len==0){ + return ""; + } + + final int[] data = new int[len]; + + for (int i = 0; i < len; i++) { + int r = 0; + int c; + while ((c = in.readUnsignedByte()) >= HIGH_BIT) { + c &= 0x7F; + r |= c; + r <<= 7; + } + r |= c; + + data[i] = r; + } + return new String(data, 0, len); + } + + private static int readLength(DataInput in) throws IOException { + // the length we read is offset by one, because a length of zero indicates a null value + int len = in.readUnsignedByte(); + + if (len == 0) { + return 0; + } + + if (len >= HIGH_BIT) { + int shift = 7; + int curr; + len = len & 0x7f; + while ((curr = in.readUnsignedByte()) >= HIGH_BIT) { + len |= (curr & 0x7f) << shift; + shift += 7; + } + len |= curr << shift; + } + + // subtract one for the null length + len -= 1; + + return len; + } + + /** + Copies a serialized variable-length encoded Unicode String. + @param in input channel + @param out output channel + @throws IOException + */ + public static final void copyUnicode(DataInput in, DataOutput out) throws IOException { + //copy length + int length = readLength(in); + // the length we write is offset by one, because a length of zero indicates a null value + writeLength(length + 1, out); + + //copy data + for (int i = 0; i < length; i++) { + int c; + while ((c = in.readUnsignedByte()) >= HIGH_BIT) { + out.writeByte(c); + } + out.writeByte(c); + } + } + + /** + Compares two serialized variable-length encoded Unicode String. + @param firstSource input channel + @param secondSource input channel + @return A negative value if the first String is less than the second, 0 if equal, a positive value if greater. + @throws IOException + */ + public static final int compareUnicode(DataInputView firstSource, DataInputView secondSource) throws IOException { + int lengthFirst = readLength(firstSource); + int lengthSecond = readLength(secondSource); + + for (int i = 0; i < Math.min(lengthFirst, lengthSecond); i++) { + int byteCountFirst = 0; //# of bytes read for the first character + int byteCountSecond = 0; //# of bytes read for the second character + int cmp = 0; //comparison result in case both chars have the same # of bytes + boolean byteIncoming = false; //another byte can be read for one character + + int charByteFirst = firstSource.readUnsignedByte(); + int charByteSecond = secondSource.readUnsignedByte(); + + if (charByteFirst >= HIGH_BIT) { + byteCountFirst++; + byteIncoming = true; + } + if (charByteSecond >= HIGH_BIT) { + byteCountSecond++; + byteIncoming = true; + } + cmp = (charByteFirst & 0x7F) - (charByteSecond & 0x7F); + + while (byteIncoming) {//another byte can be read for at least one character + byteIncoming = false; + if (byteCountFirst == byteCountSecond) { //both chars have the same length so far + charByteFirst = firstSource.readUnsignedByte(); + charByteSecond = secondSource.readUnsignedByte(); + + if (charByteFirst >= HIGH_BIT) { + byteCountFirst++; + byteIncoming = true; + } + if (charByteSecond >= HIGH_BIT) { + byteCountSecond++; + byteIncoming = true; + } + if (cmp == 0) { + cmp = (charByteFirst & 0x7F) - (charByteSecond & 0x7F); + } + } else { //one character has a bigger # of bytes => is greater than the other + return byteCountFirst - byteCountSecond; + } + } //both chars reached their end and have the same length + if (cmp != 0) { + return cmp; + } + } + //the first min(lengthFirst, lengthSecond) characterss are equal, longer String > shorter String + return lengthFirst - lengthSecond; + } } diff --git a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java index 1886bcfab964a..a07507401dff5 100644 --- a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java +++ b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java @@ -16,6 +16,8 @@ import eu.stratosphere.api.common.typeutils.SerializerTestBase; import eu.stratosphere.api.common.typeutils.TypeSerializer; +import eu.stratosphere.util.StringUtils; +import java.util.Random; /** * A test for the {@link StringSerializer}. @@ -39,6 +41,8 @@ protected Class getTypeClass() { @Override protected String[] getTestData() { - return new String[] {"a", "", "bcd", "jbmbmner8 jhk hj \n \t üäßß@µ", "", "non-empty"}; + Random rnd = new Random(289347567856686223L); + return new String[] {StringUtils.getRandomString(rnd, 300, 350), new String(Character.toChars(127315)), + "a", "", "bcd", "jbmbmner8 jhk hj \n \t üäßß@µ", "", "non-empty"}; } } From f89ba53d1001ded2203333819f833b4c44084090 Mon Sep 17 00:00:00 2001 From: zentol Date: Fri, 13 Jun 2014 15:47:51 +0200 Subject: [PATCH 2/8] tests added --- .../api/common/typeutils/base/StringComparatorTest.java | 6 +++--- .../api/common/typeutils/base/StringSerializerTest.java | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java index 6b1fdb0fafc86..aa6c271e62a45 100644 --- a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java +++ b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java @@ -17,8 +17,6 @@ import eu.stratosphere.api.common.typeutils.ComparatorTestBase; import eu.stratosphere.api.common.typeutils.TypeComparator; import eu.stratosphere.api.common.typeutils.TypeSerializer; -import eu.stratosphere.api.common.typeutils.base.StringComparator; -import eu.stratosphere.api.common.typeutils.base.StringSerializer; public class StringComparatorTest extends ComparatorTestBase { @@ -42,7 +40,9 @@ protected String[] getSortedTestData() { "abce", "abdd", "accd", - "bbcd" + "bbcd", + "bbcde", + ((char)128)+""+((char)32896) }; } } diff --git a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java index a07507401dff5..e7bc0321b7f75 100644 --- a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java +++ b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java @@ -42,7 +42,9 @@ protected Class getTypeClass() { @Override protected String[] getTestData() { Random rnd = new Random(289347567856686223L); - return new String[] {StringUtils.getRandomString(rnd, 300, 350), new String(Character.toChars(127315)), + return new String[] { + StringUtils.getRandomString(rnd, 300, 350), new String(Character.toChars(127315)), + (char)128+(char)32896+"", // flag collisions "a", "", "bcd", "jbmbmner8 jhk hj \n \t üäßß@µ", "", "non-empty"}; } } From 6bb2bac9c0b5d40fb214e22180a30d92d58354ee Mon Sep 17 00:00:00 2001 From: zentol Date: Fri, 13 Jun 2014 16:48:37 +0200 Subject: [PATCH 3/8] minor changes --- .../eu/stratosphere/types/StringValue.java | 20 ++++++++++++++----- .../typeutils/base/StringComparatorTest.java | 3 ++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java index b21def2d071d8..8fe2a2ffaf192 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java @@ -874,9 +874,14 @@ public static final void writeUnicode(CharSequence cs, DataOutput out) throws IO } } - + + /** + Writes the given int variable-length encoded to the given DataOutput. NOT adjusted for null offset. + @param lenToWrite int to write + @param out output + @throws IOException + */ private static void writeLength(int lenToWrite, DataOutput out) throws IOException { - // the length we write is offset by one, because a length of zero indicates a null value if (lenToWrite < 0) { throw new IllegalArgumentException("CharSequence is too long."); } @@ -908,8 +913,7 @@ public static final String readUnicode(DataInput in) throws IOException { int r = 0; int c; while ((c = in.readUnsignedByte()) >= HIGH_BIT) { - c &= 0x7F; - r |= c; + r |= (c & 0x7F) ; r <<= 7; } r |= c; @@ -918,7 +922,13 @@ public static final String readUnicode(DataInput in) throws IOException { } return new String(data, 0, len); } - + + /** + Reads a variable-length encoded int from the given DataInput. Adjusted for null offset. + @param in input + @return read int + @throws IOException + */ private static int readLength(DataInput in) throws IOException { // the length we read is offset by one, because a length of zero indicates a null value int len = in.readUnsignedByte(); diff --git a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java index aa6c271e62a45..0f9ac7e5ee493 100644 --- a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java +++ b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringComparatorTest.java @@ -42,7 +42,8 @@ protected String[] getSortedTestData() { "accd", "bbcd", "bbcde", - ((char)128)+""+((char)32896) + ((char)128)+""+((char)32896), + ((char)128)+""+((char)32897) }; } } From 5ba0cc07fa33285426827e7079e220661aae3d6c Mon Sep 17 00:00:00 2001 From: zentol Date: Mon, 16 Jun 2014 13:02:06 +0200 Subject: [PATCH 4/8] even more tests --- .../api/common/typeutils/base/StringSerializerTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java index e7bc0321b7f75..3e89ce9c3b872 100644 --- a/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java +++ b/stratosphere-core/src/test/java/eu/stratosphere/api/common/typeutils/base/StringSerializerTest.java @@ -44,7 +44,9 @@ protected String[] getTestData() { Random rnd = new Random(289347567856686223L); return new String[] { StringUtils.getRandomString(rnd, 300, 350), new String(Character.toChars(127315)), - (char)128+(char)32896+"", // flag collisions + (char)128+(char)32896+"", ""+(char)24640+(char)24640+(char)24640+(char)24640+(char)24640, + ""+(char)65535+(char)65535+(char)65535+(char)65535+(char)65535, + ""+(char)0, "a", "", "bcd", "jbmbmner8 jhk hj \n \t üäßß@µ", "", "non-empty"}; } } From cb42d5da9603ace39ba47d79e9860977a72fd2b8 Mon Sep 17 00:00:00 2001 From: zentol Date: Wed, 18 Jun 2014 22:53:46 +0200 Subject: [PATCH 5/8] simplified compare --- .../eu/stratosphere/types/StringValue.java | 71 ++++++------------- 1 file changed, 22 insertions(+), 49 deletions(-) diff --git a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java index 8fe2a2ffaf192..7e5ecfabd455d 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java @@ -910,19 +910,28 @@ public static final String readUnicode(DataInput in) throws IOException { final int[] data = new int[len]; for (int i = 0; i < len; i++) { - int r = 0; - int c; - while ((c = in.readUnsignedByte()) >= HIGH_BIT) { - r |= (c & 0x7F) ; - r <<= 7; - } - r |= c; - - data[i] = r; + data[i] = readUnicodeChar(in); } return new String(data, 0, len); } + /** + Reads and returns a variable-length encoded Unicode Character. + @param in input channel + @return Unicode Character + @throws IOException + */ + private static int readUnicodeChar(DataInput in) throws IOException { + int r = 0; + int c; + while ((c = in.readUnsignedByte()) >= HIGH_BIT) { + r |= (c & 0x7F) ; + r <<= 7; + } + r |= c; + return r; + } + /** Reads a variable-length encoded int from the given DataInput. Adjusted for null offset. @param in input @@ -987,46 +996,10 @@ public static final int compareUnicode(DataInputView firstSource, DataInputView int lengthFirst = readLength(firstSource); int lengthSecond = readLength(secondSource); - for (int i = 0; i < Math.min(lengthFirst, lengthSecond); i++) { - int byteCountFirst = 0; //# of bytes read for the first character - int byteCountSecond = 0; //# of bytes read for the second character - int cmp = 0; //comparison result in case both chars have the same # of bytes - boolean byteIncoming = false; //another byte can be read for one character - - int charByteFirst = firstSource.readUnsignedByte(); - int charByteSecond = secondSource.readUnsignedByte(); - - if (charByteFirst >= HIGH_BIT) { - byteCountFirst++; - byteIncoming = true; - } - if (charByteSecond >= HIGH_BIT) { - byteCountSecond++; - byteIncoming = true; - } - cmp = (charByteFirst & 0x7F) - (charByteSecond & 0x7F); - - while (byteIncoming) {//another byte can be read for at least one character - byteIncoming = false; - if (byteCountFirst == byteCountSecond) { //both chars have the same length so far - charByteFirst = firstSource.readUnsignedByte(); - charByteSecond = secondSource.readUnsignedByte(); - - if (charByteFirst >= HIGH_BIT) { - byteCountFirst++; - byteIncoming = true; - } - if (charByteSecond >= HIGH_BIT) { - byteCountSecond++; - byteIncoming = true; - } - if (cmp == 0) { - cmp = (charByteFirst & 0x7F) - (charByteSecond & 0x7F); - } - } else { //one character has a bigger # of bytes => is greater than the other - return byteCountFirst - byteCountSecond; - } - } //both chars reached their end and have the same length + for (int i = 0; i < Math.min(lengthFirst, lengthSecond); i++) { + int c1 = readUnicodeChar(firstSource); + int c2 = readUnicodeChar(secondSource); + int cmp = c1 - c2; if (cmp != 0) { return cmp; } From 55269286d031761567c874b13af8ff7e9f563dec Mon Sep 17 00:00:00 2001 From: zentol Date: Wed, 18 Jun 2014 23:42:00 +0200 Subject: [PATCH 6/8] small method name change --- .../api/common/typeutils/base/StringComparator.java | 2 +- .../api/common/typeutils/base/StringSerializer.java | 6 +++--- .../src/main/java/eu/stratosphere/types/StringValue.java | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java index c44c034254b3d..7591b3f271858 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringComparator.java @@ -39,7 +39,7 @@ public StringComparator(boolean ascending) { @Override public int compare(DataInputView firstSource, DataInputView secondSource) throws IOException { - int comp = StringValue.compareUnicode(firstSource, secondSource); + int comp = StringValue.compareUnicodeString(firstSource, secondSource); return ascendingComparison ? comp : -comp; } diff --git a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java index 4af1bfbd5e855..3f2233289e368 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/api/common/typeutils/base/StringSerializer.java @@ -57,16 +57,16 @@ public int getLength() { @Override public void serialize(String record, DataOutputView target) throws IOException { - StringValue.writeUnicode(record, target); + StringValue.writeUnicodeString(record, target); } @Override public String deserialize(String record, DataInputView source) throws IOException { - return StringValue.readUnicode(source); + return StringValue.readUnicodeString(source); } @Override public void copy(DataInputView source, DataOutputView target) throws IOException { - StringValue.copyUnicode(source, target); + StringValue.copyUnicodeString(source, target); } } diff --git a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java index 7e5ecfabd455d..999b44928def3 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java @@ -841,7 +841,7 @@ public static final void copyString(DataInput in, DataOutput out) throws IOExcep @param out output channel @throws IOException */ - public static final void writeUnicode(CharSequence cs, DataOutput out) throws IOException { + public static final void writeUnicodeString(CharSequence cs, DataOutput out) throws IOException { if (cs == null) { writeLength(0, out); } else { @@ -900,7 +900,7 @@ private static void writeLength(int lenToWrite, DataOutput out) throws IOExcepti @return Unicode String @throws IOException */ - public static final String readUnicode(DataInput in) throws IOException { + public static final String readUnicodeString(DataInput in) throws IOException { int len = readLength(in); if(len==0){ @@ -969,7 +969,7 @@ private static int readLength(DataInput in) throws IOException { @param out output channel @throws IOException */ - public static final void copyUnicode(DataInput in, DataOutput out) throws IOException { + public static final void copyUnicodeString(DataInput in, DataOutput out) throws IOException { //copy length int length = readLength(in); // the length we write is offset by one, because a length of zero indicates a null value @@ -992,7 +992,7 @@ public static final void copyUnicode(DataInput in, DataOutput out) throws IOExce @return A negative value if the first String is less than the second, 0 if equal, a positive value if greater. @throws IOException */ - public static final int compareUnicode(DataInputView firstSource, DataInputView secondSource) throws IOException { + public static final int compareUnicodeString(DataInputView firstSource, DataInputView secondSource) throws IOException { int lengthFirst = readLength(firstSource); int lengthSecond = readLength(secondSource); From e7486d628e2a0d93592e40b9f0390351670615f8 Mon Sep 17 00:00:00 2001 From: zentol Date: Fri, 20 Jun 2014 14:49:09 +0200 Subject: [PATCH 7/8] yet another rework --- .../eu/stratosphere/types/StringValue.java | 96 +++++++++++-------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java index 999b44928def3..83c1a4f092d01 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java @@ -837,6 +837,7 @@ public static final void copyString(DataInput in, DataOutput out) throws IOExcep /** Writes a CharSequence as a variable-length encoded Unicode String. + Supports Unicode characters up to 22 bits. @param cs CharSequence to write @param out output channel @throws IOException @@ -852,27 +853,9 @@ public static final void writeUnicodeString(CharSequence cs, DataOutput out) thr //Non-BMP Unicode character, two characters are treated as one i++; } - - int shift = 0; - int count = 0; - //determine number of bytes needed - while (c >= (HIGH_BIT << (shift - count))) { - shift += 8; - count++; - } - //write bytes - while (shift >= 0) { - if (shift == 0) { - out.write(c & 0x7F); - } else { - out.write((c >>> (shift - count)) | 0x80); - } - shift -= 8; - count--; - } + writeUnicodeChar(c, out); } } - } /** @@ -894,6 +877,42 @@ private static void writeLength(int lenToWrite, DataOutput out) throws IOExcepti out.write(lenToWrite); } + /** + Writes a variable-length encoded Unicode Character. + @param c char to write + @param out output channel + @return Unicode Character + @throws IOException + */ + private static void writeUnicodeChar(int c, DataOutput out) throws IOException { + int shift = 0; + + while (c >= (HIGH_BIT << shift)) { + shift += 5; + } + + while (shift >= 0) { + switch (shift) { + case 0: + out.write(c & 0x7F); + shift -= 1; + break; + case 5: + out.write(((c >> shift + 2) | 0x80) & 0x9F); + shift -= 5; + break; + case 10: + out.write(((c >> shift + 2) | 0xA0) & 0xBF); + shift -= 5; + break; + case 15: + out.write(((c >> shift + 2) | 0xC0) & 0xDF); + shift -= 5; + break; + } + } + } + /** Reads and returns a variable-length encoded Unicode String. @param in input channel @@ -915,23 +934,6 @@ public static final String readUnicodeString(DataInput in) throws IOException { return new String(data, 0, len); } - /** - Reads and returns a variable-length encoded Unicode Character. - @param in input channel - @return Unicode Character - @throws IOException - */ - private static int readUnicodeChar(DataInput in) throws IOException { - int r = 0; - int c; - while ((c = in.readUnsignedByte()) >= HIGH_BIT) { - r |= (c & 0x7F) ; - r <<= 7; - } - r |= c; - return r; - } - /** Reads a variable-length encoded int from the given DataInput. Adjusted for null offset. @param in input @@ -962,7 +964,25 @@ private static int readLength(DataInput in) throws IOException { return len; } - + + /** + Reads and returns a variable-length encoded Unicode Character. + @param in input channel + @return Unicode Character + @throws IOException + */ + private static int readUnicodeChar(DataInput in) throws IOException { + int r = 0; + int c; + while ((c = in.readUnsignedByte()) >= HIGH_BIT) { + r |= (c & 0x1F); + r <<= 5; + } + r <<= 2; + r |= c; + return r; + } + /** Copies a serialized variable-length encoded Unicode String. @param in input channel @@ -1007,4 +1027,4 @@ public static final int compareUnicodeString(DataInputView firstSource, DataInpu //the first min(lengthFirst, lengthSecond) characterss are equal, longer String > shorter String return lengthFirst - lengthSecond; } -} +} \ No newline at end of file From 85c69dda8580531b4bde3e64a36d6ba2a07af4a6 Mon Sep 17 00:00:00 2001 From: zentol Date: Fri, 20 Jun 2014 17:22:30 +0200 Subject: [PATCH 8/8] further adjustments, comparisons on bytes --- .../eu/stratosphere/types/StringValue.java | 63 ++++++++++--------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java index 83c1a4f092d01..48572a5610658 100644 --- a/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java +++ b/stratosphere-core/src/main/java/eu/stratosphere/types/StringValue.java @@ -46,6 +46,8 @@ public class StringValue implements NormalizableKey, CharSequence, private static final int HIGH_BIT = 0x1 << 7; + private static final int END_BYTE = 0x60; + private static final int HIGH_BIT2 = 0x1 << 13; private static final int HIGH_BIT2_MASK = 0x3 << 6; @@ -847,7 +849,7 @@ public static final void writeUnicodeString(CharSequence cs, DataOutput out) thr writeLength(0, out); } else { writeLength(Character.codePointCount(cs, 0, cs.length()) + 1, out); - for (int i = 0; i < Character.codePointCount(cs, 0, cs.length()); i++) { + for (int i = 0; i < cs.length(); i++) { int c = Character.codePointAt(cs, i); if (c >= 65536) { //Non-BMP Unicode character, two characters are treated as one @@ -856,6 +858,8 @@ public static final void writeUnicodeString(CharSequence cs, DataOutput out) thr writeUnicodeChar(c, out); } } + //end-of-string byte + out.write(END_BYTE); } /** @@ -894,19 +898,19 @@ private static void writeUnicodeChar(int c, DataOutput out) throws IOException { while (shift >= 0) { switch (shift) { case 0: - out.write(c & 0x7F); + out.write(c | 0x80); shift -= 1; break; case 5: - out.write(((c >> shift + 2) | 0x80) & 0x9F); + out.write((c >> shift + 2) & 0x1F); shift -= 5; break; case 10: - out.write(((c >> shift + 2) | 0xA0) & 0xBF); + out.write(((c >> shift + 2) | 0x20) & 0x3F); shift -= 5; break; case 15: - out.write(((c >> shift + 2) | 0xC0) & 0xDF); + out.write(((c >> shift + 2) | 0x40) & 0x5F); shift -= 5; break; } @@ -922,15 +926,13 @@ private static void writeUnicodeChar(int c, DataOutput out) throws IOException { public static final String readUnicodeString(DataInput in) throws IOException { int len = readLength(in); - if(len==0){ - return ""; - } - final int[] data = new int[len]; for (int i = 0; i < len; i++) { data[i] = readUnicodeChar(in); } + //end-of-string byte + in.readUnsignedByte(); return new String(data, 0, len); } @@ -974,12 +976,12 @@ private static int readLength(DataInput in) throws IOException { private static int readUnicodeChar(DataInput in) throws IOException { int r = 0; int c; - while ((c = in.readUnsignedByte()) >= HIGH_BIT) { + while ((c = in.readUnsignedByte()) < HIGH_BIT) { r |= (c & 0x1F); r <<= 5; } r <<= 2; - r |= c; + r |= (c & 0x7F); return r; } @@ -997,34 +999,35 @@ public static final void copyUnicodeString(DataInput in, DataOutput out) throws //copy data for (int i = 0; i < length; i++) { - int c; - while ((c = in.readUnsignedByte()) >= HIGH_BIT) { - out.writeByte(c); - } - out.writeByte(c); + writeUnicodeChar(readUnicodeChar(in),out); } + //end-of-string byte + in.readUnsignedByte(); + out.writeByte(END_BYTE); } /** Compares two serialized variable-length encoded Unicode String. - @param firstSource input channel - @param secondSource input channel + @param first input channel + @param second input channel @return A negative value if the first String is less than the second, 0 if equal, a positive value if greater. @throws IOException */ - public static final int compareUnicodeString(DataInputView firstSource, DataInputView secondSource) throws IOException { - int lengthFirst = readLength(firstSource); - int lengthSecond = readLength(secondSource); - - for (int i = 0; i < Math.min(lengthFirst, lengthSecond); i++) { - int c1 = readUnicodeChar(firstSource); - int c2 = readUnicodeChar(secondSource); - int cmp = c1 - c2; - if (cmp != 0) { - return cmp; - } + public static final int compareUnicodeString(DataInputView first, DataInputView second) throws IOException { + int lengthFirst = readLength(first); + int lengthSecond = readLength(second); + byte c1, c2; + int cmp = 0; + for( + int x = 0; + (c1=first.readByte()) < END_BYTE && + (c2=second.readByte()) < END_BYTE && + (cmp=c1-c2) == 0; + x++); + if (cmp!=0){ + return cmp; } - //the first min(lengthFirst, lengthSecond) characterss are equal, longer String > shorter String + //the first min(lengthFirst, lengthSecond) characters are equal, longer String > shorter String return lengthFirst - lengthSecond; } } \ No newline at end of file