From ba451c5a51e4aca6fd03cccb0ed976dcc3d90fe3 Mon Sep 17 00:00:00 2001 From: Shawn Yang Date: Wed, 24 Apr 2024 21:39:19 +0800 Subject: [PATCH] feat(spec/java): add strip flag in meta string encoding spec (#1565) ## What does this PR do? add strip flag in meta string encoding spec ## Related issues #1540 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --- docs/specification/java_serialization_spec.md | 10 +-- .../specification/xlang_serialization_spec.md | 10 +-- .../java/org/apache/fury/meta/MetaString.java | 40 +++++------- .../apache/fury/meta/MetaStringDecoder.java | 44 +++++++------ .../apache/fury/meta/MetaStringEncoder.java | 62 ++++++------------- .../apache/fury/resolver/MetaStringBytes.java | 14 +---- .../fury/resolver/MetaStringResolver.java | 2 - .../org/apache/fury/meta/MetaStringTest.java | 32 +++------- 8 files changed, 81 insertions(+), 133 deletions(-) diff --git a/docs/specification/java_serialization_spec.md b/docs/specification/java_serialization_spec.md index b05af49dd1..242a141672 100644 --- a/docs/specification/java_serialization_spec.md +++ b/docs/specification/java_serialization_spec.md @@ -223,11 +223,11 @@ Meta string is mainly used to encode meta strings such as class name and field n String binary encoding algorithm: -| Algorithm | Pattern | Description | -|---------------------------|--------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101` | -| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9[c1,c2]` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `c1,c2`: `0b111110~0b111111`, 
`c1,c2` should be two of `._$` | -| UTF-8 | any chars | UTF-8 encoding | +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether to strip the last char, since the last byte may have up to 7 redundant bits (1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether to strip the last char, since the last byte may have up to 7 redundant bits (1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | Encoding flags: diff --git a/docs/specification/xlang_serialization_spec.md b/docs/specification/xlang_serialization_spec.md index dd8c672ea8..bbfdc76e40 100644 --- a/docs/specification/xlang_serialization_spec.md +++ b/docs/specification/xlang_serialization_spec.md @@ -338,11 +338,11 @@ Meta string is mainly used to encode meta strings such as field names. 
String binary encoding algorithm: -| Algorithm | Pattern | Description | -|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------| -| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101` | -| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111` | -| UTF-8 | any chars | UTF-8 encoding | +| Algorithm | Pattern | Description | +|---------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| LOWER_SPECIAL | `a-z._$\|` | every char is written using 5 bits, `a-z`: `0b00000~0b11001`, `._$\|`: `0b11010~0b11101`, prepend one bit at the start to indicate whether to strip the last char, since the last byte may have up to 7 redundant bits (1 indicates strip last char) | +| LOWER_UPPER_DIGIT_SPECIAL | `a-zA-Z0~9._` | every char is written using 6 bits, `a-z`: `0b00000~0b11001`, `A-Z`: `0b11010~0b110011`, `0~9`: `0b110100~0b111101`, `._`: `0b111110~0b111111`, prepend one bit at the start to indicate whether to strip the last char, since the last byte may have up to 7 redundant bits (1 indicates strip last char) | +| UTF-8 | any chars | UTF-8 encoding | Encoding flags: diff --git a/java/fury-core/src/main/java/org/apache/fury/meta/MetaString.java b/java/fury-core/src/main/java/org/apache/fury/meta/MetaString.java index 57f86ef012..ec16404847 100644 --- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaString.java +++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaString.java @@ -21,6 +21,7 @@ import 
java.util.Arrays; import java.util.Objects; +import org.apache.fury.util.Preconditions; /** * Represents a string with metadata that describes its encoding. It supports different encodings @@ -61,31 +62,27 @@ public static Encoding fromInt(int value) { private final char specialChar1; private final char specialChar2; private final byte[] bytes; - private final int numChars; - private final int numBits; + private final boolean stripLastChar; /** * Constructs a MetaString with the specified encoding and data. * * @param encoding The type of encoding used for the string data. * @param bytes The encoded string data as a byte array. - * @param numBits The number of bits used for encoding. */ public MetaString( - String string, - Encoding encoding, - char specialChar1, - char specialChar2, - byte[] bytes, - int numChars, - int numBits) { + String string, Encoding encoding, char specialChar1, char specialChar2, byte[] bytes) { this.string = string; this.encoding = encoding; this.specialChar1 = specialChar1; this.specialChar2 = specialChar2; this.bytes = bytes; - this.numChars = numChars; - this.numBits = numBits; + if (encoding != Encoding.UTF_8) { + Preconditions.checkArgument(bytes.length > 0); + this.stripLastChar = (bytes[0] & 0x80) != 0; // flag is the MSB, matching encoder/decoder + } else { + this.stripLastChar = false; + } } public String getString() { @@ -108,12 +105,8 @@ public byte[] getBytes() { return bytes; } - public int getNumChars() { - return numChars; - } - - public int getNumBits() { - return numBits; + public boolean stripLastChar() { + return stripLastChar; } @Override @@ -127,15 +120,14 @@ public boolean equals(Object o) { MetaString that = (MetaString) o; return specialChar1 == that.specialChar1 && specialChar2 == that.specialChar2 - && numChars == that.numChars - && numBits == that.numBits + && stripLastChar == that.stripLastChar && encoding == that.encoding && Arrays.equals(bytes, that.bytes); } @Override public int hashCode() { - int result = Objects.hash(encoding, specialChar1, 
specialChar2, numChars, numBits); + int result = Objects.hash(encoding, specialChar1, specialChar2, stripLastChar); result = 31 * result + Arrays.hashCode(bytes); return result; } @@ -153,10 +145,8 @@ public String toString() { + specialChar2 + ", bytes=" + Arrays.toString(bytes) - + ", numChars=" - + numChars - + ", numBits=" - + numBits + + ", stripLastChar=" + + stripLastChar + '}'; } } diff --git a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringDecoder.java b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringDecoder.java index 587cf758a4..3883525739 100644 --- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringDecoder.java +++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringDecoder.java @@ -45,19 +45,18 @@ public MetaStringDecoder(char specialChar1, char specialChar2) { * * @param encodedData encoded data using passed encoding. * @param encoding encoding the passed data. - * @param numBits total bits for encoded data. * @return Decoded string. */ - public String decode(byte[] encodedData, Encoding encoding, int numBits) { + public String decode(byte[] encodedData, Encoding encoding) { switch (encoding) { case LOWER_SPECIAL: - return decodeLowerSpecial(encodedData, numBits); + return decodeLowerSpecial(encodedData); case LOWER_UPPER_DIGIT_SPECIAL: - return decodeLowerUpperDigitSpecial(encodedData, numBits); + return decodeLowerUpperDigitSpecial(encodedData); case FIRST_TO_LOWER_SPECIAL: - return decodeRepFirstLowerSpecial(encodedData, numBits); + return decodeRepFirstLowerSpecial(encodedData); case ALL_TO_LOWER_SPECIAL: - return decodeRepAllToLowerSpecial(encodedData, numBits); + return decodeRepAllToLowerSpecial(encodedData); case UTF_8: return new String(encodedData, StandardCharsets.UTF_8); default: @@ -66,30 +65,36 @@ public String decode(byte[] encodedData, Encoding encoding, int numBits) { } /** Decoding method for {@link Encoding#LOWER_SPECIAL}. 
*/ - private String decodeLowerSpecial(byte[] data, int numBits) { + private String decodeLowerSpecial(byte[] data) { StringBuilder decoded = new StringBuilder(); - int bitIndex = 0; - int bitMask = 0b11111; // 5 bits for mask - while (bitIndex + 5 <= numBits) { + int totalBits = data.length * 8; // Total number of bits in the data + boolean stripLastChar = (data[0] & 0x80) != 0; // Check the first bit of the first byte + int bitMask = 0b11111; // 5 bits for the mask + int bitIndex = 1; // Start from the second bit + while (bitIndex + 5 <= totalBits) { int byteIndex = bitIndex / 8; int intraByteIndex = bitIndex % 8; // Extract the 5-bit character value across byte boundaries if needed int charValue = ((data[byteIndex] & 0xFF) << 8) | (byteIndex + 1 < data.length ? (data[byteIndex + 1] & 0xFF) : 0); - charValue = ((byte) ((charValue >> (11 - intraByteIndex)) & bitMask)); + charValue = (byte) ((charValue >> (11 - intraByteIndex)) & bitMask); bitIndex += 5; decoded.append(decodeLowerSpecialChar(charValue)); } - + if (stripLastChar) { + decoded.deleteCharAt(decoded.length() - 1); + } return decoded.toString(); } /** Decoding method for {@link Encoding#LOWER_UPPER_DIGIT_SPECIAL}. 
*/ - private String decodeLowerUpperDigitSpecial(byte[] data, int numBits) { + private String decodeLowerUpperDigitSpecial(byte[] data) { StringBuilder decoded = new StringBuilder(); - int bitIndex = 0; + int bitIndex = 1; + boolean stripLastChar = (data[0] & 0x80) != 0; // Check the first bit of the first byte int bitMask = 0b111111; // 6 bits for mask + int numBits = data.length * 8; while (bitIndex + 6 <= numBits) { int byteIndex = bitIndex / 8; int intraByteIndex = bitIndex % 8; @@ -102,6 +107,9 @@ private String decodeLowerUpperDigitSpecial(byte[] data, int numBits) { bitIndex += 6; decoded.append(decodeLowerUpperDigitSpecialChar(charValue)); } + if (stripLastChar) { + decoded.deleteCharAt(decoded.length() - 1); + } return decoded.toString(); } @@ -140,13 +148,13 @@ private char decodeLowerUpperDigitSpecialChar(int charValue) { } } - private String decodeRepFirstLowerSpecial(byte[] data, int numBits) { - String str = decodeLowerSpecial(data, numBits); + private String decodeRepFirstLowerSpecial(byte[] data) { + String str = decodeLowerSpecial(data); return StringUtils.capitalize(str); } - private String decodeRepAllToLowerSpecial(byte[] data, int numBits) { - String str = decodeLowerSpecial(data, numBits); + private String decodeRepAllToLowerSpecial(byte[] data) { + String str = decodeLowerSpecial(data); StringBuilder builder = new StringBuilder(); char[] chars = str.toCharArray(); for (int i = 0; i < chars.length; i++) { diff --git a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java index 5680a49881..fe6796a75a 100644 --- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java +++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java @@ -48,8 +48,7 @@ public MetaStringEncoder(char specialChar1, char specialChar2) { */ public MetaString encode(String input) { if (input.isEmpty()) { - return new MetaString( - input, 
Encoding.LOWER_SPECIAL, specialChar1, specialChar2, new byte[0], 0, 0); + return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]); } Encoding encoding = computeEncoding(input); return encode(input, encoding); @@ -66,53 +65,27 @@ public MetaString encode(String input, Encoding encoding) { Preconditions.checkArgument( input.length() < Short.MAX_VALUE, "Long meta string than 32767 is not allowed"); if (input.isEmpty()) { - return new MetaString( - input, Encoding.LOWER_SPECIAL, specialChar1, specialChar2, new byte[0], 0, 0); + return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]); } - int length = input.length(); + byte[] bytes; switch (encoding) { case LOWER_SPECIAL: - return new MetaString( - input, - encoding, - specialChar1, - specialChar2, - encodeLowerSpecial(input), - length, - length * 5); + bytes = encodeLowerSpecial(input); + return new MetaString(input, encoding, specialChar1, specialChar2, bytes); case LOWER_UPPER_DIGIT_SPECIAL: - return new MetaString( - input, - encoding, - specialChar1, - specialChar2, - encodeLowerUpperDigitSpecial(input), - length, - length * 6); + bytes = encodeLowerUpperDigitSpecial(input); + return new MetaString(input, encoding, specialChar1, specialChar2, bytes); case FIRST_TO_LOWER_SPECIAL: - return new MetaString( - input, - encoding, - specialChar1, - specialChar2, - encodeFirstToLowerSpecial(input), - length, - length * 5); + bytes = encodeFirstToLowerSpecial(input); + return new MetaString(input, encoding, specialChar1, specialChar2, bytes); case ALL_TO_LOWER_SPECIAL: char[] chars = input.toCharArray(); int upperCount = countUppers(chars); - return new MetaString( - input, - encoding, - specialChar1, - specialChar2, - encodeAllToLowerSpecial(chars, upperCount), - length, - (upperCount + length) * 5); + bytes = encodeAllToLowerSpecial(chars, upperCount); + return new MetaString(input, encoding, specialChar1, specialChar2, bytes); default: - byte[] bytes = 
input.getBytes(StandardCharsets.UTF_8); - return new MetaString( - input, Encoding.UTF_8, specialChar1, specialChar2, bytes, bytes.length * 8, 0); + bytes = input.getBytes(StandardCharsets.UTF_8); + return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, bytes); } } @@ -238,10 +211,10 @@ private byte[] encodeGeneric(String input, int bitsPerChar) { } private byte[] encodeGeneric(char[] chars, int bitsPerChar) { - int totalBits = chars.length * bitsPerChar; + int totalBits = chars.length * bitsPerChar + 1; int byteLength = (totalBits + 7) / 8; // Calculate number of needed bytes byte[] bytes = new byte[byteLength]; - int currentBit = 0; + int currentBit = 1; for (char c : chars) { int value = (bitsPerChar == 5) ? charToValueLowerSpecial(c) : charToValueLowerUpperDigitSpecial(c); @@ -256,7 +229,10 @@ private byte[] encodeGeneric(char[] chars, int bitsPerChar) { currentBit++; } } - + boolean stripLastChar = bytes.length * 8 >= totalBits + bitsPerChar; + if (stripLastChar) { + bytes[0] = (byte) (bytes[0] | 0x80); + } return bytes; } diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java index fec847d605..f58f406dd1 100644 --- a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringBytes.java @@ -27,7 +27,6 @@ @Internal final class MetaStringBytes { - static final int STRIP_LAST_CHAR = 0b1000; static final short DEFAULT_DYNAMIC_WRITE_STRING_ID = -1; final byte[] bytes; @@ -57,12 +56,6 @@ public MetaStringBytes(MetaString metaString) { } hashCode &= 0xffffffffffffff00L; int header = metaString.getEncoding().getValue(); - String decoded = - new MetaStringDecoder(metaString.getSpecialChar1(), metaString.getSpecialChar2()) - .decode(bytes, metaString.getEncoding(), bytes.length * 8); - if (decoded.length() > metaString.getString().length()) { - header |= 
STRIP_LAST_CHAR; - } this.hashCode = hashCode | header; } @@ -70,12 +63,7 @@ public String decode(char specialChar1, char specialChar2) { int header = (int) (hashCode & 0xff); int encodingFlags = header & 0b111; MetaString.Encoding encoding = MetaString.Encoding.values()[encodingFlags]; - String str = - new MetaStringDecoder(specialChar1, specialChar2).decode(bytes, encoding, bytes.length * 8); - if ((header & STRIP_LAST_CHAR) != 0) { - str = str.substring(0, str.length() - 1); - } - return str; + return new MetaStringDecoder(specialChar1, specialChar2).decode(bytes, encoding); } @Override diff --git a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java index 658ddbe1a4..bf8665d3a4 100644 --- a/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java +++ b/java/fury-core/src/main/java/org/apache/fury/resolver/MetaStringResolver.java @@ -31,8 +31,6 @@ * share common immutable datastructure globally across multiple fury. */ public final class MetaStringResolver { - public static final byte USE_STRING_VALUE = 0; - public static final byte USE_STRING_ID = 1; private static final int initialCapacity = 8; // use a lower load factor to minimize hash collision private static final float furyMapLoadFactor = 0.25f; diff --git a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java index 312cd8e52a..4fa84d75ae 100644 --- a/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/meta/MetaStringTest.java @@ -40,7 +40,7 @@ public void testEncodeMetaStringLowerSpecial() { // utf8 use 12 bytes, we use only 9 bytes. 
assertEquals(encoder.encode("MediaContent").getBytes().length, 9); MetaStringDecoder decoder = new MetaStringDecoder('_', '$'); - String decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_SPECIAL, 7 * 5); + String decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_SPECIAL); assertEquals(decoded, "abc_def"); for (int i = 0; i < 128; i++) { StringBuilder builder = new StringBuilder(); @@ -49,7 +49,7 @@ public void testEncodeMetaStringLowerSpecial() { } String str = builder.toString(); encoded = encoder.encodeLowerSpecial(str); - decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_SPECIAL, i * 5); + decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_SPECIAL); assertEquals(decoded, str); } } @@ -62,13 +62,13 @@ public void testEncodeMetaStringLowerUpperDigitSpecial() { byte[] encoded = encoder.encodeLowerUpperDigitSpecial("ExampleInput123"); assertEquals(encoded.length, 12); MetaStringDecoder decoder = new MetaStringDecoder(specialChar1, specialChar2); - String decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_UPPER_DIGIT_SPECIAL, 15 * 6); + String decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_UPPER_DIGIT_SPECIAL); assertEquals(decoded, "ExampleInput123"); for (int i = 1; i < 128; i++) { String str = createString(i, specialChar1, specialChar2); encoded = encoder.encodeLowerUpperDigitSpecial(str); - decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_UPPER_DIGIT_SPECIAL, i * 6); + decoded = decoder.decode(encoded, MetaString.Encoding.LOWER_UPPER_DIGIT_SPECIAL); assertEquals(decoded, str, "Failed at " + i); } } @@ -103,7 +103,7 @@ public static Object[][] specialChars() { @Test(dataProvider = "specialChars") public void testMetaString(char specialChar1, char specialChar2) { MetaStringEncoder encoder = new MetaStringEncoder(specialChar1, specialChar2); - for (int i = 0; i < 128; i++) { + for (int i = 1; i < 128; i++) { try { String str = createString(i, specialChar1, specialChar2); MetaString metaString = 
encoder.encode(str); @@ -112,9 +112,7 @@ public void testMetaString(char specialChar1, char specialChar2) { assertEquals(metaString.getSpecialChar1(), specialChar1); assertEquals(metaString.getSpecialChar2(), specialChar2); MetaStringDecoder decoder = new MetaStringDecoder(specialChar1, specialChar2); - String newStr = - decoder.decode( - metaString.getBytes(), metaString.getEncoding(), metaString.getNumBits()); + String newStr = decoder.decode(metaString.getBytes(), metaString.getEncoding()); assertEquals(newStr, str); } catch (Throwable e) { throw new RuntimeException("Failed at " + i, e); @@ -139,8 +137,7 @@ public void testEncodeEmptyString(MetaString.Encoding encoding) { MetaString metaString = encoder.encode("", encoding); assertEquals(metaString.getBytes().length, 0); MetaStringDecoder decoder = new MetaStringDecoder('_', '$'); - String decoded = - decoder.decode(metaString.getBytes(), metaString.getEncoding(), metaString.getNumBits()); + String decoded = decoder.decode(metaString.getBytes(), metaString.getEncoding()); assertEquals(decoded, ""); } @@ -162,10 +159,7 @@ public void testAllToUpperSpecialEncoding() { MetaStringDecoder decoder = new MetaStringDecoder('_', '$'); String decodedString = - decoder.decode( - encodedMetaString.getBytes(), - encodedMetaString.getEncoding(), - encodedMetaString.getNumBits()); + decoder.decode(encodedMetaString.getBytes(), encodedMetaString.getEncoding()); assertEquals(decodedString, testString); } @@ -178,10 +172,7 @@ public void testFirstToLowerSpecialEncoding() { MetaStringDecoder decoder = new MetaStringDecoder('_', '$'); String decodedString = - decoder.decode( - encodedMetaString.getBytes(), - encodedMetaString.getEncoding(), - encodedMetaString.getNumBits()); + decoder.decode(encodedMetaString.getBytes(), encodedMetaString.getEncoding()); assertEquals(decodedString, testString); } @@ -194,10 +185,7 @@ public void testUtf8Encoding() { MetaStringDecoder decoder = new MetaStringDecoder('_', '$'); String decodedString 
= - decoder.decode( - encodedMetaString.getBytes(), - encodedMetaString.getEncoding(), - encodedMetaString.getNumBits()); + decoder.decode(encodedMetaString.getBytes(), encodedMetaString.getEncoding()); assertEquals(decodedString, testString); } }