diff --git a/src/java/org/apache/cassandra/db/marshal/IntegerType.java b/src/java/org/apache/cassandra/db/marshal/IntegerType.java index 30a4fbea503d..378e040bf12d 100644 --- a/src/java/org/apache/cassandra/db/marshal/IntegerType.java +++ b/src/java/org/apache/cassandra/db/marshal/IntegerType.java @@ -32,6 +32,7 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.bytecomparable.ByteComparable; import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public final class IntegerType extends NumberType { @@ -143,38 +144,45 @@ public static int compareIntegers(VL lhs, ValueAccessor accessorL, /** * Constructs a byte-comparable representation of the number. - * We represent it as - * - * where a length_byte is: - * - 0x80 + (length - 1) for positive numbers (so that longer length sorts bigger) - * - 0x7F - (length - 1) for negative numbers (so that longer length sorts smaller) + * In the current format we represent it: + * directly as varint, if the length is 6 or smaller (the encoding has non-00/FF first byte) + * <7 or more bytes>, otherwise + * where is 00 for negative numbers and FF for positive ones, and the length's bytes are inverted if + * the number is negative (so that longer length sorts smaller). * - * Because we include the sign in the length byte: - * - unlike fixed-length ints, we don't need to sign-invert the first significant byte, - * - unlike BigInteger, we don't need to include 0x00 prefix for positive integers whose first byte is >= 0x80 - * or 0xFF prefix for negative integers whose first byte is < 0x80. + * Because we present the sign separately, we don't need to include 0x00 prefix for positive integers whose first + * byte is >= 0x80 or 0xFF prefix for negative integers whose first byte is < 0x80. * * The representations are prefix-free, because representations of different length always have length bytes that * differ. 
* * Examples: - * 0 as 8000 - * 1 as 8001 - * 127 as 807F + * -1 as 7F + * 0 as 80 + * 1 as 81 + * 127 as C07F - * 255 as 80FF + * 255 as C0FF - * 2^32-1 as 837FFFFFFF - * 2^32 as 8380000000 - * 2^33 as 840100000000 + * 2^32-1 as F8FFFFFFFF + * 2^32 as F900000000 + * 2^47-1 as fe7fffffffffff + * 2^47 as ff06800000000000 + * + * See asComparableBytesLegacy for description of the legacy format. */ - @Override public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) { - int p = 0; final int limit = accessor.size(data); - if (p == limit) - return null; + if (limit <= 6) + { + if (limit == 0) + return null; + if (version != ByteComparable.Version.LEGACY) + return encodeAsVarInt(accessor, data, limit); + } + // skip any leading sign-only byte(s) + int p = 0; final byte signbyte = accessor.getByte(data, p); if (signbyte == BIG_INTEGER_NEGATIVE_LEADING_ZERO || signbyte == BIG_INTEGER_POSITIVE_LEADING_ZERO) { @@ -185,8 +193,128 @@ public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteC } } - final int startpos = p; + if (version != ByteComparable.Version.LEGACY) + return asComparableBytesCurrent(accessor, data, p, limit, (signbyte >> 7) & 0xFF); + else + return asComparableBytesLegacy(accessor, data, p, limit, signbyte); + } + + /** + * Encode the BigInteger stored in the given buffer as a variable-length signed integer. + * The length of the number is given in the limit argument, and must be <= 6. 
+ */ + private ByteSource encodeAsVarInt(ValueAccessor accessor, V data, int limit) + { + long v; + switch (limit) + { + case 1: + v = accessor.getByte(data, 0); + break; + case 2: + v = accessor.getShort(data, 0); + break; + case 3: + v = (accessor.getShort(data, 0) << 8) | (accessor.getByte(data, 2) & 0xFF); + break; + case 4: + v = accessor.getInt(data, 0); + break; + case 5: + v = ((long) accessor.getInt(data, 0) << 8) | (accessor.getByte(data, 4) & 0xFF); + break; + case 6: + v = ((long) accessor.getInt(data, 0) << 16) | (accessor.getShort(data, 4) & 0xFFFF); + break; + default: + throw new AssertionError(); + } + return ByteSource.variableLengthInteger(v); + } + /** + * Constructs a byte-comparable representation of the number. + * We represent it: + * directly as varint, if the length is 6 or smaller (the encoding has non-00/FF first byte) + * <signbyte><length as variable-length unsigned integer><7 or more bytes>, otherwise + * where <signbyte> is 00 for negative numbers and FF for positive ones, and the length's bytes are inverted if + * the number is negative (so that longer length sorts smaller). + * + * Because we present the sign separately, we don't need to include 0x00 prefix for positive integers whose first + * byte is >= 0x80 or 0xFF prefix for negative integers whose first byte is < 0x80. + * + * The representations are prefix-free, because representations of different length always have length bytes that + * differ. 
+ * * Examples: + * -1 as 7F + * 0 as 80 + * 1 as 81 + * 127 as C07F + * 255 as C0FF + * 2^32-1 as F8FFFFFFFF + * 2^32 as F900000000 + * 2^47-1 as FE7FFFFFFFFFFF + * 2^47 as FF06800000000000 + */ + private ByteSource asComparableBytesCurrent(ValueAccessor accessor, V data, int startpos, int limit, int signbyte) + { + // start with sign as a byte, then variable-length-encoded length, then bytes (stripped leading sign) + return new ByteSource() + { + int pos = -2; + ByteSource lengthEncoding = new VariableLengthUnsignedInteger(limit - startpos); + + public int next() + { + if (pos == -2) + { + ++pos; + return signbyte ^ 0xFF; // 00 for negative/FF for positive (01-FE for direct varint encoding) + } + else if (pos == -1) + { + int nextByte = lengthEncoding.next(); + if (nextByte != END_OF_STREAM) + return nextByte ^ signbyte; + pos = startpos; + } + + if (pos == limit) + return END_OF_STREAM; + + return accessor.getByte(data, pos++) & 0xFF; + } + }; + } + + /** + * Constructs a byte-comparable representation of the number in the legacy format. + * We represent it as + * + * where a length_byte is: + * - 0x80 + (length - 1) for positive numbers (so that longer length sorts bigger) + * - 0x7F - (length - 1) for negative numbers (so that longer length sorts smaller) + * + * Because we include the sign in the length byte: + * - unlike fixed-length ints, we don't need to sign-invert the first significant byte, + * - unlike BigInteger, we don't need to include 0x00 prefix for positive integers whose first byte is >= 0x80 + * or 0xFF prefix for negative integers whose first byte is < 0x80. + * + * The representations are prefix-free, because representations of different length always have length bytes that + * differ. 
+ * + * Examples: + * 0 as 8000 + * 1 as 8001 + * 127 as 807F + * 255 as 80FF + * 2^32-1 as 837FFFFFFF + * 2^32 as 8380000000 + * 2^33 as 840100000000 + */ + private ByteSource asComparableBytesLegacy(ValueAccessor accessor, V data, int startpos, int limit, int signbyte) + { return new ByteSource() { int pos = startpos; @@ -227,6 +355,92 @@ public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable if (comparableBytes == null) return accessor.empty(); + if (version == ByteComparable.Version.LEGACY) + return fromComparableBytesLegacy(accessor, comparableBytes); + else + return fromComparableBytesCurrent(accessor, comparableBytes); + } + + + private V fromComparableBytesCurrent(ValueAccessor accessor, ByteSource.Peekable comparableBytes) + { + // Consume the first byte to determine whether the encoded number is positive and + // start iterating through the length header bytes and collecting the number of value bytes. + int sign = comparableBytes.peek() ^ 0xFF; // FF if negative, 00 if positive + if (sign != 0xFF && sign != 0x00) + return extractVarIntBytes(accessor, ByteSourceInverse.getVariableLengthInteger(comparableBytes)); + + // consume the sign byte + comparableBytes.next(); + + // Read the length (inverted if the number is negative) + int valueBytes = Math.toIntExact(ByteSourceInverse.getVariableLengthUnsignedIntegerXoring(comparableBytes, sign)); + // Get the bytes. 
+ return extractBytes(accessor, comparableBytes, sign, valueBytes); + } + + private V extractVarIntBytes(ValueAccessor accessor, long value) + { + int length = (64 - Long.numberOfLeadingZeros(value ^ (value >> 63)) + 8) / 8; // number of bytes needed: 7 bits -> one byte, 8 bits -> 2 bytes + V buf = accessor.allocate(length); + switch (length) + { + case 1: + accessor.putByte(buf, 0, (byte) value); + break; + case 2: + accessor.putShort(buf, 0, (short) value); + break; + case 3: + accessor.putShort(buf, 0, (short) (value >> 8)); + accessor.putByte(buf, 2, (byte) value); + break; + case 4: + accessor.putInt(buf, 0, (int) value); + break; + case 5: + accessor.putInt(buf, 0, (int) (value >> 8)); + accessor.putByte(buf, 4, (byte) value); + break; + case 6: + accessor.putInt(buf, 0, (int) (value >> 16)); + accessor.putShort(buf, 4, (short) value); + break; + default: + throw new AssertionError(); + } + return buf; + } + + private V extractBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, int sign, int valueBytes) + { + int writtenBytes = 0; + V buf; + // Add "leading zero" if needed (i.e. in case the leading byte of a positive number corresponds to a negative + // value, or in case the leading byte of a negative number corresponds to a non-negative value). + // Size the array containing all the value bytes accordingly. + int curr = comparableBytes.next(); + if ((curr & 0x80) != (sign & 0x80)) + { + ++valueBytes; + buf = accessor.allocate(valueBytes); + accessor.putByte(buf, writtenBytes++, (byte) sign); + } + else + buf = accessor.allocate(valueBytes); + // Don't forget to add the first consumed value byte after determining whether leading zero should be added + // and sizing the value bytes array. + accessor.putByte(buf, writtenBytes++, (byte) curr); + + // Consume exactly the number of expected value bytes. 
+ while (writtenBytes < valueBytes) + accessor.putByte(buf, writtenBytes++, (byte) comparableBytes.next()); + + return buf; + } + + private V fromComparableBytesLegacy(ValueAccessor accessor, ByteSource.Peekable comparableBytes) + { int valueBytes; byte signedZero; // Consume the first byte to determine whether the encoded number is positive and @@ -253,29 +467,7 @@ public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable signedZero = -1; } - int writtenBytes = 0; - V buf; - // Add "leading zero" if needed (i.e. in case the leading byte of a positive number corresponds to a negative - // value, or in case the leading byte of a negative number corresponds to a non-negative value). - // Size the array containing all the value bytes accordingly. - curr = comparableBytes.next(); - if ((curr & 0x80) != (signedZero & 0x80)) - { - ++valueBytes; - buf = accessor.allocate(valueBytes); - accessor.putByte(buf, writtenBytes++, signedZero); - } - else - buf = accessor.allocate(valueBytes); - // Don't forget to add the first consumed value byte after determining whether leading zero should be added - // and sizing the value bytes array. - accessor.putByte(buf, writtenBytes++, (byte) curr); - - // Consume exactly the number of expected value bytes. 
- while (writtenBytes < valueBytes) - accessor.putByte(buf, writtenBytes++, (byte) comparableBytes.next()); - - return buf; + return extractBytes(accessor, comparableBytes, signedZero, valueBytes); } public ByteBuffer fromString(String source) throws MarshalException diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java index 9e0335733745..6bf5e9e66990 100644 --- a/src/java/org/apache/cassandra/db/marshal/LongType.java +++ b/src/java/org/apache/cassandra/db/marshal/LongType.java @@ -63,13 +63,23 @@ public static int compareLongs(VL left, ValueAccessor accessorL, VR @Override public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) { - return ByteSource.optionalSignedFixedLengthNumber(accessor, data); + if (accessor.isEmpty(data)) + return null; + if (version == ByteComparable.Version.LEGACY) + return ByteSource.signedFixedLengthNumber(accessor, data); + else + return ByteSource.variableLengthInteger(accessor.getLong(data, 0)); } @Override public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) { - return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 8); + if (comparableBytes == null) + return accessor.empty(); + if (version == ByteComparable.Version.LEGACY) + return ByteSourceInverse.getSignedFixedLength(accessor, comparableBytes, 8); + else + return accessor.valueOf(ByteSourceInverse.getVariableLengthInteger(comparableBytes)); } public ByteBuffer fromString(String source) throws MarshalException diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md index a6732a5fece2..281b9284927e 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md @@ -332,7 +332,7 @@ This is 
the trivial case, as we can simply use the input bytes in big-endian ord and fixed length values are trivially prefix free, i.e. (1) and (2) are satisfied, and thus (3) and (4) follow from the observation above. -## Fixed-length signed integers (byte, short, int, bigint) +## Fixed-length signed integers (byte, short, int, legacy bigint) As above, but we need to invert the sign bit of the number to put negative numbers before positives. This maps `MIN_VALUE` to `0x00`..., `-1` to `0x7F…`, `0` to `0x80…`, and `MAX_VALUE` to `0xFF…`; comparing the resulting number @@ -348,6 +348,46 @@ Examples: |int MAX_VALUE |7F FF FF FF| FF FF FF FF |long MIN_VALUE|80 00 00 00 00 00 00 00| 00 00 00 00 00 00 00 00 +## Variable-length encoding of integers (current bigint) + +Another way to encode integers that may save significant amounts of space when smaller numbers are often in use, but +still permits large values to be efficiently encoded, is to use an encoding scheme similar to UTF-8. + +For unsigned numbers this can be done by starting the number with as many 1s in most significant bits as there are +additional bytes in the encoding, followed by a 0, and the bits of the number. Numbers between 0 and 127 are encoded +in one byte, and each additional byte adds 7 more bits. Values that use all 8 bytes do not need a 9th bit of 0 and can +thus fit 9 bytes. Because longer numbers have more 1s in their MSBs, they compare +higher than shorter ones (and we always use the shortest representation). Because the length is specified through these +initial bits, no value can be a prefix of another. 
+ +| Value | bytes |encodes as| +|-----------|-------------------------|----------| +| 0 | 00 | 00 +| 1 | 00 01 | 01 +| 127 | 00 00 00 7F | 7F +| 128 | 00 80 | 80 80 +| 2^31 - 1 | 7F FF FF FF | F0 7F FF FF FF +| 2^64 - 1 | FF FF FF FF FF FF FF FF | FF FF FF FF FF FF FF FF FF + + +To encode signed numbers, we must start with the sign bit, and must also ensure that longer negative numbers sort +smaller than shorter ones. The first bit of the encoding is the inverted sign (i.e. 1 for positive, 0 for negative), +followed by the length encoded as a sequence of bits that matches the inverted sign, followed by a bit that differs +(like above, not necessary for 9-byte encodings) and the bits of the number's two's complement. + +| Value | bytes |encodes as| +|-------------------|-------------------------|----------| +| 1 | 00 00 00 01 | 81 +| -1 | FF FF | 7F +| 0 | 00 | 80 +| 63 | 3F | BF +| -64 | C0 | 40 +| 64 | 40 | C0 40 +| -65 | BF | 3F BF +| Integer.MAX_VALUE | 7F FF FF FF | F8 7F FF FF FF +| Long.MIN_VALUE | 80 00 00 00 00 00 00 00 | 00 00 00 00 00 00 00 00 00 + + ## Fixed-size floating-point numbers (float, double) IEEE-754 was designed with byte-by-byte comparisons in mind, and provides an important guarantee about the bytes of a @@ -455,7 +495,7 @@ another, the latter has to have a `FE` or `FF` as the next byte, which ensures b makes it no longer a prefix of the latter) and (3) (adding `10`-`EF` to the former makes it smaller than the latter; in this case the original value of the former is a prefix of the original value of the latter). -## Variable-length integers (varint, RandomPartitioner token) +## Variable-length integers (varint, RandomPartitioner token), legacy encoding If integers of unbounded length are guaranteed to start with a non-zero digit, to compare them we can first use a signed length, as numbers with longer representations have higher magnitudes. 
Only if the lengths match we need to compare the @@ -496,6 +536,41 @@ Examples: (Middle dot · shows the transition point between length and digits.) +## Variable-length integers, current encoding + +Because variable-length integers are also often used to store smaller range integers, it makes sense to also apply +the variable-length integer encoding. Thus, the current varint scheme chooses to: +- map numbers directly to their variable-length integer encoding, if they have 6 bytes or fewer +- otherwise, encode as: + - a sign byte (00 for negative numbers, FF for positive, distinct from the leading byte of the variable-length + encoding above) + - a variable-length encoded number of bytes, inverted for negative numbers (so that greater length compares smaller) + - the bytes of the number, two's complement encoded. +We never use a longer encoding (e.g. using the second method if variable-length suffices or with added 00 leading +bytes) if a shorter one suffices. + +By the same reasoning as above, and the fact that the sign byte cannot be confused with a variable-length encoding +first byte, no value can be a prefix of another. As the sign byte compares smaller for negative numbers (respectively +bigger for positive ones) than any variable-length encoded integer, the comparison order is maintained when one number +uses variable-length encoding, and the other doesn't. Longer numbers compare smaller when negative (because of the +inverted length bytes), and bigger when positive. + +Examples: + +|value|bytes|encodes as| +|---:|---|---| +|0 | 00 | 80 +|1 | 01 | 81 +|-1 | FF | 7F +|255 | 00 FF | C0 FF +|-256 | FF 00 | 3F 00 +|2^16 | 01 00 00 | E1 00 00 +|-2^32 | FF 00 00 00 00 | 07 00 00 00 00 +|2^1024 | 01 00(128 times)| FF·80 81·01 00(128 times) +|-2^2048| FF 00(256 times)| 00·7E FF·00(256 times) + +(Middle dot · shows the transition point between length and digits.) 
+ ## Variable-length floating-point decimals (decimal) Variable-length floats are more complicated, but we can treat them similarly to IEEE-754 floating point numbers, by diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java index 108ca5e12065..65304831249a 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java @@ -182,6 +182,16 @@ static ByteSource signedFixedLengthFloat(ValueAccessor accessor, V data) return new SignedFixedLengthFloat<>(accessor, data); } + /** + * Produce a source for a signed integer, stored using variable length encoding. + * The representation takes between 1 and 9 bytes, is prefix-free and compares + * correctly. + */ + static ByteSource variableLengthInteger(long value) + { + return new VariableLengthInteger(value); + } + /** * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming * prevMax < currMin. @@ -453,6 +463,113 @@ public int next() } } + /** + * Variable-length encoding for unsigned integers. + * The encoding is similar to UTF-8 encoding. + * Numbers between 0 and 127 are encoded in one byte, using 0 in the most significant bit. + * Larger values have 1s in as many of the most significant bits as the number of additional bytes + * in the representation, followed by a 0. This ensures that longer numbers compare larger than shorter + * ones. Since we never use a longer representation than necessary, this implies numbers compare correctly. + * As the number of bytes is specified in the bits of the first, no value is a prefix of another. 
+ */ + static class VariableLengthUnsignedInteger implements ByteSource + { + final long value; + int pos = -1; + + public VariableLengthUnsignedInteger(long value) + { + this.value = value; + } + + public int next() + { + if (pos == -1) + { + int bitsMinusOne = 63 - (Long.numberOfLeadingZeros(value | 1)); // 0 to 63 (the | 1 is to make sure 0 maps to 0 (1 bit)) + int bytesMinusOne = bitsMinusOne / 7; + int mask = -256 >> bytesMinusOne; // sequence of bytesMinusOne 1s in the most-significant bits + pos = bytesMinusOne * 8; + return (int) ((value >>> pos) | mask) & 0xFF; + } + pos -= 8; + if (pos < 0) + return END_OF_STREAM; + return (int) (value >>> pos) & 0xFF; + } + } + + /** + * Variable-length encoding for signed integers. + * The encoding is based on the unsigned encoding above, where the first bit stored is the inverted sign, + * followed by as many matching bits as there are additional bytes in the encoding, followed by the two's + * complement of the number. + * Because of the inverted sign bit, negative numbers compare smaller than positives, and because the length + * bits match the sign, longer positive numbers compare greater and longer negative ones compare smaller. + * + * Examples: + * 0 encodes as 80 + * 1 encodes as 81 + * -1 encodes as 7F + * 63 encodes as BF + * 64 encodes as C040 + * -64 encodes as 40 + * -65 encodes as 3FBF + * 2^20-1 encodes as EFFFFF + * 2^20 encodes as F0100000 + * -2^20 encodes as 100000 + * 2^63-1 encodes as FFFFFFFFFFFFFFFFFF + * -2^63 encodes as 000000000000000000 + * + * As the number of bytes is specified in bits 2-9, no value is a prefix of another. 
+ */ + static class VariableLengthInteger implements ByteSource + { + final long value; + int pos; + + public VariableLengthInteger(long value) + { + long negativeMask = value >> 63; // -1 for negative, 0 for positive + value ^= negativeMask; + + int bits = 64 - Long.numberOfLeadingZeros(value | 1); // 1 to 63 (can't be 64 because we flip negative numbers) + int bytes = bits / 7 + 1; // 0-6 bits 1 byte 7-13 2 bytes etc to 56-63 9 bytes + if (bytes >= 9) + { + value |= 0x8000000000000000L; // 8th bit, which doesn't fit the first byte + pos = negativeMask < 0 ? 256 : -1; // out of 0-64 range integer such that & 0xFF is 0x00 for negative and 0xFF for positive + } + else + { + long mask = (-0x100 >> bytes) & 0xFF; // one in sign bit and as many more as there are extra bytes + pos = bytes * 8; + value = value | (mask << (pos - 8)); + } + + value ^= negativeMask; + this.value = value; + } + + public int next() + { + if (pos <= 0 || pos > 64) + { + if (pos == 0) + return END_OF_STREAM; + else + { + // 8-byte value, returning first byte + int result = pos & 0xFF; // 0x00 for negative numbers, 0xFF for positive + pos = 64; + return result; + } + } + pos -= 8; + return (int) (value >>> pos) & 0xFF; + } + } + static class Number implements ByteSource { final long value; diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java index ff29cb10aa00..5df0a7923aaf 100644 --- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java @@ -267,6 +267,58 @@ public static byte[] getUnescapedBytes(ByteSource.Peekable byteSource) return byteSource == null ? null : readBytes(unescape(byteSource)); } + /** + * Decode a variable-length signed integer. 
+ */ + public static long getVariableLengthInteger(ByteSource byteSource) + { + int signAndMask = byteSource.next(); + + long sum = 0; + int bytes; + // For every bit after the sign that matches the sign, read one more byte. + for (bytes = 0; bytes < 7 && sameByteSign(signAndMask << (bytes + 1), signAndMask); ++bytes) + sum = (sum << 8) | byteSource.next(); + + // The eighth length bit is stored in the second byte. + if (bytes == 7 && sameByteSign((int) (sum >> 48), signAndMask)) + return ((sum << 8) | byteSource.next()) ^ LONG_SIGN_BIT; // 9-byte encoding, use bytes 2-9 with inverted sign + else + { + sum |= (((long) signAndMask) << bytes * 8); // add the rest of the bits + long signMask = -0x40L << bytes * 7; // mask of the bits that should be replaced by the sign + long sign = (byte) (signAndMask ^ 0x80) >> 7; // -1 if negative (0 leading bit), 0 otherwise + return sum & ~signMask | sign & signMask; + } + } + + /** + * Decode a variable-length unsigned integer, passing all bytes read through XOR with the given xorWith parameter. + * + * Used in BigInteger encoding to read number length, where negative numbers have their length negated + * (i.e. xorWith = 0xFF) to ensure correct ordering. + */ + public static long getVariableLengthUnsignedIntegerXoring(ByteSource byteSource, int xorWith) + { + int signAndMask = byteSource.next() ^ xorWith; + + long sum = 0; + int bytes; + // Read an extra byte while the next most significant bit is 1. + for (bytes = 0; bytes <= 7 && ((signAndMask << bytes) & 0x80) != 0; ++bytes) + sum = (sum << 8) | byteSource.next() ^ xorWith; + + // Strip the length bits from the leading byte. + signAndMask &= ~(-256 >> bytes); + return sum | (((long) signAndMask) << bytes * 8); // Add the rest of the bits of the leading byte. + } + + /** Returns true if the two parameters treated as bytes have the same sign. 
*/ + private static boolean sameByteSign(int a, int b) + { + return ((a ^ b) & 0x80) == 0; + } + /** * As above, but converts the result to a ByteSource. */ diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java index abbd9bb4029d..8e3843c416ab 100644 --- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java @@ -1009,7 +1009,17 @@ void assertComparesSame(AbstractType type, Object v1, Object v2) assertEquals(String.format("Failed comparing reversed %s(%s, %s) and %s(%s, %s) direct (%d) and as clustering", safeStr(v1), ByteBufferUtil.bytesToHex(b1), c1, safeStr(v2), ByteBufferUtil.bytesToHex(b2), c2, actual), expected, actualcc); } else - assertEquals(String.format("Failed comparing %s(%s) and %s(%s)", safeStr(v1), ByteBufferUtil.bytesToHex(b1), safeStr(v2), ByteBufferUtil.bytesToHex(b2)), expected, actual); + if (expected != actual) + assertEquals(String.format("Failed comparing %s(%s BC %s) and %s(%s BC %s) version %s", + safeStr(v1), + ByteBufferUtil.bytesToHex(b1), + bc1.byteComparableAsString(version), + safeStr(v2), + ByteBufferUtil.bytesToHex(b2), + bc2.byteComparableAsString(version), + version), + expected, + actual); } maybeCheck41Properties(expected, bc1, bc2, version); } diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java index 656901022fa2..8624f3300fd0 100644 --- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java @@ -599,9 +599,8 @@ public void testType(AbstractType type, Object[] values) safeStr(type.getSerializer().toCQLLiteral(b)), safeStr(ByteBufferUtil.bytesToHex(b)), 
typeToComparable(type, b).byteComparableAsString(VERSION)); - } - for (Object i : values) assertConvertsSame(type, i); + } if (!type.isReversed()) testType(ReversedType.getInstance(type), values); } diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java index 7f04a3ab8499..fea0e98948ff 100644 --- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java @@ -53,9 +53,13 @@ public class ByteSourceTestBase -255, -128, -127, + -64, + -63, -1, 0, 1, + 63, + 64, 127, 128, 255, @@ -69,6 +73,9 @@ public class ByteSourceTestBase -255, -128, -127, + -65, + -64, + -63, -1, 0, 1, @@ -86,16 +93,268 @@ public class ByteSourceTestBase -255L, -128L, -127L, + -65L, + -64L, + -63L, -1L, 0L, 1L, - 127L, - 128L, - 255L, - 256L, Integer.MAX_VALUE + 1L, Long.MAX_VALUE - 1, - Long.MAX_VALUE }; + Long.MAX_VALUE, + + (1L << 1) - 1, + (1L << 1), + (1L << 2) - 1, + (1L << 2), + (1L << 3) - 1, + (1L << 3), + (1L << 4) - 1, + (1L << 4), + (1L << 5) - 1, + (1L << 5), + (1L << 6) - 1, + (1L << 6), + (1L << 7) - 1, + (1L << 7), + (1L << 8) - 1, + (1L << 8), + (1L << 9) - 1, + (1L << 9), + (1L << 10) - 1, + (1L << 10), + (1L << 11) - 1, + (1L << 11), + (1L << 12) - 1, + (1L << 12), + (1L << 13) - 1, + (1L << 13), + (1L << 14) - 1, + (1L << 14), + (1L << 15) - 1, + (1L << 15), + (1L << 16) - 1, + (1L << 16), + (1L << 17) - 1, + (1L << 17), + (1L << 18) - 1, + (1L << 18), + (1L << 19) - 1, + (1L << 19), + (1L << 20) - 1, + (1L << 20), + (1L << 21) - 1, + (1L << 21), + (1L << 22) - 1, + (1L << 22), + (1L << 23) - 1, + (1L << 23), + (1L << 24) - 1, + (1L << 24), + (1L << 25) - 1, + (1L << 25), + (1L << 26) - 1, + (1L << 26), + (1L << 27) - 1, + (1L << 27), + (1L << 28) - 1, + (1L << 28), + (1L << 29) - 1, + (1L << 29), + (1L << 30) - 1, + (1L << 30), + (1L << 31) - 1, + (1L << 31), + (1L << 
32) - 1, + (1L << 32), + (1L << 33) - 1, + (1L << 33), + (1L << 34) - 1, + (1L << 34), + (1L << 35) - 1, + (1L << 35), + (1L << 36) - 1, + (1L << 36), + (1L << 37) - 1, + (1L << 37), + (1L << 38) - 1, + (1L << 38), + (1L << 39) - 1, + (1L << 39), + (1L << 40) - 1, + (1L << 40), + (1L << 41) - 1, + (1L << 41), + (1L << 42) - 1, + (1L << 42), + (1L << 43) - 1, + (1L << 43), + (1L << 44) - 1, + (1L << 44), + (1L << 45) - 1, + (1L << 45), + (1L << 46) - 1, + (1L << 46), + (1L << 47) - 1, + (1L << 47), + (1L << 48) - 1, + (1L << 48), + (1L << 49) - 1, + (1L << 49), + (1L << 50) - 1, + (1L << 50), + (1L << 51) - 1, + (1L << 51), + (1L << 52) - 1, + (1L << 52), + (1L << 53) - 1, + (1L << 53), + (1L << 54) - 1, + (1L << 54), + (1L << 55) - 1, + (1L << 55), + (1L << 56) - 1, + (1L << 56), + (1L << 57) - 1, + (1L << 57), + (1L << 58) - 1, + (1L << 58), + (1L << 59) - 1, + (1L << 59), + (1L << 60) - 1, + (1L << 60), + (1L << 61) - 1, + (1L << 61), + (1L << 62) - 1, + (1L << 62), + (1L << 63) - 1, + + ~((1L << 1) - 1), + ~((1L << 1)), + ~((1L << 2) - 1), + ~((1L << 2)), + ~((1L << 3) - 1), + ~((1L << 3)), + ~((1L << 4) - 1), + ~((1L << 4)), + ~((1L << 5) - 1), + ~((1L << 5)), + ~((1L << 6) - 1), + ~((1L << 6)), + ~((1L << 7) - 1), + ~((1L << 7)), + ~((1L << 8) - 1), + ~((1L << 8)), + ~((1L << 9) - 1), + ~((1L << 9)), + ~((1L << 10) - 1), + ~((1L << 10)), + ~((1L << 11) - 1), + ~((1L << 11)), + ~((1L << 12) - 1), + ~((1L << 12)), + ~((1L << 13) - 1), + ~((1L << 13)), + ~((1L << 14) - 1), + ~((1L << 14)), + ~((1L << 15) - 1), + ~((1L << 15)), + ~((1L << 16) - 1), + ~((1L << 16)), + ~((1L << 17) - 1), + ~((1L << 17)), + ~((1L << 18) - 1), + ~((1L << 18)), + ~((1L << 19) - 1), + ~((1L << 19)), + ~((1L << 20) - 1), + ~((1L << 20)), + ~((1L << 21) - 1), + ~((1L << 21)), + ~((1L << 22) - 1), + ~((1L << 22)), + ~((1L << 23) - 1), + ~((1L << 23)), + ~((1L << 24) - 1), + ~((1L << 24)), + ~((1L << 25) - 1), + ~((1L << 25)), + ~((1L << 26) - 1), + ~((1L << 26)), + ~((1L << 27) - 1), + 
~((1L << 27)), + ~((1L << 28) - 1), + ~((1L << 28)), + ~((1L << 29) - 1), + ~((1L << 29)), + ~((1L << 30) - 1), + ~((1L << 30)), + ~((1L << 31) - 1), + ~((1L << 31)), + ~((1L << 32) - 1), + ~((1L << 32)), + ~((1L << 33) - 1), + ~((1L << 33)), + ~((1L << 34) - 1), + ~((1L << 34)), + ~((1L << 35) - 1), + ~((1L << 35)), + ~((1L << 36) - 1), + ~((1L << 36)), + ~((1L << 37) - 1), + ~((1L << 37)), + ~((1L << 38) - 1), + ~((1L << 38)), + ~((1L << 39) - 1), + ~((1L << 39)), + ~((1L << 40) - 1), + ~((1L << 40)), + ~((1L << 41) - 1), + ~((1L << 41)), + ~((1L << 42) - 1), + ~((1L << 42)), + ~((1L << 43) - 1), + ~((1L << 43)), + ~((1L << 44) - 1), + ~((1L << 44)), + ~((1L << 45) - 1), + ~((1L << 45)), + ~((1L << 46) - 1), + ~((1L << 46)), + ~((1L << 47) - 1), + ~((1L << 47)), + ~((1L << 48) - 1), + ~((1L << 48)), + ~((1L << 49) - 1), + ~((1L << 49)), + ~((1L << 50) - 1), + ~((1L << 50)), + ~((1L << 51) - 1), + ~((1L << 51)), + ~((1L << 52) - 1), + ~((1L << 52)), + ~((1L << 53) - 1), + ~((1L << 53)), + ~((1L << 54) - 1), + ~((1L << 54)), + ~((1L << 55) - 1), + ~((1L << 55)), + ~((1L << 56) - 1), + ~((1L << 56)), + ~((1L << 57) - 1), + ~((1L << 57)), + ~((1L << 58) - 1), + ~((1L << 58)), + ~((1L << 59) - 1), + ~((1L << 59)), + ~((1L << 60) - 1), + ~((1L << 60)), + ~((1L << 61) - 1), + ~((1L << 61)), + ~((1L << 62) - 1), + ~((1L << 62)), + ~((1L << 63) - 1), + }; Double[] testDoubles = new Double[]{ null, Double.NEGATIVE_INFINITY, -Double.MAX_VALUE,