From 17d54c6bb898e27cbb386c214921ad4feaf106f8 Mon Sep 17 00:00:00 2001 From: Matthew Brandyberry Date: Fri, 31 Jul 2015 17:27:55 -0500 Subject: [PATCH 1/2] [SPARK-9483] Fix UTF8String.getPrefix for big-endian. Previous code assumed little-endian. --- .../org/apache/spark/unsafe/types/UTF8String.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c38953f65d7d7..5285dbadaefd6 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -20,6 +20,7 @@ import javax.annotation.Nonnull; import java.io.Serializable; import java.io.UnsupportedEncodingException; +import java.nio.ByteOrder; import java.util.Arrays; import org.apache.spark.unsafe.PlatformDependent; @@ -161,18 +162,25 @@ public long getPrefix() { // If size is greater than 4, assume we have at least 8 bytes of data to fetch. // After getting the data, we use a mask to mask out data that is not part of the string. long p; + long mask = 0; if (numBytes >= 8) { p = PlatformDependent.UNSAFE.getLong(base, offset); } else if (numBytes > 4) { + mask = (1L << (8 - numBytes) * 8) - 1; p = PlatformDependent.UNSAFE.getLong(base, offset); - p = p & ((1L << numBytes * 8) - 1); } else if (numBytes > 0) { + mask = (1L << (8 - numBytes) * 8) - 1; p = (long) PlatformDependent.UNSAFE.getInt(base, offset); - p = p & ((1L << numBytes * 8) - 1); + if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { + p <<= 32; + } } else { p = 0; } - p = java.lang.Long.reverseBytes(p); + if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) { + p = java.lang.Long.reverseBytes(p); + } + p &= ~mask; return p; } From ec31df8ce5b32b747224212bef15e3f85d96a572 Mon Sep 17 00:00:00 2001 From: Matthew Brandyberry Date: Mon, 3 Aug 2015 16:19:15 -0500 Subject: [PATCH 2/2] [SPARK-9483] Changes from review comments. - Introduce static byteOrder variable - Branch on byteOrder at top-level for readability. --- .../apache/spark/unsafe/types/UTF8String.java | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 5285dbadaefd6..bd6b60159967c 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -51,6 +51,8 @@ public final class UTF8String implements Comparable, Serializable { 5, 5, 5, 5, 6, 6}; + private static ByteOrder byteOrder = ByteOrder.nativeOrder(); + public static final UTF8String EMPTY_UTF8 = UTF8String.fromString(""); /** @@ -163,22 +165,32 @@ public long getPrefix() { // After getting the data, we use a mask to mask out data that is not part of the string. long p; long mask = 0; - if (numBytes >= 8) { - p = PlatformDependent.UNSAFE.getLong(base, offset); - } else if (numBytes > 4) { - mask = (1L << (8 - numBytes) * 8) - 1; - p = PlatformDependent.UNSAFE.getLong(base, offset); - } else if (numBytes > 0) { - mask = (1L << (8 - numBytes) * 8) - 1; - p = (long) PlatformDependent.UNSAFE.getInt(base, offset); - if (ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN) { - p <<= 32; + if (byteOrder == ByteOrder.LITTLE_ENDIAN) { + if (numBytes >= 8) { + p = PlatformDependent.UNSAFE.getLong(base, offset); + } else if (numBytes > 4) { + p = PlatformDependent.UNSAFE.getLong(base, offset); + mask = (1L << (8 - numBytes) * 8) - 1; + } else if (numBytes > 0) { + p = (long) PlatformDependent.UNSAFE.getInt(base, offset); + mask = (1L << (8 - numBytes) * 8) - 1; + } else { + p = 0; } - } else { - p = 0; - } - if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) { p = java.lang.Long.reverseBytes(p); + } else { + // byteOrder == ByteOrder.BIG_ENDIAN + if (numBytes >= 8) { + p = PlatformDependent.UNSAFE.getLong(base, offset); + } else if (numBytes > 4) { + p = PlatformDependent.UNSAFE.getLong(base, offset); + mask = (1L << (8 - numBytes) * 8) - 1; + } else if (numBytes > 0) { + p = ((long) PlatformDependent.UNSAFE.getInt(base, offset)) << 32; + mask = (1L << (8 - numBytes) * 8) - 1; + } else { + p = 0; + } } p &= ~mask; return p;