[SPARK-32892][CORE][SQL] Fix hash functions on big-endian platforms.
MurmurHash3 and xxHash64 interpret sequences of bytes as integers
encoded in little-endian byte order. This requires a byte reversal
on big-endian platforms.
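
To make the requirement concrete, here is a minimal, self-contained
sketch (not part of the patch; the class name EndianDemo is
illustrative) of the invariant the fix restores: for the byte sequence
01 02 03 04, both hash functions must consume the word 0x04030201,
whatever the platform's native byte order.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class EndianDemo {
  public static void main(String[] args) {
    byte[] bytes = {0x01, 0x02, 0x03, 0x04};

    // Little-endian interpretation, as MurmurHash3 and xxHash64 specify:
    // always 0x04030201, on every platform.
    int le = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt();

    // A big-endian load of the same bytes (the native order on s390x)
    // yields 0x01020304 instead; reversing the bytes recovers the
    // little-endian value, which is what the patch does with
    // Integer.reverseBytes.
    int be = ByteBuffer.wrap(bytes).order(ByteOrder.BIG_ENDIAN).getInt();

    System.out.printf("little-endian load:  %08x%n", le);                       // 04030201
    System.out.printf("reversed big-endian: %08x%n", Integer.reverseBytes(be)); // 04030201
  }
}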

I've left the hashInt and hashLong functions as-is for now. My
interpretation of these functions is that they perform the hash on
the integer value as if it were serialized in little-endian byte
order. Therefore no byte reversal is necessary.
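
As a hedged illustration of that reasoning (again not part of the
patch, and using the standard Murmur3 32-bit mixing constants rather
than Spark's internal methods; HashIntSketch is a hypothetical name):
mixing an int value directly is equivalent to mixing its little-endian
serialization, because the little-endian write/read round trip
reproduces the same value on any platform.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class HashIntSketch {
  // Standard Murmur3 32-bit mixing steps (same constants Spark uses).
  static int mixK1(int k1) {
    k1 *= 0xcc9e2d51;
    k1 = Integer.rotateLeft(k1, 15);
    return k1 * 0x1b873593;
  }

  static int mixH1(int h1, int k1) {
    h1 ^= k1;
    h1 = Integer.rotateLeft(h1, 13);
    return h1 * 5 + 0xe6546b64;
  }

  public static void main(String[] args) {
    int value = 0xCAFEBABE;
    int seed = 42;

    // hashInt-style: mix the value itself. Java int semantics do not
    // depend on how the hardware lays the value out in memory.
    int direct = mixH1(seed, mixK1(value));

    // Byte-path equivalent: serialize to little-endian bytes, then read
    // them back as a little-endian word (what the patched byte path
    // computes on every platform) and mix that word instead.
    byte[] le = ByteBuffer.allocate(4)
        .order(ByteOrder.LITTLE_ENDIAN).putInt(value).array();
    int word = ByteBuffer.wrap(le).order(ByteOrder.LITTLE_ENDIAN).getInt();

    System.out.println(direct == mixH1(seed, mixK1(word)));  // true
  }
}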

This commit fixes the existing tests on the IBM Z (s390x) platform.
I've modified the XXH64 tests to expect the same results on all
platforms. Other tests assume little-endian behavior, which aligns
with the specification for xxHash64.
mundaym committed Sep 15, 2020
1 parent 888b343 commit df1a1c8
Showing 4 changed files with 59 additions and 57 deletions.
org/apache/spark/util/sketch/Murmur3_x86_32.java

@@ -17,12 +17,16 @@
 
 package org.apache.spark.util.sketch;
 
+import java.nio.ByteOrder;
+
 /**
  * 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction.
  */
 // This class is duplicated from `org.apache.spark.unsafe.hash.Murmur3_x86_32` to make sure
 // spark-sketch has no external dependencies.
 final class Murmur3_x86_32 {
+  private static final boolean isBigEndian = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
+
   private static final int C1 = 0xcc9e2d51;
   private static final int C2 = 0x1b873593;
 
@@ -92,8 +96,10 @@ private static int hashBytesByInt(Object base, long offset, int lengthInBytes, i
     int h1 = seed;
     for (int i = 0; i < lengthInBytes; i += 4) {
       int halfWord = Platform.getInt(base, offset + i);
-      int k1 = mixK1(halfWord);
-      h1 = mixH1(h1, k1);
+      if (isBigEndian) {
+        halfWord = Integer.reverseBytes(halfWord);
+      }
+      h1 = mixH1(h1, mixK1(halfWord));
     }
     return h1;
   }
org/apache/spark/unsafe/hash/Murmur3_x86_32.java

@@ -17,12 +17,16 @@
 
 package org.apache.spark.unsafe.hash;
 
+import java.nio.ByteOrder;
+
 import org.apache.spark.unsafe.Platform;
 
 /**
  * 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction.
  */
 public final class Murmur3_x86_32 {
+  private static final boolean isBigEndian = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
+
   private static final int C1 = 0xcc9e2d51;
   private static final int C2 = 0x1b873593;
 
@@ -92,8 +96,10 @@ private static int hashBytesByInt(Object base, long offset, int lengthInBytes, i
     int h1 = seed;
     for (int i = 0; i < lengthInBytes; i += 4) {
       int halfWord = Platform.getInt(base, offset + i);
-      int k1 = mixK1(halfWord);
-      h1 = mixH1(h1, k1);
+      if (isBigEndian) {
+        halfWord = Integer.reverseBytes(halfWord);
+      }
+      h1 = mixH1(h1, mixK1(halfWord));
     }
     return h1;
   }
org/apache/spark/sql/catalyst/expressions/XXH64.java

@@ -16,6 +16,8 @@
  */
 package org.apache.spark.sql.catalyst.expressions;
 
+import java.nio.ByteOrder;
+
 import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.types.UTF8String;
 
@@ -31,6 +33,7 @@
  */
 // scalastyle: on
 public final class XXH64 {
+  private static final boolean isBigEndian = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
 
   private static final long PRIME64_1 = 0x9E3779B185EBCA87L;
   private static final long PRIME64_2 = 0xC2B2AE3D27D4EB4FL;
@@ -93,7 +96,11 @@ public static long hashUnsafeBytes(Object base, long offset, int length, long se
     offset += length & -8;
 
     if (offset + 4L <= end) {
-      hash ^= (Platform.getInt(base, offset) & 0xFFFFFFFFL) * PRIME64_1;
+      int k1 = Platform.getInt(base, offset);
+      if (isBigEndian) {
+        k1 = Integer.reverseBytes(k1);
+      }
+      hash ^= (k1 & 0xFFFFFFFFL) * PRIME64_1;
       hash = Long.rotateLeft(hash, 23) * PRIME64_2 + PRIME64_3;
       offset += 4L;
     }
@@ -130,21 +137,22 @@ private static long hashBytesByWords(Object base, long offset, int length, long
     long v4 = seed - PRIME64_1;
 
     do {
-      v1 += Platform.getLong(base, offset) * PRIME64_2;
-      v1 = Long.rotateLeft(v1, 31);
-      v1 *= PRIME64_1;
-
-      v2 += Platform.getLong(base, offset + 8) * PRIME64_2;
-      v2 = Long.rotateLeft(v2, 31);
-      v2 *= PRIME64_1;
-
-      v3 += Platform.getLong(base, offset + 16) * PRIME64_2;
-      v3 = Long.rotateLeft(v3, 31);
-      v3 *= PRIME64_1;
-
-      v4 += Platform.getLong(base, offset + 24) * PRIME64_2;
-      v4 = Long.rotateLeft(v4, 31);
-      v4 *= PRIME64_1;
+      long k1 = Platform.getLong(base, offset);
+      long k2 = Platform.getLong(base, offset + 8);
+      long k3 = Platform.getLong(base, offset + 16);
+      long k4 = Platform.getLong(base, offset + 24);
+
+      if (isBigEndian) {
+        k1 = Long.reverseBytes(k1);
+        k2 = Long.reverseBytes(k2);
+        k3 = Long.reverseBytes(k3);
+        k4 = Long.reverseBytes(k4);
+      }
+
+      v1 = Long.rotateLeft(v1 + (k1 * PRIME64_2), 31) * PRIME64_1;
+      v2 = Long.rotateLeft(v2 + (k2 * PRIME64_2), 31) * PRIME64_1;
+      v3 = Long.rotateLeft(v3 + (k3 * PRIME64_2), 31) * PRIME64_1;
+      v4 = Long.rotateLeft(v4 + (k4 * PRIME64_2), 31) * PRIME64_1;
 
       offset += 32L;
     } while (offset <= limit);
@@ -186,6 +194,9 @@ private static long hashBytesByWords(Object base, long offset, int length, long
     long limit = end - 8;
     while (offset <= limit) {
       long k1 = Platform.getLong(base, offset);
+      if (isBigEndian) {
+        k1 = Long.reverseBytes(k1);
+      }
       hash ^= Long.rotateLeft(k1 * PRIME64_2, 31) * PRIME64_1;
       hash = Long.rotateLeft(hash, 27) * PRIME64_1 + PRIME64_4;
       offset += 8L;
org/apache/spark/sql/catalyst/expressions/XXH64Suite.java

@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.expressions;
 
-import java.nio.ByteOrder;
 import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.Random;
@@ -73,42 +72,22 @@ public void testKnownByteArrayInputs() {
         hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 1));
     Assert.assertEquals(0x739840CB819FA723L,
         XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 1, PRIME));
 
-    if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) {
-      Assert.assertEquals(0x9256E58AA397AEF1L,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 4));
-      Assert.assertEquals(0x9D5FFDFB928AB4BL,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 4, PRIME));
-      Assert.assertEquals(0xF74CB1451B32B8CFL,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 8));
-      Assert.assertEquals(0x9C44B77FBCC302C5L,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 8, PRIME));
-      Assert.assertEquals(0xCFFA8DB881BC3A3DL,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 14));
-      Assert.assertEquals(0x5B9611585EFCC9CBL,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 14, PRIME));
-      Assert.assertEquals(0x0EAB543384F878ADL,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, SIZE));
-      Assert.assertEquals(0xCAA65939306F1E21L,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, SIZE, PRIME));
-    } else {
-      Assert.assertEquals(0x7F875412350ADDDCL,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 4));
-      Assert.assertEquals(0x564D279F524D8516L,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 4, PRIME));
-      Assert.assertEquals(0x7D9F07E27E0EB006L,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 8));
-      Assert.assertEquals(0x893CEF564CB7858L,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 8, PRIME));
-      Assert.assertEquals(0xC6198C4C9CC49E17L,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 14));
-      Assert.assertEquals(0x4E21BEF7164D4BBL,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 14, PRIME));
-      Assert.assertEquals(0xBCF5FAEDEE1F2B5AL,
-          hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, SIZE));
-      Assert.assertEquals(0x6F680C877A358FE5L,
-          XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, SIZE, PRIME));
-    }
+    Assert.assertEquals(0x9256E58AA397AEF1L,
+        hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 4));
+    Assert.assertEquals(0x9D5FFDFB928AB4BL,
+        XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 4, PRIME));
+    Assert.assertEquals(0xF74CB1451B32B8CFL,
+        hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 8));
+    Assert.assertEquals(0x9C44B77FBCC302C5L,
+        XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 8, PRIME));
+    Assert.assertEquals(0xCFFA8DB881BC3A3DL,
+        hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 14));
+    Assert.assertEquals(0x5B9611585EFCC9CBL,
+        XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, 14, PRIME));
+    Assert.assertEquals(0x0EAB543384F878ADL,
+        hasher.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, SIZE));
+    Assert.assertEquals(0xCAA65939306F1E21L,
+        XXH64.hashUnsafeBytes(BUFFER, Platform.BYTE_ARRAY_OFFSET, SIZE, PRIME));
   }
 
   @Test
