Skip to content

Commit

Permalink
Merge 9ee2104 into f0e122d
Browse files Browse the repository at this point in the history
  • Loading branch information
Claudenw committed Nov 3, 2019
2 parents f0e122d + 9ee2104 commit ffbca6e
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 48 deletions.
3 changes: 3 additions & 0 deletions src/changes/changes.xml
Expand Up @@ -41,6 +41,9 @@ The <action> type attribute can be add,update,fix,remove.
<author>Apache Commons Developers</author>
</properties>
<body>
<release version="1.14" description="Feature and fix release.">
<action issue="CODEC-264" dev="claude" due-to="Claude Warren" type="fix">Fixed sign extensions in Murmur3 hashes</action>
</release>

<release version="1.13" date="2019-07-20" description="Feature and fix release.">
<action issue="CODEC-255" dev="sebb" due-to="Holger Grote" type="fix">ColognePhonetic handles x incorrectly</action>
Expand Down
109 changes: 61 additions & 48 deletions src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
Expand Up @@ -62,6 +62,7 @@ public final class MurmurHash3 {
private static final int R2_32 = 13;
private static final int M_32 = 5;
private static final int N_32 = 0xe6546b64;
private static final int UBYTE_MASK = 0xff;

// Constants for 128 bit variant
private static final long C1 = 0x87c37b91114253d5L;
Expand All @@ -72,6 +73,7 @@ public final class MurmurHash3 {
private static final int M = 5;
private static final int N1 = 0x52dce729;
private static final int N2 = 0x38495ab5;
private static final long UINT_MASK = 0xffffffffL;

public static final int DEFAULT_SEED = 104729;

Expand Down Expand Up @@ -207,20 +209,24 @@ public static int hash32(final byte[] data, final int offset, final int length,
// tail
final int idx = nblocks << 2;
int k1 = 0;
/*
* The original algorithm uses unsigned bytes.
* We have to mask to match the behavior of the unsigned bytes and prevent sign extension.
*/
switch (length - idx) {
case 3:
k1 ^= data[offset + idx + 2] << 16;
k1 ^= (data[offset + idx + 2] & UBYTE_MASK) << 16;
// fallthrough
case 2:
k1 ^= data[offset + idx + 1] << 8;
k1 ^= (data[offset + idx + 1] & UBYTE_MASK) << 8;
// fallthrough
case 1:
k1 ^= data[offset + idx];

// mix functions
k1 ^= (data[offset + idx] & UBYTE_MASK);
k1 *= C1_32;
k1 = Integer.rotateLeft(k1, R1_32);
k1 *= C2_32;
hash ^= k1;
}
}

return fmix32(length, hash);
}
Expand Down Expand Up @@ -290,8 +296,8 @@ public static long hash64(final int data) {
public static long hash64(final short data) {
long hash = DEFAULT_SEED;
long k1 = 0;
k1 ^= ((long) data & 0xff) << 8;
k1 ^= ((long) ((data & 0xFF00) >> 8) & 0xff);
k1 ^= ((long) data & UBYTE_MASK) << 8;
k1 ^= ((long) ((data & 0xFF00) >> 8) & UBYTE_MASK);
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
Expand Down Expand Up @@ -332,10 +338,10 @@ public static long hash64(final byte[] data, final int offset, final int length,
// body
for (int i = 0; i < nblocks; i++) {
final int i8 = i << 3;
long k = ((long) data[offset + i8] & 0xff) | (((long) data[offset + i8 + 1] & 0xff) << 8)
| (((long) data[offset + i8 + 2] & 0xff) << 16) | (((long) data[offset + i8 + 3] & 0xff) << 24)
| (((long) data[offset + i8 + 4] & 0xff) << 32) | (((long) data[offset + i8 + 5] & 0xff) << 40)
| (((long) data[offset + i8 + 6] & 0xff) << 48) | (((long) data[offset + i8 + 7] & 0xff) << 56);
long k = ((long) data[offset + i8] & UBYTE_MASK) | (((long) data[offset + i8 + 1] & UBYTE_MASK) << 8)
| (((long) data[offset + i8 + 2] & UBYTE_MASK) << 16) | (((long) data[offset + i8 + 3] & UBYTE_MASK) << 24)
| (((long) data[offset + i8 + 4] & UBYTE_MASK) << 32) | (((long) data[offset + i8 + 5] & UBYTE_MASK) << 40)
| (((long) data[offset + i8 + 6] & UBYTE_MASK) << 48) | (((long) data[offset + i8 + 7] & UBYTE_MASK) << 56);

// mix functions
k *= C1;
Expand All @@ -350,19 +356,19 @@ public static long hash64(final byte[] data, final int offset, final int length,
final int tailStart = nblocks << 3;
switch (length - tailStart) {
case 7:
k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
k1 ^= ((long) data[offset + tailStart + 6] & UBYTE_MASK) << 48;
case 6:
k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
k1 ^= ((long) data[offset + tailStart + 5] & UBYTE_MASK) << 40;
case 5:
k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
k1 ^= ((long) data[offset + tailStart + 4] & UBYTE_MASK) << 32;
case 4:
k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
k1 ^= ((long) data[offset + tailStart + 3] & UBYTE_MASK) << 24;
case 3:
k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
k1 ^= ((long) data[offset + tailStart + 2] & UBYTE_MASK) << 16;
case 2:
k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
k1 ^= ((long) data[offset + tailStart + 1] & UBYTE_MASK) << 8;
case 1:
k1 ^= ((long) data[offset + tailStart] & 0xff);
k1 ^= ((long) data[offset + tailStart] & UBYTE_MASK);
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
Expand Down Expand Up @@ -407,22 +413,25 @@ public static long[] hash128(final String data) {
* @return - 128 bit hash (2 longs)
*/
public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) {
long h1 = seed;
long h2 = seed;
// The original algorithm uses a 32-bit unsigned seed.
// We have to mask the int seed to match that unsigned behavior and prevent sign extension.
long h1 = seed & UINT_MASK;
long h2 = seed & UINT_MASK;

final int nblocks = length >> 4;

// body
for (int i = 0; i < nblocks; i++) {
final int i16 = i << 4;
long k1 = ((long) data[offset + i16] & 0xff) | (((long) data[offset + i16 + 1] & 0xff) << 8)
| (((long) data[offset + i16 + 2] & 0xff) << 16) | (((long) data[offset + i16 + 3] & 0xff) << 24)
| (((long) data[offset + i16 + 4] & 0xff) << 32) | (((long) data[offset + i16 + 5] & 0xff) << 40)
| (((long) data[offset + i16 + 6] & 0xff) << 48) | (((long) data[offset + i16 + 7] & 0xff) << 56);
long k1 = ((long) data[offset + i16] & UBYTE_MASK) | (((long) data[offset + i16 + 1] & UBYTE_MASK) << 8)
| (((long) data[offset + i16 + 2] & UBYTE_MASK) << 16) | (((long) data[offset + i16 + 3] & UBYTE_MASK) << 24)
| (((long) data[offset + i16 + 4] & UBYTE_MASK) << 32) | (((long) data[offset + i16 + 5] & UBYTE_MASK) << 40)
| (((long) data[offset + i16 + 6] & UBYTE_MASK) << 48) | (((long) data[offset + i16 + 7] & UBYTE_MASK) << 56);

long k2 = ((long) data[offset + i16 + 8] & 0xff) | (((long) data[offset + i16 + 9] & 0xff) << 8)
| (((long) data[offset + i16 + 10] & 0xff) << 16) | (((long) data[offset + i16 + 11] & 0xff) << 24)
| (((long) data[offset + i16 + 12] & 0xff) << 32) | (((long) data[offset + i16 + 13] & 0xff) << 40)
| (((long) data[offset + i16 + 14] & 0xff) << 48) | (((long) data[offset + i16 + 15] & 0xff) << 56);
long k2 = ((long) data[offset + i16 + 8] & UBYTE_MASK) | (((long) data[offset + i16 + 9] & UBYTE_MASK) << 8)
| (((long) data[offset + i16 + 10] & UBYTE_MASK) << 16) | (((long) data[offset + i16 + 11] & UBYTE_MASK) << 24)
| (((long) data[offset + i16 + 12] & UBYTE_MASK) << 32) | (((long) data[offset + i16 + 13] & UBYTE_MASK) << 40)
| (((long) data[offset + i16 + 14] & UBYTE_MASK) << 48) | (((long) data[offset + i16 + 15] & UBYTE_MASK) << 56);

// mix functions for k1
k1 *= C1;
Expand All @@ -449,40 +458,40 @@ public static long[] hash128(final byte[] data, final int offset, final int leng
final int tailStart = nblocks << 4;
switch (length - tailStart) {
case 15:
k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
k2 ^= (long) (data[offset + tailStart + 14] & UBYTE_MASK) << 48;
case 14:
k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
k2 ^= (long) (data[offset + tailStart + 13] & UBYTE_MASK) << 40;
case 13:
k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
k2 ^= (long) (data[offset + tailStart + 12] & UBYTE_MASK) << 32;
case 12:
k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
k2 ^= (long) (data[offset + tailStart + 11] & UBYTE_MASK) << 24;
case 11:
k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
k2 ^= (long) (data[offset + tailStart + 10] & UBYTE_MASK) << 16;
case 10:
k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
k2 ^= (long) (data[offset + tailStart + 9] & UBYTE_MASK) << 8;
case 9:
k2 ^= data[offset + tailStart + 8] & 0xff;
k2 ^= data[offset + tailStart + 8] & UBYTE_MASK;
k2 *= C2;
k2 = Long.rotateLeft(k2, R3);
k2 *= C1;
h2 ^= k2;

case 8:
k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
k1 ^= (long) (data[offset + tailStart + 7] & UBYTE_MASK) << 56;
case 7:
k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
k1 ^= (long) (data[offset + tailStart + 6] & UBYTE_MASK) << 48;
case 6:
k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
k1 ^= (long) (data[offset + tailStart + 5] & UBYTE_MASK) << 40;
case 5:
k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
k1 ^= (long) (data[offset + tailStart + 4] & UBYTE_MASK) << 32;
case 4:
k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
k1 ^= (long) (data[offset + tailStart + 3] & UBYTE_MASK) << 24;
case 3:
k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
k1 ^= (long) (data[offset + tailStart + 2] & UBYTE_MASK) << 16;
case 2:
k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
k1 ^= (long) (data[offset + tailStart + 1] & UBYTE_MASK) << 8;
case 1:
k1 ^= data[offset + tailStart] & 0xff;
k1 ^= data[offset + tailStart] & UBYTE_MASK;
k1 *= C1;
k1 = Long.rotateLeft(k1, R1);
k1 *= C2;
Expand Down Expand Up @@ -602,15 +611,19 @@ public final void add(final byte[] data, int offset, final int length) {
System.arraycopy(data, offset + consumed, tail, 0, tailLen);
}

/*
* The original algorithm uses unsigned bytes.
* We have to mask to match the behavior of the unsigned bytes and prevent sign extension.
*/
public final int end() {
int k1 = 0;
switch (tailLen) {
case 3:
k1 ^= tail[2] << 16;
k1 ^= (tail[2] & UBYTE_MASK) << 16;
case 2:
k1 ^= tail[1] << 8;
k1 ^= (tail[1] & UBYTE_MASK) << 8;
case 1:
k1 ^= tail[0];
k1 ^= (tail[0] & UBYTE_MASK);

// mix functions
k1 *= C1_32;
Expand All @@ -631,6 +644,6 @@ public final int end() {
}

/**
 * Combines four bytes into a single int: {@code b1} occupies the lowest 8 bits,
 * {@code b2} the next 8, {@code b3} the next 8, and {@code b4} the highest 8.
 * Each byte is masked to its low 8 bits before shifting so that negative byte
 * values are not sign-extended into the other byte positions.
 *
 * @param b1 byte placed in bits 0-7
 * @param b2 byte placed in bits 8-15
 * @param b3 byte placed in bits 16-23
 * @param b4 byte placed in bits 24-31
 * @return the four bytes OR-ed together into one int
 */
private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) {
return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24);
return (b1 & UBYTE_MASK) | ((b2 & UBYTE_MASK) << 8) | ((b3 & UBYTE_MASK) << 16) | ((b4 & UBYTE_MASK) << 24);
}
}
64 changes: 64 additions & 0 deletions src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java
Expand Up @@ -224,4 +224,68 @@ public void testSingleLongRandom() {
}
}

/**
* Execute the Murmur3_x64_128 tests from Yonic Seeley's implementation tests.
* Part of CODEC-264 testing
* @see https://github.com/yonik/java_util/blob/master/test/util/hash/TestMurmurHash3.java
* @throws Exception
*/
@Test
public void testHash128SignExtension() throws Exception {
long[] answers128 = new long[] {0x6e54d3ad2be8e9a2L,0xd99e452d1cfc7decL,0x609c35d060cf37c1L,0x4ba03e78929b6807L,0xf4865522a8838216L,0xef8dc0ad3f5a0581L,0x8513b05a329d04ecL,0x2295dbef5a603ebcL,0xd0259c75fa8711b2L,0x311f78657cb7ecb9L,0x771d03baa6accef1L,0x596d9c3bde77e873L,0xdc177610450452dbL,0x5b85d931e890ef5eL,0x261f88eedccbbd36L,0xcba71c1101271139L,0xa3a125d270c03cL,0xc41e9d6ae4ef9d56L,0xf9b21d4d660517c0L,0x409d87f99aeb3ea9L,0x92d8e70ae59a864L,0xf4e12d297744d05aL,0xd894caa03d461dbeL,0x99d6ff317880f305L,0x145d42da3710d23aL,0x2812adb381c1d64aL,0xd90254532b45e323L,0xacbb43b768a7b276L,0x74573f58c60c3ddfL,0xc72b9b42a7cbbd69L,0xd1129837bea190a7L,0xa7b20418ce5d46f9L,0xa6d094d2a166f659L,0x10f66ed93811576eL,0x28d3553af07b8cfaL,0xdd3b57dcd4d98ec2L,0xcd57b4faccaf9764L,0x1e4001ee8b46813aL,0xc79f57499389029eL,0xf4f84142db2d7673L,0xfafc9890edaf9086L,0xc54472528c0fcd98L,0xd3ff4eff416c02b7L,0x47c8414e9fa28367L,0x78f0171da51288e6L,0x7f5046c28cd1b43aL,0xc38dacef191ad1f0L,0x6210c0aba8230563L,0x15e3cd836648fe66L,0x56a1797408568c1eL,0x9162e9b79d4f6689L,0x6fc7ba8e6135592dL,0x569e7feab218d54aL,0x93d21aac30f6029fL,0x4e7a938ca19a5fe5L,0x3c7dd68323efe355L,0x651993620ca49e3fL,0x9f0cc9127f8eca7L,0x3963f278753c4f44L,0x3f2ab0d0e62bb19fL,0x4d72a64283465629L,0xd9d958282564a987L,0xde019492e4164d94L,0xc319fb27d1d42455L,0xe788f28b58a0c025L,0xabb3f2ca571338b4L,0xaac4a40f227db268L,0x8f86a5605449d75aL,0xcc3999bd3c872160L,0x3010e16e331a57e2L,0xd43cfd0741d4ed2L,0x7954298caa472790L,0xfe5b6444abb41ceL,0xaff3b10d222afeafL,0xa5438bad24a5629eL,0xc474fa5e2ff33329L,0xbda083bc5d7b382bL,0xef31a1cda016673eL,0xda9b98b58bb7eff5L,0xe001283d41a1576L,0x6ee0f9ab35eb17ebL,0x5de93fcf7e7e0169L,0x3cd1756a735b7caL,0x582ded067b6714e9L,0x56194735c4168e94L,0xeeaf5a39dcf76088L,0xf9d9a9c7d1520670L,0xb98d7d405a177795L,0x3281c2365b5bc415L,0x85e4cfb23980f8b4L,0x484aee59fa5880bdL,0xe000f2daa2078018L,0xebff3a4bff725d23L,0x803e3c3dd2716703L,0x413e18195eb5b4bfL,0xce1ea41794fec551L,0xcbf65e356e2d69bdL,0x654a616738582ba7L,0x62e46d535f11c4
17L,0xbd11185034218fa2L,0x7c715d440eaa5fb1L,0xe68ad0d758ade8dL,0x3242a4d88ac3ba92L,0x10f1e6939ee06b78L,0x965d9c4109ab6eb4L,0x6bc256008b6083d5L,0xa8fb3b9666e0eb4dL,0x2d8a83366565a273L,0xa5eddde29cc59fc4L,0xfd1f7dc9866ceb19L,0x86c13e98272a7eb9L,0x11149397f635b42cL,0xcbf82258e2b85bf5L,0x37215737b1ab86fbL,0x44e5126c5c5f4ae5L,0x99fe7cce58649b93L,0xc455e6ddc7be80f0L,0xf93bec96644e8723L,0x130dc4e99fb989e8L,0xb01734fafdc5308dL,0x8fde545bd48cb2feL,0x1102c89b77b4b405L,0x2cd24ed5816eca6eL,0xebd56473a502b63fL,0x357fb8e6b489be97L,0xe163a9495e6d67daL,0x87411ac34bd7399aL,0xf8bc18d84f4237bfL,0x43702207d2269e74L,0x37a3eec07a419e21L,0x7fe4605c33d4ac0cL,0x6df566b6925a898dL,0x89526c269d9225b0L,0xfc24aac3b731d33eL,0x2518f6ea6300c3caL,0xe4e20fdb203d79f5L};

byte[] bytes = "Now is the time for all good men to come to the aid of their country".getBytes("UTF-8");
int hash=0;
for (int i=0; i<bytes.length; i++) {
hash = hash*31+(bytes[i]&0xff);
bytes[i] = (byte)hash;
}

// test different offsets.
for (int offset = 0; offset<20; offset++) {
// put the original bytes at the offset so the same hash will be generated
byte[] arr = new byte[bytes.length + offset];
System.arraycopy(bytes, 0, arr, offset, bytes.length);
int seed = 1;
long[] result;
for (int len=0; len<bytes.length; len++) {
seed *= 0x9e3779b1;
result = MurmurHash3.hash128(arr, offset, len, seed);
assertEquals(answers128[len * 2], result[0]);
assertEquals(answers128[len*2+1], result[1]);
}
}
}

/**
 * Checks {@code MurmurHash3.hash32} against the Murmur3_x86_32 expected values taken from
 * Yonik Seeley's implementation tests. Part of CODEC-264 testing.
 *
 * <p>The input is an ASCII sentence whose bytes are replaced by a running
 * multiply-by-31 hash truncated to a byte, so the data can contain bytes with the
 * high bit set. Every prefix length is hashed at 20 different array offsets, with the
 * seed multiplied by {@code 0x9e3779b1} before each call.</p>
 *
 * @see <a href="https://github.com/yonik/java_util/blob/master/test/util/hash/TestMurmurHash3.java">
 *      TestMurmurHash3.java in yonik/java_util</a>
 * @throws Exception declared for test-framework convenience; no checked exception is expected
 */
@Test
public void testHash32SignExtension() throws Exception {
    // Expected results: one int per input length.
    final int[] answers32 = {
        0x11fd02eb, 0x8dd65a73, 0x29b074ba, 0xcbcd43ce, 0xb6463881, 0xf6228557, 0x3d55c634, 0xa1bb9072,
        0x448402c6, 0xb12bf3d4, 0x18a71ccb, 0x6ae5f185, 0x9a482256, 0xc686d7f2, 0x8e8984d8, 0x68a2491d,
        0xcc29b0e6, 0x3e9130bd, 0xc90defb3, 0xf81c5978, 0x15ff7f63, 0x4ec16a7a, 0xa08aa899, 0x7317ffee,
        0x93752d34, 0x400f8781, 0x2358838c, 0x6ecb8998, 0x45a5c102, 0x46ed68fd, 0xfecb51c0, 0x7a68c7db,
        0x9e334eab, 0x21ea13b6, 0xf184e92c, 0xc016220d, 0x7f6c9713, 0x1e909123, 0xb51a21b7, 0x94c58881,
        0xe4e91bf0, 0xde80a366, 0xfd84005a, 0x3361d373, 0xe7d528cc, 0x487275a7, 0xf2290ee5, 0x869992a8,
        0x63cdd341, 0x8e94b334, 0x1fc7bf11, 0x5228b0, 0xb4292b62, 0x36ed3770, 0xfe914519, 0x7d9d1830,
        0xe1acfb60, 0xc8b4d4b7, 0xf1ec49ba, 0xedbb8cc1, 0xdc5b3ab1, 0x7c7778ae, 0x52bf68d, 0xe0bb4148,
        0xfea36521, 0xa0696ca5, 0xf28df752, 0xd82dccb6
    };

    // Charset constant instead of a charset name: no lookup by string at runtime.
    final byte[] bytes = "Now is the time for all good men to come to the aid of their country"
            .getBytes(java.nio.charset.StandardCharsets.UTF_8);
    // Scramble the text in place; each byte becomes the low 8 bits of a running hash.
    int hash = 0;
    for (int i = 0; i < bytes.length; i++) {
        hash = hash * 31 + (bytes[i] & 0xff);
        bytes[i] = (byte) hash;
    }

    // test different offsets.
    for (int offset = 0; offset < 20; offset++) {
        // put the original bytes at the offset so the same hash will be generated
        final byte[] arr = new byte[bytes.length + offset];
        System.arraycopy(bytes, 0, arr, offset, bytes.length);
        // The seed sequence restarts for every offset so it lines up with the table.
        int seed = 1;
        for (int len = 0; len < bytes.length; len++) {
            seed *= 0x9e3779b1;
            final int h = MurmurHash3.hash32(arr, offset, len, seed);
            assertEquals(answers32[len], h);
        }
    }
}

}

0 comments on commit ffbca6e

Please sign in to comment.