Skip to content

Commit

Permalink
ARROW-8009: [Java] Fix the hash code methods for BitVector
Browse files Browse the repository at this point in the history
The current hash code methods of BitVector are based on implementations in BaseFixedWidthVector, which rely on the type width of the vector.
For BitVector, the type width is 0, so the underlying data is never actually read when computing the hash code. As a result, the hash code is always 0, regardless of whether the element is null and regardless of whether the underlying bit is 0 or 1.

We fix this by overriding the methods in BitVector.

Closes #6543 from liyafan82/fly_0305_bit and squashes the following commits:

a38b527 <liyafan82>  Fix the hash code methods for BitVector

Authored-by: liyafan82 <fan_li_ya@foxmail.com>
Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
  • Loading branch information
liyafan82 authored and emkornfield committed Mar 6, 2020
1 parent 5ffbf0a commit e92416f
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 0 deletions.
24 changes: 24 additions & 0 deletions java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.util.ArrowBufPointer;
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.complex.impl.BitReaderImpl;
import org.apache.arrow.vector.complex.reader.FieldReader;
Expand All @@ -41,6 +42,11 @@
* to a single bit in the underlying data stream backing the vector.
*/
public final class BitVector extends BaseFixedWidthVector {

// Hash code returned by hashCode(int) for a non-null element whose value is 0.
private static final int HASH_CODE_FOR_ZERO = 17;

// Hash code returned by hashCode(int) for a non-null element whose value is not 0.
private static final int HASH_CODE_FOR_ONE = 19;

private final FieldReader reader;

/**
Expand Down Expand Up @@ -473,6 +479,24 @@ public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) {
throw new UnsupportedOperationException();
}

/**
 * Computes the hash code of the element at the given index.
 *
 * <p>A null element yields {@link ArrowBufPointer#NULL_HASH_CODE}; a non-null element yields
 * one of two fixed constants, chosen by whether its value is 0.
 *
 * @param index position of the element to hash
 * @return the hash code for the element at {@code index}
 */
@Override
public int hashCode(int index) {
  // Check for null first so the value is only read for non-null elements.
  if (isNull(index)) {
    return ArrowBufPointer.NULL_HASH_CODE;
  }
  return get(index) == 0 ? HASH_CODE_FOR_ZERO : HASH_CODE_FOR_ONE;
}

/**
 * Computes the hash code of the element at the given index.
 *
 * <p>The supplied hasher is not consulted; the result is whatever {@link #hashCode(int)}
 * returns for the same index.
 *
 * @param index position of the element to hash
 * @param hasher not used by this implementation
 * @return the same value as {@code hashCode(index)}
 */
@Override
public int hashCode(int index, ArrowBufHasher hasher) {
  return this.hashCode(index);
}

/**
* Set count bits to 1 in data starting at firstBitIndex.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.util.stream.IntStream;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.memory.util.hash.MurmurHasher;
import org.apache.arrow.vector.testing.ValueVectorDataPopulator;
import org.apache.arrow.vector.util.TransferPair;
import org.junit.After;
import org.junit.Assert;
Expand Down Expand Up @@ -505,4 +509,35 @@ private void validateRange(int length, int start, int count) {
}
}
}

/**
 * Checks that both {@code hashCode} overloads of {@link BitVector} give matching codes to the
 * element pairs (0, 3), (1, 4) and (2, 5), and different codes across elements 0, 1 and 2.
 */
@Test
public void testBitVectorHashCode() {
  final int size = 6;
  try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) {
    // Values are supplied as 0, 1, null, 0, 1, null, which is why the checks
    // below pair up indices 0/3, 1/4 and 2/5.
    ValueVectorDataPopulator.setVector(vector, 0, 1, null, 0, 1, null);

    // Overload without a hasher.
    int[] plainCodes = IntStream.range(0, size).map(i -> vector.hashCode(i)).toArray();
    verifyHashCodePattern(plainCodes);

    // Overload taking an explicit hasher.
    MurmurHasher hasher = new MurmurHasher();
    int[] hasherCodes =
        IntStream.range(0, size).map(i -> vector.hashCode(i, hasher)).toArray();
    verifyHashCodePattern(hasherCodes);
  }
}

/**
 * Asserts that hash codes at indices 0/3, 1/4 and 2/5 are pairwise equal and that the codes at
 * indices 0, 1 and 2 are pairwise different.
 *
 * @param hashCodes hash codes of the six elements, in index order
 */
private static void verifyHashCodePattern(int[] hashCodes) {
  assertTrue(hashCodes[0] == hashCodes[3]);
  assertTrue(hashCodes[1] == hashCodes[4]);
  assertTrue(hashCodes[2] == hashCodes[5]);

  assertFalse(hashCodes[0] == hashCodes[1]);
  assertFalse(hashCodes[0] == hashCodes[2]);
  assertFalse(hashCodes[1] == hashCodes[2]);
}
}

0 comments on commit e92416f

Please sign in to comment.