diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java deleted file mode 100644 index 18e1fee029..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.PrimitiveIterator.OfInt; -import java.util.function.LongBinaryOperator; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -/** - * An abstract Bloom filter providing default implementations for most Bloom filter - * functions. Specific implementations are encouraged to override the methods that can be - * more efficiently implemented. - *

- * This abstract class provides additional functionality not declared in the interface. - * Specifically: - *

- * - * @since 4.5 - */ -public abstract class AbstractBloomFilter implements BloomFilter { - - /** - * The shape used by this BloomFilter - */ - private final Shape shape; - - /** - * Construct a Bloom filter with the specified shape. - * - * @param shape The shape. - */ - protected AbstractBloomFilter(final Shape shape) { - this.shape = shape; - } - - @Override - public int andCardinality(final BloomFilter other) { - verifyShape(other); - final long[] mine = getBits(); - final long[] theirs = other.getBits(); - final int limit = Integer.min(mine.length, theirs.length); - int count = 0; - for (int i = 0; i < limit; i++) { - count += Long.bitCount(mine[i] & theirs[i]); - } - return count; - } - - @Override - public int cardinality() { - int count = 0; - for (final long bits : getBits()) { - count += Long.bitCount(bits); - } - return count; - } - - @Override - public boolean contains(final BloomFilter other) { - verifyShape(other); - return other.cardinality() == andCardinality(other); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final long[] buff = getBits(); - - final OfInt iter = hasher.iterator(shape); - while (iter.hasNext()) { - final int idx = iter.nextInt(); - BloomFilterIndexer.checkPositive(idx); - final int buffIdx = BloomFilterIndexer.getLongIndex(idx); - final long buffOffset = BloomFilterIndexer.getLongBit(idx); - if ((buff[buffIdx] & buffOffset) == 0) { - return false; - } - } - return true; - } - - @Override - public final Shape getShape() { - return shape; - } - - /** - * Determines if the bloom filter is "full". Full is defined as having no unset - * bits. - * - * @return true if the filter is full. - */ - public final boolean isFull() { - return cardinality() == getShape().getNumberOfBits(); - } - - @Override - public int orCardinality(final BloomFilter other) { - // Logical OR - return opCardinality(other, (a, b) -> a | b); - } - - /** - * Verifies that the hasher has the same name as the shape. 
- * - * @param hasher the Hasher to check - */ - protected void verifyHasher(final Hasher hasher) { - // It is assumed that the filter and hasher have been constructed using the - // same hash function. Use the signature for a fast check the hash function is equal. - // Collisions will occur at a rate of 1 in 2^64. - if (shape.getHashFunctionIdentity().getSignature() != hasher.getHashFunctionIdentity().getSignature()) { - throw new IllegalArgumentException( - String.format("Hasher (%s) is not the hasher for shape (%s)", - HashFunctionIdentity.asCommonString(hasher.getHashFunctionIdentity()), - shape.toString())); - } - } - - /** - * Verify the other Bloom filter has the same shape as this Bloom filter. - * - * @param other the other filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - protected void verifyShape(final BloomFilter other) { - verifyShape(other.getShape()); - } - - /** - * Verify the specified shape has the same shape as this Bloom filter. - * - * @param shape the other shape to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - protected void verifyShape(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", shape, this.shape)); - } - } - - @Override - public int xorCardinality(final BloomFilter other) { - // Logical XOR - return opCardinality(other, (a, b) -> a ^ b); - } - - /** - * Perform the operation on the matched longs from this filter and the other filter - * and count the cardinality. - * - *

The remaining unmatched longs from the larger filter are always counted. This - * method is suitable for OR and XOR cardinality. - * - * @param other the other Bloom filter. - * @param operation the operation (e.g. OR, XOR) - * @return the cardinality - */ - private int opCardinality(final BloomFilter other, final LongBinaryOperator operation) { - verifyShape(other); - final long[] mine = getBits(); - final long[] theirs = other.getBits(); - final long[] small; - final long[] big; - if (mine.length > theirs.length) { - big = mine; - small = theirs; - } else { - small = mine; - big = theirs; - } - int count = 0; - for (int i = 0; i < small.length; i++) { - count += Long.bitCount(operation.applyAsLong(small[i], big[i])); - } - for (int i = small.length; i < big.length; i++) { - count += Long.bitCount(big[i]); - } - return count; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 0722b92576..092e1460d6 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -16,18 +16,16 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.BitSet; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; +import java.util.Objects; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; +import java.util.stream.IntStream; +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; /** - * A counting Bloom filter using an array to track counts for each enabled bit + 
* A counting Bloom filter using an int array to track counts for each enabled bit * index. * *

Any operation that results in negative counts or integer overflow of @@ -35,13 +33,13 @@ * The operation is completed in full, no exception is raised and the state is * set to invalid. This allows the counts for the filter immediately prior to the * operation that created the invalid state to be recovered. See the documentation - * in {@link #isValid()} for details. + * in {@link #isValid()} for details.

* *

All the operations in the filter assume the counts are currently valid, - * for example cardinality or contains operations. Behaviour of an invalid + * for example {@code cardinality} or {@code contains} operations. Behavior of an invalid * filter is undefined. It will no longer function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added - * to and not later subtracted from the counting Bloom filter. + * to and not later subtracted from the counting Bloom filter.

* *

The maximum supported number of items that can be stored in the filter is * limited by the maximum array size combined with the {@link Shape}. For @@ -53,7 +51,12 @@ * @see Shape * @since 4.5 */ -public class ArrayCountingBloomFilter extends AbstractBloomFilter implements CountingBloomFilter { +public class ArrayCountingBloomFilter implements CountingBloomFilter { + + /** + * The shape of this Bloom filter. + */ + private final Shape shape; /** * The count of each bit index in the filter. @@ -61,20 +64,20 @@ public class ArrayCountingBloomFilter extends AbstractBloomFilter implements Cou private final int[] counts; /** - * The state flag. This is a bitwise OR of the entire history of all updated + * The state flag. This is a bitwise {@code OR} of the entire history of all updated * counts. If negative then a negative count or integer overflow has occurred on * one or more counts in the history of the filter and the state is invalid. * *

Maintenance of this state flag is branch-free for improved performance. It * eliminates a conditional check for a negative count during remove/subtract * operations and a conditional check for integer overflow during merge/add - * operations. + * operations.

* *

Note: Integer overflow is unlikely in realistic usage scenarios. A count * that overflows indicates that the number of items in the filter exceeds the * maximum possible size (number of bits) of any Bloom filter constrained by * integer indices. At this point the filter is most likely full (all bits are - * non-zero) and thus useless. + * non-zero) and thus useless.

* *

Negative counts are a concern if the filter is used incorrectly by * removing an item that was never added. It is expected that a user of a @@ -82,174 +85,94 @@ public class ArrayCountingBloomFilter extends AbstractBloomFilter implements Cou * Enabling an explicit recovery path for negative or overflow counts is a major * performance burden not deemed necessary for the unlikely scenarios when an * invalid state is created. Maintenance of the state flag is a concession to - * flag improper use that should not have a major performance impact. + * flag improper use that should not have a major performance impact.

*/ private int state; - /** - * An iterator of all indexes with non-zero counts. - * - *

In the event that the filter state is invalid any index with a negative count - * will also be produced by the iterator. - */ - private class IndexIterator implements PrimitiveIterator.OfInt { - /** The next non-zero index (or counts.length). */ - private int next; - - /** - * Create an instance. - */ - IndexIterator() { - advance(); - } - - /** - * Advance to the next non-zero index. - */ - void advance() { - while (next < counts.length && counts[next] == 0) { - next++; - } - } - - @Override - public boolean hasNext() { - return next < counts.length; - } - - @Override - public int nextInt() { - if (hasNext()) { - final int result = next++; - advance(); - return result; - } - // Currently unreachable as the iterator is only used by - // the StaticHasher which iterates correctly. - throw new NoSuchElementException(); - } - } - /** * Constructs an empty counting Bloom filter with the specified shape. * * @param shape the shape of the filter + * */ public ArrayCountingBloomFilter(final Shape shape) { - super(shape); + Objects.requireNonNull(shape, "shape"); + this.shape = shape; counts = new int[shape.getNumberOfBits()]; } @Override - public int cardinality() { - int size = 0; - for (final int c : counts) { - if (c != 0) { - size++; - } - } - return size; - } - - @Override - public boolean contains(final BloomFilter other) { - // The AbstractBloomFilter implementation converts both filters to long[] bits. - // This would involve checking all indexes in this filter against zero. - // Ideally we use an iterator of bit indexes to allow fail-fast on the - // first bit index that is zero. - if (other instanceof ArrayCountingBloomFilter) { - verifyShape(other); - return contains(((ArrayCountingBloomFilter) other).iterator()); - } - - // Note: - // This currently creates a StaticHasher which stores all the indexes. - // It would greatly benefit from direct generation of the index iterator - // avoiding the intermediate storage. 
- return contains(other.getHasher()); + public boolean isSparse() { + return true; } @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - return contains(hasher.iterator(getShape())); + public int cardinality() { + return (int) IntStream.range(0, counts.length).filter(i -> counts[i] > 0).count(); } /** - * Return true if this filter is has non-zero counts for each index in the iterator. - * - * @param iter the iterator - * @return true if this filter contains all the indexes + * Clones the filter. Used to create merged values. + * @return A clone of this filter. */ - private boolean contains(final OfInt iter) { - while (iter.hasNext()) { - if (counts[iter.nextInt()] == 0) { - return false; - } - } - return true; + protected ArrayCountingBloomFilter makeClone() { + ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); + filter.add(this); + filter.state = this.state; + return filter; } @Override - public long[] getBits() { - final BitSet bs = new BitSet(); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0) { - bs.set(i); - } - } - return bs.toLongArray(); + public CountingBloomFilter merge(BloomFilter other) { + Objects.requireNonNull(other, "other"); + CountingBloomFilter filter = makeClone(); + filter.add(BitCountProducer.from(other)); + return filter; } @Override - public StaticHasher getHasher() { - return new StaticHasher(iterator(), getShape()); - } - - /** - * Returns an iterator over the enabled indexes in this filter. - * Any index with a non-zero count is considered enabled. - * The iterator returns indexes in their natural order. 
- * - * @return an iterator over the enabled indexes - */ - private PrimitiveIterator.OfInt iterator() { - return new IndexIterator(); + public CountingBloomFilter merge(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + ArrayCountingBloomFilter filter = makeClone(); + filter.add(BitCountProducer.from(hasher.indices(shape))); + return filter; } @Override - public boolean merge(final BloomFilter other) { - applyAsBloomFilter(other, this::increment); - return isValid(); + public boolean mergeInPlace(final BloomFilter other) { + Objects.requireNonNull(other, "other"); + return add(BitCountProducer.from(other)); } @Override - public boolean merge(final Hasher hasher) { - applyAsHasher(hasher, this::increment); - return isValid(); + public boolean mergeInPlace(final Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + return add(BitCountProducer.from(hasher.indices(shape))); } @Override public boolean remove(final BloomFilter other) { - applyAsBloomFilter(other, this::decrement); - return isValid(); + Objects.requireNonNull(other, "other"); + return subtract(BitCountProducer.from(other)); } @Override public boolean remove(final Hasher hasher) { - applyAsHasher(hasher, this::decrement); - return isValid(); + Objects.requireNonNull(hasher, "hasher"); + return subtract(BitCountProducer.from(hasher.indices(shape))); } @Override - public boolean add(final CountingBloomFilter other) { - applyAsCountingBloomFilter(other, this::add); + public boolean add(final BitCountProducer other) { + Objects.requireNonNull(other, "other"); + other.forEachCount(this::add); return isValid(); } @Override - public boolean subtract(final CountingBloomFilter other) { - applyAsCountingBloomFilter(other, this::subtract); + public boolean subtract(final BitCountProducer other) { + Objects.requireNonNull(other, "other"); + other.forEachCount(this::subtract); return isValid(); } @@ -258,14 +181,14 @@ public boolean subtract(final CountingBloomFilter other) { * *

Implementation note * - *

The state transition to invalid is permanent. + *

The state transition to invalid is permanent.

* *

This implementation does not correct negative counts to zero or integer * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that * generated invalid counts can be reversed by using the complement of the * original operation with the same Bloom filter. This will restore the counts * to the state prior to the invalid operation. Counts can then be extracted - * using {@link #forEachCount(BitCountConsumer)}. + * using {@link #forEachCount(BitCountConsumer)}.

*/ @Override public boolean isValid() { @@ -273,69 +196,29 @@ public boolean isValid() { } @Override - public void forEachCount(final BitCountConsumer action) { + public void forEachCount(final BitCountProducer.BitCountConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); for (int i = 0; i < counts.length; i++) { if (counts[i] != 0) { - action.accept(i, counts[i]); + consumer.accept(i, counts[i]); } } } - /** - * Apply the action for each index in the Bloom filter. - */ - private void applyAsBloomFilter(final BloomFilter other, final IntConsumer action) { - verifyShape(other); - if (other instanceof ArrayCountingBloomFilter) { - // Only use the presence of non-zero and not the counts - final int[] counts2 = ((ArrayCountingBloomFilter) other).counts; - for (int i = 0; i < counts2.length; i++) { - if (counts2[i] != 0) { - action.accept(i); - } + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + for (int i = 0; i < counts.length; i++) { + if (counts[i] != 0) { + consumer.accept(i); } - } else { - BitSet.valueOf(other.getBits()).stream().forEach(action); } } - /** - * Apply the action for each index in the hasher. - */ - private void applyAsHasher(final Hasher hasher, final IntConsumer action) { - verifyHasher(hasher); - // We do not naturally handle duplicates so filter them. - IndexFilters.distinctIndexes(hasher, getShape(), action); - } - - /** - * Apply the action for each index in the Bloom filter. - */ - private void applyAsCountingBloomFilter(final CountingBloomFilter other, final BitCountConsumer action) { - verifyShape(other); - other.forEachCount(action); - } - - /** - * Increment to the count for the bit index. - * - * @param idx the index - */ - private void increment(final int idx) { - final int updated = counts[idx] + 1; - state |= updated; - counts[idx] = updated; - } - - /** - * Decrement from the count for the bit index. 
- * - * @param idx the index - */ - private void decrement(final int idx) { - final int updated = counts[idx] - 1; - state |= updated; - counts[idx] = updated; + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + BitMapProducer.fromIndexProducer(this, shape).forEachBitMap(consumer); } /** @@ -344,7 +227,7 @@ private void decrement(final int idx) { * @param idx the index * @param addend the amount to add */ - private void add(final int idx, final int addend) { + protected void add(final int idx, final int addend) { final int updated = counts[idx] + addend; state |= updated; counts[idx] = updated; @@ -356,9 +239,34 @@ private void add(final int idx, final int addend) { * @param idx the index * @param subtrahend the amount to subtract */ - private void subtract(final int idx, final int subtrahend) { + protected void subtract(final int idx, final int subtrahend) { final int updated = counts[idx] - subtrahend; state |= updated; counts[idx] = updated; } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex(idx -> { + if (this.counts[idx] == 0) { + throw new NoMatchException(); + } + }); + } catch (NoMatchException e) { + return false; + } + return true; + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains(IndexProducer.fromBitMapProducer(bitMapProducer)); + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java new file mode 100644 index 0000000000..af444eeeb9 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.function.IntConsumer; + +/** + * Produces bit counts for counting type Bloom filters. + * + * @since 4.5 + */ +public interface BitCountProducer extends IndexProducer { + + /** + * Performs the given action for each {@code <index, count>} pair where the count is non-zero. + * Any exceptions thrown by the action are relayed to the caller. + * + *

Must only process each index once, and must process indexes in order.

+ * + * @param consumer the action to be performed for each non-zero bit count + * @throws NullPointerException if the specified action is null + */ + void forEachCount(BitCountConsumer consumer); + + @Override + default void forEachIndex(IntConsumer consumer) { + forEachCount((i, v) -> consumer.accept(i)); + } + + /** + * Creates a BitCountProducer from an IndexProducer. The resulting + * producer will count each enabled bit once. + * @param idx An index producer. + * @return A BitCountProducer with the same indices as the IndexProducer. + */ + static BitCountProducer from(IndexProducer idx) { + return new BitCountProducer() { + @Override + public void forEachCount(BitCountConsumer consumer) { + idx.forEachIndex(i -> consumer.accept(i, 1)); + } + + }; + } + + /** + * Represents an operation that accepts an {@code <index, count>} pair representing + * the count for a bit index in a Bit Count Producer Bloom filter and returns no result. + * + *

Note: This is a functional interface as a primitive type specialization of + * {@link java.util.function.BiConsumer} for {@code int}. + */ + @FunctionalInterface + interface BitCountConsumer { + /** + * Performs this operation on the given {@code <index, count>} pair. + * + * @param index the bit index + * @param count the count at the specified bit index + */ + void accept(int index, int count); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java new file mode 100644 index 0000000000..a16cb0c917 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; + +/** + * Contains functions to convert {@code int} indices into Bloom filter bit positions and vice versa. + * + *

The functions view an array of longs as a collection of bitmaps each containing 64 bits. The bits are arranged + * in memory as a little-endian long value. This matches the requirements of the BitMapProducer interface.

+ * + * @since 4.5 + */ +public class BitMap { + /** A bit shift to apply to an integer to divide by 64 (2^6). */ + private static final int DIVIDE_BY_64 = 6; + + /** Do not instantiate. */ + private BitMap() { + } + + /** + * Calculates the number of bitmaps (longs) required for the numberOfBits parameter. + * + *

If the input is negative the behavior is not defined.

+ + * @param numberOfBits the number of bits to store in the array of bitmaps. + * @return the number of bitmaps necessary. + */ + public static int numberOfBitMaps(int numberOfBits) { + return numberOfBits == 0 ? 0 : ((numberOfBits - 1) >> DIVIDE_BY_64) + 1; + } + + /** + * Checks if the specified index bit is enabled in the array of bit bitmaps. + * + * If the bit specified by idx is not in the bitMap false is returned. + * + * @param bitMaps The array of bit maps. + * @param idx the index of the bit to locate. + * @return {@code true} if the bit is enabled, {@code false} otherwise. + */ + public static boolean contains(long[] bitMaps, int idx) { + return (idx >= 0 && getLongIndex(idx) < bitMaps.length && (bitMaps[getLongIndex(idx)] & getLongBit(idx)) != 0); + } + + /** + * Sets the bit in the bitmaps. + *

Performs range checking: the index must not be negative and must address a bit within the supplied bitmaps.

+ * + * @param bitMaps The array of bit maps. + * @param idx the index of the bit to set. + * @throws IndexOutOfBoundsException if idx specifies a bit not in the range being tracked. + */ + public static void set(long[] bitMaps, int idx) { + bitMaps[checkRange(bitMaps.length, idx)] |= getLongBit(idx); + } + + /** + * Checks that the index is not negative. + * + * @param bitIndex the bit index + * @throws IndexOutOfBoundsException if the index is negative + */ + public static void checkPositive(final int bitIndex) { + if (bitIndex < 0) { + throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); + } + } + + /** + * Checks that the bitIndex produces a value in the range of a collection. + * + * @param limit the number of bitmaps in a collection. + * @param bitIndex the bit index + * @return the index for the bitmap in the array. + * @throws IndexOutOfBoundsException if the index is negative or addresses a bitmap at or beyond {@code limit} + * @see #getLongIndex(int) + */ + public static int checkRange(final int limit, final int bitIndex) { + checkPositive(bitIndex); + int idx = getLongIndex(bitIndex); + if (limit <= idx) { + throw new IndexOutOfBoundsException("bitIndex to large: " + bitIndex); + } + return idx; + } + + /** + * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs + * to store bits starting at index 0. + * + *

The index is assumed to be positive. For a positive index the result will match + * {@code bitIndex / 64}.

+ * + *

The divide is performed using bit shifts. If the input is negative the behavior + * is not defined.

+ * + * @param bitIndex the bit index (assumed to be positive) + * @return the index of the BitMap in an array of BitMaps. + * @see #checkPositive(int) + */ + public static int getLongIndex(final int bitIndex) { + // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is + // positive. + // We do not explicitly check for a negative here. Instead we use a + // a signed shift. Any negative index will produce a negative value + // by sign-extension and if used as an index into an array it will throw an + // exception. + return bitIndex >> DIVIDE_BY_64; + } + + /** + * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit + * longs to store bits starting at index 0. The returned value is a {@code long} with only + * 1 bit set. + * + *

The index is assumed to be positive. For a positive index the result will match + * {@code 1L << (bitIndex % 64)}.

+ * + *

If the input is negative the behavior is not defined.

+ * + * @param bitIndex the bit index (assumed to be positive) + * @return the filter bit + * @see #checkPositive(int) + */ + public static long getLongBit(final int bitIndex) { + // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this + // using 0x3f (63) or compute bitIndex % 64. + // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and + // this will identify an incorrect bit. + return 1L << bitIndex; + } + + /** + * Determines if a cardinality is sparse based on the shape. + *

This method assumes that BitMaps are 64bits and indexes are 32bits. If the memory + * necessary to store the cardinality as indexes is less than the estimated memory for BitMaps, + * the cardinality is determined to be {@code sparse}.

+ * @param cardinality the cardinality to check. + * @param shape the Shape to check against + * @return true if the cardinality is sparse within the shape. + */ + public static boolean isSparse(int cardinality, Shape shape) { + /* + * Since the size of a BitMap is a long and the size of an index is an int, + * there can be 2 indexes for each bitmap. In Bloom filters indexes are evenly + * distributed across the range of possible values, Thus if the cardinality + * (number of indexes) is less than or equal to 2*number of BitMaps the + * cardinality is sparse within the shape. + */ + + Objects.requireNonNull(shape, "shape"); + return cardinality <= (numberOfBitMaps(shape.getNumberOfBits()) * 2); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java new file mode 100644 index 0000000000..478cce0928 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.Objects; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +/** + * Produces BitMap longs for a Bloom filter. + * + * Each bit map is a little-endian long value representing a block of bits of this filter. + * + *

The returned array will have length {@code ceil(m / 64)} where {@code m} is the + * number of bits in the filter and {@code ceil} is the ceiling function. + * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit + * index is enabled. + * + * The producer may stop at the last non zero BitMap or may produce zero value bit maps to the limit determined by + * a shape.. + * + * @since 4.5 + */ +public interface BitMapProducer { + + /** + * Each BitMap is passed to the consumer in order. + * Any exceptions thrown by the action are relayed to the caller. + * + * @param consumer the consumer of the BitMaps. + * @throws NullPointerException if the specified consumer is null + */ + void forEachBitMap(LongConsumer consumer); + + /** + * Creates a BitMapProducer from an array of Long. + * @param bitMaps the bitMaps to return. + * @return a BitMapProducer. + */ + static BitMapProducer fromLongArray(long[] bitMaps) { + return new BitMapProducer() { + + @Override + public void forEachBitMap(LongConsumer consumer) { + for (long word : bitMaps) { + consumer.accept(word); + } + } + + }; + } + + /** + * Creates a BitMapProducer from an IndexProducer. + * @param producer the IndexProducer that specifies the indexes of the bits to enable. + * @param shape the desired shape. + * @return A BitMapProducer that produces the BitMap equivalent of the Indices from the producer. + */ + static BitMapProducer fromIndexProducer(IndexProducer producer, Shape shape) { + Objects.requireNonNull(producer, "producer"); + Objects.requireNonNull(shape, "shape"); + + return new BitMapProducer() { + private int maxBucket = -1; + private long[] result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; + + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + /* + * we can not assume that all the ints will be in order and not repeated. This + * is because the HasherCollection does not make the guarantee. 
+ */ + // process all the ints into a array of BitMaps + IntConsumer builder = new IntConsumer() { + @Override + public void accept(int i) { + int bucketIdx = BitMap.getLongIndex(i); + maxBucket = maxBucket < bucketIdx ? bucketIdx : maxBucket; + result[bucketIdx] |= BitMap.getLongBit(i); + } + }; + producer.forEachIndex(builder); + // send the bitmaps to the consumer. + for (int bucket = 0; bucket <= maxBucket; bucket++) { + consumer.accept(result[bucket]); + } + } + }; + } + + /** + * A LongConsumer that builds an Array of BitMaps as produced by a BitMapProducer. + * + */ + class ArrayBuilder implements LongConsumer { + private long[] result; + private int idx = 0; + private int bucketCount = 0; + + /** + * Constructor that creates an empty ArrayBuilder. + * @param shape The shape used to generate the BitMaps. + */ + public ArrayBuilder(Shape shape) { + this(shape, null); + } + + /** + * Constructor that creates an array builder with an initial value. + * @param shape The shape used to generate the BitMaps. + * @param initialValue an array of BitMap values to initialize the builder with. May be {@code null}. + * @throws IllegalArgumentException is the length of initialValue is greater than the number of + * buckets as specified by the number of bits in the Shape. + */ + public ArrayBuilder(Shape shape, long[] initialValue) { + Objects.requireNonNull(shape, "shape"); + result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; + if (initialValue != null) { + if (initialValue.length > result.length) { + throw new IllegalArgumentException( + String.format("initialValue length (%s) is longer than shape length (%s)", + initialValue.length, result.length)); + } + bucketCount = initialValue.length; + System.arraycopy(initialValue, 0, result, 0, initialValue.length); + } + } + + @Override + public void accept(long bitmap) { + result[idx++] |= bitmap; + bucketCount = bucketCount >= idx ? bucketCount : idx; + } + + /** + * Returns the array. 
+ * @return the Array of BitMaps. + */ + public long[] getArray() { + return Arrays.copyOf(result, bucketCount); + } + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java deleted file mode 100644 index de55cbe93d..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.BitSet; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; - -/** - * A bloom filter using a Java BitSet to track enabled bits. This is a standard - * implementation and should work well for most Bloom filters. - * @since 4.5 - */ -public class BitSetBloomFilter extends AbstractBloomFilter { - - /** - * The bitSet that defines this BloomFilter. 
- */ - private final BitSet bitSet; - - /** - * Constructs an empty BitSetBloomFilter. - * - * @param shape the desired shape of the filter. - */ - public BitSetBloomFilter(final Shape shape) { - super(shape); - this.bitSet = new BitSet(); - } - - @Override - public int andCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.and(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.andCardinality(other); - } - - @Override - public int cardinality() { - return bitSet.cardinality(); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final OfInt iter = hasher.iterator(getShape()); - while (iter.hasNext()) { - if (!bitSet.get(iter.nextInt())) { - return false; - } - } - return true; - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bitSet.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - verifyShape(other); - if (other instanceof BitSetBloomFilter) { - bitSet.or(((BitSetBloomFilter) other).bitSet); - } else { - bitSet.or(BitSet.valueOf(other.getBits())); - } - return true; - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) bitSet::set); - return true; - } - - @Override - public int orCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.or(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.orCardinality(other); - } - - @Override - public int xorCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) 
bitSet.clone(); - result.xor(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.xorCardinality(other); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index af43ddd51e..1e8680a239 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -16,138 +16,258 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; /** * The interface that describes a Bloom filter. * @since 4.5 */ -public interface BloomFilter { +public interface BloomFilter extends IndexProducer, BitMapProducer { - // Query Operations + /** + * Return the Bloom filter data as a BitMap array. + * @param filter the filter to get the data from. + * @return An array of BitMap long. + */ + static long[] asBitMapArray(BloomFilter filter) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(filter.getShape()); + filter.forEachBitMap(builder); + return builder.getArray(); + } /** - * Gets the shape of this filter. - * - * @return the shape of this filter + * Return the Bloom filter data as an array of indices for the enabled bits. + * @param filter the Filter to get the data from. + * @return An array of indices for enabled bits in the Bloom filter. 
*/ - Shape getShape(); + static int[] asIndexArray(BloomFilter filter) { + List<Integer> lst = new ArrayList<>(); + filter.forEachIndex(lst::add); + return lst.stream().mapToInt(Integer::intValue).toArray(); + } + + // Query Operations /** - * Gets an array of little-endian long values representing the bits of this filter. + * This method is used to determine the best method for matching. * -

The returned array will have length {@code ceil(m / 64)} where {@code m} is the - * number of bits in the filter and {@code ceil} is the ceiling function. - * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit - * index is enabled. + *

For {@code sparse} implementations + * the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non-{@code sparse} implementations + * the {@code forEachBitMap(LongConsumer consumer)} method is more efficient. Implementers should determine whether it is easier + * for the implementation to produce indexes or BitMap blocks.

* - * @return the {@code long[]} representation of this filter + * @return {@code true} if the implementation is sparse {@code false} otherwise. + * @see BitMap */ - long[] getBits(); + boolean isSparse(); /** - * Creates a StaticHasher that contains the indexes of the bits that are on in this - * filter. - * - * @return a StaticHasher for that produces this Bloom filter + * Gets the shape that was used when the filter was built. + * @return The shape the filter was built with. */ - StaticHasher getHasher(); + Shape getShape(); /** - * Returns {@code true} if this filter contains the specified filter. Specifically this + * Returns {@code true} if this filter contains the specified filter. + * + *

Specifically this * returns {@code true} if this filter is enabled for all bits that are enabled in the * {@code other} filter. Using the bit representations this is - * effectively {@code (this AND other) == other}. + * effectively {@code (this AND other) == other}.

* * @param other the other Bloom filter - * @return true if this filter is enabled for all enabled bits in the other filter - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return true if all enabled bits in the other filter are enabled in this filter. */ - boolean contains(BloomFilter other); + default boolean contains(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return isSparse() ? contains((IndexProducer) other) : contains((BitMapProducer) other); + } /** - * Returns {@code true} if this filter contains the specified decomposed Bloom filter. - * Specifically this returns {@code true} if this filter is enabled for all bit indexes - * identified by the {@code hasher}. Using the bit representations this is - * effectively {@code (this AND hasher) == hasher}. + * Returns {@code true} if this filter contains the bits specified in the hasher. + * + *

Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code hasher}. Using the BitMap representations this is + * effectively {@code (this AND hasher) == hasher}.

* * @param hasher the hasher to provide the indexes * @return true if this filter is enabled for all bits specified by the hasher - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ - boolean contains(Hasher hasher); + default boolean contains(Hasher hasher) { + Objects.requireNonNull(hasher, "Hasher"); + Shape shape = getShape(); + return contains(hasher.indices(shape)); + } - // Modification Operations + /** + * Returns {@code true} if this filter contains the indices specified by the IndexProducer. + * + *

Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code IndexProducer}.

+ * + * @param indexProducer the IndexProducer to provide the indexes + * @return {@code true} if this filter is enabled for all bits specified by the IndexProducer + */ + boolean contains(IndexProducer indexProducer); + + /** + * Returns {@code true} if this filter contains the bits specified in the BitMaps produced by the + * bitMapProducer. + * + * @param bitMapProducer the {@code BitMapProducer} to provide the BitMaps. + * @return {@code true} if this filter is enabled for all bits specified by the BitMaps + */ + boolean contains(BitMapProducer bitMapProducer); /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes - * that are enabled in the {@code other} filter will be enabled in this filter. + * Merges the specified Bloom filter with this Bloom filter creating a new Bloom filter. * -

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the {@code other} Bloom filter. + *

Specifically all bit indexes that are enabled in the {@code other} filter or in {@code this} filter will be + * enabled in the resulting filter.

* * @param other the other Bloom filter + * @return The new Bloom filter. + */ + default BloomFilter merge(BloomFilter other) { + Objects.requireNonNull(other, "other"); + Shape shape = getShape(); + BloomFilter result = BitMap.isSparse((cardinality() + other.cardinality()), getShape()) + ? new SparseBloomFilter(shape) + : new SimpleBloomFilter(shape); + + result.mergeInPlace(this); + result.mergeInPlace(other); + return result; + } + + /** + * Merges the specified Hasher with this Bloom filter and returns a new Bloom filter. + * + *

Specifically all bit indexes that are identified by the {@code hasher} or enabled in {@code this} Bloom filter + * will be enabled in the resulting filter.

+ * + * @param hasher the hasher to provide the indices + * @return the new Bloom filter. + */ + default BloomFilter merge(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + Shape shape = getShape(); + BloomFilter result = BitMap.isSparse((hasher.size() * shape.getNumberOfHashFunctions()) + cardinality(), shape) + ? new SparseBloomFilter(shape, hasher) + : new SimpleBloomFilter(shape, hasher); + result.mergeInPlace(this); + return result; + } + + /** + * Merges the specified Bloom filter into this Bloom filter. + * + *

Specifically all + * bit indexes that are identified by the {@code other} will be enabled in this filter.

+ * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the {@code other} Bloom filter. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

+ * + * @param other The bloom filter to merge into this one. * @return true if the merge was successful - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter */ - boolean merge(BloomFilter other); + boolean mergeInPlace(BloomFilter other); /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all + * Merges the specified hasher into this Bloom filter. Specifically all * bit indexes that are identified by the {@code hasher} will be enabled in this filter. * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified decomposed Bloom filter. + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the indexes identified by the {@code hasher}. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

* - * @param hasher the hasher to provide the indexes + * @param hasher The hasher to merge. * @return true if the merge was successful - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ - boolean merge(Hasher hasher); + default boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + Shape shape = getShape(); + BloomFilter result = BitMap.isSparse((hasher.size() * shape.getNumberOfHashFunctions()) + cardinality(), shape) + ? new SparseBloomFilter(getShape(), hasher) + : new SimpleBloomFilter(getShape(), hasher); + return mergeInPlace(result); + } + + /** + * Determines if the bloom filter is "full". + * + *

Full is defined as having no unset bits.

+ * + * @return {@code true} if the filter is full, {@code false} otherwise. + */ + default boolean isFull() { + return cardinality() == getShape().getNumberOfBits(); + } // Counting Operations /** * Gets the cardinality (number of enabled bits) of this Bloom filter. * - *

This is also known as the Hamming value.

+ *

This is also known as the Hamming value or Hamming number.

* * @return the cardinality of this filter */ int cardinality(); /** - * Performs a logical "AND" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the Bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this AND other)} + *

By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the + * shape and cardinality of this filter.

+ * + *

An item is roughly equivalent to the number of Hashers that have been merged. As the Bloom filter + * is a probabilistic structure this value is an estimate.

+ * + * @return an estimate of the number of items in the bloom filter. + * @see Shape#estimateN(int) */ - int andCardinality(BloomFilter other); + default int estimateN() { + return (int) Math.round(getShape().estimateN(cardinality())); + } /** - * Performs a logical "OR" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the union of this Bloom filter with the other bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this OR other)} + *

By default this is the {@code estimateN()} of the merging of this filter with the {@code other} filter.

+ * + *

An item is roughly equivalent to the number of Hashers that have been merged. As the Bloom filter + * is a probabilistic structure this value is an estimate.

+ * + * @param other The other Bloom filter + * @return an estimate of the number of items in the union. + * @see #estimateN() */ - int orCardinality(BloomFilter other); + default int estimateUnion(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return this.merge(other).estimateN(); + } /** - * Performs a logical "XOR" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this XOR other)} + *

By default this is the {@code estimateN() + other.estimateN() - estimateUnion(other)}

+ * + *

An item is roughly equivalent to the number of Hashers that have been merged. As the Bloom filter + * is a probabilistic structure this value is an estimate.

+ * + * @param other The other Bloom filter + * @return an estimate of the number of items in the intersection. */ - int xorCardinality(BloomFilter other); + default int estimateIntersection(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return estimateN() + other.estimateN() - estimateUnion(other); + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java deleted file mode 100644 index fe9b1161a9..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -/** - * Contains functions to convert {@code int} indices into Bloom filter bit positions. - */ -public final class BloomFilterIndexer { - /** A bit shift to apply to an integer to divided by 64 (2^6). */ - private static final int DIVIDE_BY_64 = 6; - - /** Do not instantiate. */ - private BloomFilterIndexer() {} - - /** - * Check the index is positive. 
- * - * @param bitIndex the bit index - * @throws IndexOutOfBoundsException if the index is not positive - */ - public static void checkPositive(final int bitIndex) { - if (bitIndex < 0) { - throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); - } - } - - /** - * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs - * to store bits starting at index 0. - * - *

The index is assumed to be positive. For a positive index the result will match - * {@code bitIndex / 64}. - * - *

The divide is performed using bit shifts. If the input is negative the behavior - * is not defined. - * - * @param bitIndex the bit index (assumed to be positive) - * @return the filter index - * @see #checkPositive(int) - */ - public static int getLongIndex(final int bitIndex) { - // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. - // We do not explicitly check for a negative here. Instead we use a - // a signed shift. Any negative index will produce a negative value - // by sign-extension and if used as an index into an array it will throw an exception. - return bitIndex >> DIVIDE_BY_64; - } - - /** - * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit - * longs to store bits starting at index 0. The returned value is a {@code long} with only - * 1 bit set. - * - *

The index is assumed to be positive. For a positive index the result will match - * {@code 1L << (bitIndex % 64)}. - * - *

If the input is negative the behavior is not defined. - * - * @param bitIndex the bit index (assumed to be positive) - * @return the filter bit - * @see #checkPositive(int) - */ - public static long getLongBit(final int bitIndex) { - // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this - // using 0x3f (63) or compute bitIndex % 64. - // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and - // this will identify an incorrect bit. - return 1L << bitIndex; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 0c414ebe93..e8e8a7b242 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -27,7 +27,7 @@ * to and not later subtracted from the counting Bloom filter. The functional * state of a CountingBloomFilter at the start and end of a series of merge and * subsequent remove operations of the same Bloom filters, irrespective of - * remove order, is expected to be the same. + * remove order, is expected to be the same.

* *

Removal of a filter that has not previously been merged results in an * invalid state where the counts no longer represent a sum of merged Bloom @@ -36,166 +36,148 @@ * undetected. The CountingBloomFilter maintains a state flag that is used as a * warning that an operation was performed that resulted in invalid counts and * thus an invalid state. For example this may occur if a count for an index was - * set to negative following a remove operation. + * set to negative following a remove operation.

* *

Implementations should document the expected state of the filter after an * operation that generates invalid counts, and any potential recovery options. * An implementation may support a reversal of the operation to restore the * state to that prior to the operation. In the event that invalid counts are * adjusted to a valid range then it should be documented if there has been - * irreversible information loss. + * irreversible information loss.

* *

Implementations may choose to throw an exception during an operation that * generates invalid counts. Implementations should document the expected state * of the filter after such an operation. For example are the counts not updated, - * partially updated or updated entirely before the exception is raised. + * partially updated or updated entirely before the exception is raised.

* * @since 4.5 */ -public interface CountingBloomFilter extends BloomFilter { - - /** - * Represents an operation that accepts an {@code } pair representing - * the count for a bit index in a counting Bloom filter and returns no result. - * - *

Note: This is a functional interface as a primitive type specialization of - * {@link java.util.function.BiConsumer} for {@code int}. - */ - @FunctionalInterface - interface BitCountConsumer { - /** - * Performs this operation on the given {@code } pair. - * - * @param index the bit index - * @param count the count at the specified bit index - */ - void accept(int index, int count); - } +public interface CountingBloomFilter extends BloomFilter, BitCountProducer { // Query Operations /** - * Returns true if the internal state is valid. This flag is a warning that an addition or + * Returns {@code true} if the internal state is valid. + * + *

This flag is a warning that an addition or * subtraction of counts from this filter resulted in an invalid count for one or more * indexes. For example this may occur if a count for an index was * set to negative following a subtraction operation, or overflows an {@code int} following an - * addition operation. + * addition operation.

* *

A counting Bloom filter that has an invalid state is no longer ensured to function * identically to a standard Bloom filter instance that is the merge of all the Bloom filters - * that have been added to and not later subtracted from this counting Bloom filter. + * that have been added to and not later subtracted from this counting Bloom filter.

* *

Note: The change to an invalid state may or may not be reversible. Implementations * are expected to document their policy on recovery from an addition or removal operation - * that generated an invalid state. + * that generated an invalid state.

* - * @return true if the state is valid + * @return {@code true} if the state is valid */ boolean isValid(); - /** - * Performs the given action for each {@code } pair where the count is non-zero. - * Any exceptions thrown by the action are relayed to the caller. - * - * @param action the action to be performed for each non-zero bit count - * @throws NullPointerException if the specified action is null - */ - void forEachCount(BitCountConsumer action); - // Modification Operations /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all counts for - * indexes that are enabled in the {@code other} filter will be incremented by 1. + * Removes the specified Bloom filter from this Bloom filter. + * + *

Specifically: all counts for the indexes identified by the {@code other} filter will be decremented by 1,

* - *

Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used. + *

Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + * IndexProducer.

* - *

This method will return true if the filter is valid after the operation. + *

This method will return {@code true} if the filter is valid after the operation.

* - * @param other {@inheritDoc} - * @return true if the merge was successful and the state is valid - * @throws IllegalArgumentException {@inheritDoc} + * @param other the other Bloom filter + * @return {@code true} if the removal was successful and the state is valid * @see #isValid() + * @see #subtract(BitCountProducer) */ - @Override - boolean merge(BloomFilter other); + boolean remove(BloomFilter other); /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all - * counts for the distinct indexes that are identified by the {@code hasher} will - * be incremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. + * Removes the specified hasher from this Bloom filter. + * + *

Specifically all counts for the indices produced by the {@code hasher} will be + * decremented by 1.

* - *

This method will return true if the filter is valid after the operation. + *

For HasherCollections each enclosed Hasher will be considered a single item and decremented + * from the counts separately.

* - * @param hasher {@inheritDoc} - * @return true if the merge was successful and the state is valid - * @throws IllegalArgumentException {@inheritDoc} + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param hasher the hasher to provide the indexes + * @return {@code true} if the removal was successful and the state is valid * @see #isValid() + * @see #subtract(BitCountProducer) */ - @Override - boolean merge(Hasher hasher); + boolean remove(Hasher hasher); /** - * Removes the specified Bloom filter from this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented by 1. + * Adds the specified BitCountProducer to this Bloom filter. * - *

Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used. + *

Specifically + * all counts for the indexes identified by the {@code other} will be incremented + * by their corresponding values in the {@code other}.

* - *

This method will return true if the filter is valid after the operation. + *

This method will return {@code true} if the filter is valid after the operation.

* - * @param other the other Bloom filter - * @return true if the removal was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @param other the BitCountProducer to add. + * @return {@code true} if the addition was successful and the state is valid * @see #isValid() - * @see #subtract(CountingBloomFilter) + * @see #subtract(BitCountProducer) */ - boolean remove(BloomFilter other); + boolean add(BitCountProducer other); /** - * Removes the specified decomposed Bloom filter from this Bloom filter. Specifically - * all counts for the distinct indexes identified by the {@code hasher} will be - * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. + * Subtracts the specified BitCountProducer from this Bloom filter. * -

This method will return true if the filter is valid after the operation. + *

Specifically + * all counts for the indexes identified by the {@code other} will be decremented + * by their corresponding values in the {@code other}.

* - * @param hasher the hasher to provide the indexes - * @return true if the removal was successful and the state is valid - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param other the BitCountProducer to subtract. + * @return {@code true} if the subtraction was successful and the state is valid * @see #isValid() + * @see #add(BitCountProducer) */ - boolean remove(Hasher hasher); + boolean subtract(BitCountProducer other); /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be incremented - * by their corresponding counts in the {@code other} filter. + * Merges the specified Bloom filter into this Bloom filter to produce a new CountingBloomFilter. * - *

This method will return true if the filter is valid after the operation. + *

Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes that are enabled in the {@code other} filter will be incremented + * by one in the new filter.

* - * @param other the other counting Bloom filter - * @return true if the addition was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter - * @see #isValid() + *

Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.

+ * + * @param other the other Bloom filter + * @return A new CountingBloomFilter instance. */ - boolean add(CountingBloomFilter other); + @Override + CountingBloomFilter merge(BloomFilter other); /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented - * by their corresponding counts in the {@code other} filter. + * Merges the specified hasher with this Bloom filter to create a new CountingBloomFilter. * - *

This method will return true if the filter is valid after the operation. + *

Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes specified by the {@code hasher} will be incremented + * by one in the new filter.

* - * @param other the other counting Bloom filter - * @return true if the subtraction was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter - * @see #isValid() + *

For HasherCollections each enclosed Hasher will be considered a single item and increment + * the counts separately.

+ * + *

Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.

+ * + * @param hasher the hasher to provide the indexes + * @return A new CountingBloomFilter instance. */ - boolean subtract(CountingBloomFilter other); + @Override + CountingBloomFilter merge(Hasher hasher); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java deleted file mode 100644 index 71272e65c4..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.Arrays; -import java.util.Set; -import java.util.TreeSet; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; -import org.apache.commons.collections4.iterators.EmptyIterator; -import org.apache.commons.collections4.iterators.IteratorChain; - -/** - * A Bloom filter built on a single hasher. 
This filter type should only be used for small - * filters (few on bits). While this implementation correctly supports the merge() methods - * it is recommended that if merges are expected that one of the other Bloom filter - * implementations be used. - * @since 4.5 - */ -public class HasherBloomFilter extends AbstractBloomFilter { - /** The bit representation for an empty Bloom filter. */ - private static final long[] EMPTY = new long[0]; - - /** - * The internal hasher representation. - */ - private StaticHasher hasher; - - /** - * Constructs a HasherBloomFilter from a hasher and a shape. - * - * @param hasher the hasher to use. - * @param shape the shape of the Bloom filter. - */ - public HasherBloomFilter(final Hasher hasher, final Shape shape) { - super(shape); - verifyHasher(hasher); - if (hasher instanceof StaticHasher) { - this.hasher = (StaticHasher) hasher; - verifyShape(this.hasher.getShape()); - } else { - this.hasher = new StaticHasher(hasher, shape); - } - } - - /** - * Constructs an empty HasherBloomFilter from a shape. - * - * @param shape the shape of the Bloom filter. - */ - public HasherBloomFilter(final Shape shape) { - super(shape); - this.hasher = new StaticHasher(EmptyIterator.emptyIterator(), shape); - } - - @Override - public int cardinality() { - return hasher.size(); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final Set set = new TreeSet<>(); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) idx -> { - set.add(idx); - }); - final OfInt iter = this.hasher.iterator(getShape()); - while (iter.hasNext()) { - final int idx = iter.nextInt(); - set.remove(idx); - if (set.isEmpty()) { - return true; - } - } - return false; - } - - @Override - public long[] getBits() { - if (hasher.isEmpty()) { - return EMPTY; - } - - // Note: This can be simplified if the StaticHasher exposed a getMaxIndex() - // method. 
Since it maintains an ordered list of unique indices the maximum - // is the last value in the iterator. Knowing this value would allow - // exact allocation of the long[]. - // For now we assume that the long[] will have a positive length and at least - // 1 bit set in the entire array. - - final int n = (int) Math.ceil(hasher.getShape().getNumberOfBits() * (1.0 / Long.SIZE)); - final long[] result = new long[n]; - final OfInt iter = hasher.iterator(hasher.getShape()); - iter.forEachRemaining((IntConsumer) idx -> { - BloomFilterIndexer.checkPositive(idx); - final int buffIdx = BloomFilterIndexer.getLongIndex(idx); - final long buffOffset = BloomFilterIndexer.getLongBit(idx); - result[buffIdx] |= buffOffset; - }); - - int limit = result.length; - - // Assume the array has a non-zero length and at least 1 bit set. - // This is tested using assertions. - assert limit > 0 : "Number of bits in Shape is 0"; - while (result[limit - 1] == 0) { - limit--; - // If the hasher was not empty it is not possible to return - // an array of length zero. 
- assert limit > 0 : "Hasher reported a non-zero size but has no indices"; - } - if (limit < result.length) { - return Arrays.copyOf(result, limit); - } - return result; - } - - @Override - public StaticHasher getHasher() { - return hasher; - } - - @Override - public boolean merge(final BloomFilter other) { - return merge(other.getHasher()); - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - final IteratorChain iter = new IteratorChain<>(this.hasher.iterator(getShape()), - hasher.iterator(getShape())); - this.hasher = new StaticHasher(iter, getShape()); - return true; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java deleted file mode 100644 index e4adb4fc66..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -import java.util.Objects; -import java.util.Set; -import java.util.TreeSet; -import java.util.function.Consumer; -import java.util.function.IntConsumer; - -/** - * Contains functions to filter indexes. - */ -final class IndexFilters { - /** Do not instantiate. */ - private IndexFilters() { - } - - /** - * Transfer all distinct indexes in the specified {@code hasher} generated for the - * specified {@code shape} to the specified {@code consumer}. For example this - * can be used to merge a {@link Hasher} representation of a Bloom filter into a - * {@link BloomFilter} instance that does not naturally handle duplicate indexes. - * - *

This method is functionally equivalent to: - * - *

-     *     final Set<Integer> distinct = new TreeSet<>();
-     *     hasher.iterator(shape).forEachRemaining((Consumer<Integer>) i -> {
-     *         if (distinct.add(i)) {
-     *             consumer.accept(i);
-     *         }
-     *     });
-     * 
- * - * @param hasher the hasher - * @param shape the shape - * @param consumer the consumer to receive distinct indexes - * @throws NullPointerException if the hasher, shape or action are null - * @see Hasher#iterator(Shape) - */ - static void distinctIndexes(final Hasher hasher, final Shape shape, final IntConsumer consumer) { - Objects.requireNonNull(hasher, "hasher"); - Objects.requireNonNull(shape, "shape"); - Objects.requireNonNull(consumer, "consumer"); - - // TODO - // This function can be optimised based on the expected size - // (number of indexes) of the hasher and the number of bits in the shape. - // - // A large size would benefit from a pre-allocated BitSet-type filter. - // A very small size may be more efficient as a simple array of values - // that have already been seen that is scanned for each new index. - // - // A default is to use a Set to filter distinct values. The choice of set - // should be evaluated. A HashSet would be optimal if size is known. - // A TreeSet has lower memory consumption and performance is not as - // sensitive to knowing the size in advance. - - final Set distinct = new TreeSet<>(); - hasher.iterator(shape).forEachRemaining((Consumer) i -> { - if (distinct.add(i)) { - consumer.accept(i); - } - }); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java new file mode 100644 index 0000000000..a0caace3e4 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +/** + * An object that produces indices of a Bloom filter. + * + * @since 4.5 + */ +public interface IndexProducer { + + /** + * Each index is passed to the consumer. + *

Any exceptions thrown by the action are relayed to the caller.

+ * + *

The ordering of the indices is not guaranteed.

+ * + * @param consumer the action to be performed for each non-zero bit index. + * @throws NullPointerException if the specified action is null + */ + void forEachIndex(IntConsumer consumer); + + /** + * Creates an IndexProducer from a {@code BitMapProducer}. + * @param producer the {@code BitMapProducer} + * @return a new {@code IndexProducer}. + */ + static IndexProducer fromBitMapProducer(BitMapProducer producer) { + Objects.requireNonNull(producer, "producer"); + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + LongConsumer longConsumer = new LongConsumer() { + int wordIdx = 0; + + @Override + public void accept(long word) { + for (int i = 0; i < 64; i++) { + long mask = 1L << i; + if ((word & mask) == mask) { + consumer.accept((wordIdx * 64) + i); + } + } + wordIdx++; + } + }; + producer.forEachBitMap(longConsumer::accept); + } + + }; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java index 48c43620ad..d82548cd99 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -16,14 +16,123 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import java.util.function.LongBinaryOperator; +import java.util.function.LongConsumer; +import java.util.function.LongUnaryOperator; /** * Implementations of set operations on Bloom filters. * + * @since 4.5 */ public final class SetOperations { + /** + * A helper class that calculates cardinality as the cardinality of the result of an operation on two BitMap arrays. + * + *

The first array is built in the constructor. The second array is processed as a LongConsumer. Whenever there are + * two values the op2 operation is used. Whenever one array is longer than the other the op1 operation is used on the + * bitMaps that do not have matching entries.

+ * + *

The calculated cardinalities are summed to return the cardinality of the operation.

+ * + */ + private static class CardCounter implements LongConsumer { + /** + * The calculated cardinality + */ + private int cardinality = 0; + /** + * The index into the array of BitMaps + */ + private int idx = 0; + /** + * The array of BitMaps + */ + private long[] bitMaps; + /** + * The operator to execute for 2 BitMaps + */ + private LongBinaryOperator op2; + /** + * The operator to execute for a single BitMap; + */ + private LongUnaryOperator op1; + + /** + * Constructor. + * @param producer The producer of the BitMaps for the first Bloom filter + * @param op2 The operation to execute when there are two BitMaps to compare. + * @param op1 The operation to execute when there is only one BitMap to compare. + */ + CardCounter(BitMapProducer producer, Shape shape, LongBinaryOperator op2, LongUnaryOperator op1) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); + producer.forEachBitMap(builder); + this.bitMaps = builder.getArray(); + this.op2 = op2; + this.op1 = op1; + } + + @Override + public void accept(long bitMap) { + if (idx < bitMaps.length) { + cardinality += Long.bitCount(op2.applyAsLong(bitMaps[idx++], bitMap)); + } else { + cardinality += Long.bitCount(op1.applyAsLong(bitMap)); + } + } + + /** + * Gets the cardinality value. + * @return The accumulated cardinality. + */ + int getCardinality() { + for (; idx < bitMaps.length; idx++) { + cardinality += Long.bitCount(op1.applyAsLong(bitMaps[idx])); + } + return cardinality; + } + } + + /** + * Calculates the cardinality of the logical {@code AND} of the BitMaps for the two filters. + * @param shape the shape of the filter + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code AND} of the filters.
+ */ + public static int andCardinality(final Shape shape, final BitMapProducer first, final BitMapProducer second) { + CardCounter lc = new CardCounter(first, shape, (x, y) -> x & y, (x) -> 0); + second.forEachBitMap(lc); + return lc.getCardinality(); + } + + /** + * Calculates the cardinality of the logical {@code OR} of the BitMaps for the two filters. + * @param shape the shape of the filter + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code OR} of the filters. + */ + public static int orCardinality(final Shape shape, final BitMapProducer first, final BitMapProducer second) { + CardCounter lc = new CardCounter(first, shape, (x, y) -> x | y, (x) -> x); + second.forEachBitMap(lc); + return lc.getCardinality(); + } + + /** + * Calculates the cardinality of the logical {@code XOR} of the BitMaps for the two filters. + * @param shape the shape of the filter + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code XOR} of the filters. + */ + public static int xorCardinality(final Shape shape, final BitMapProducer first, final BitMapProducer second) { + CardCounter lc = new CardCounter(first, shape, (x, y) -> x ^ y, (x) -> x); + second.forEachBitMap(lc); + return lc.getCardinality(); + } + /** * Calculates the Cosine distance between two Bloom filters. * @@ -49,57 +158,10 @@ public static double cosineDistance(final BloomFilter first, final BloomFilter s * @return the Cosine similarity. */ public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int numerator = first.andCardinality(second); + final int numerator = andCardinality(first.getShape(), first, second); return numerator == 0 ? 
0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); } - /** - * Estimates the number of items in the intersection of the sets represented by two - * Bloom filters. - * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return an estimate of the size of the intersection between the two filters. - */ - public static long estimateIntersectionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - // do subtraction early to avoid Long overflow. - return estimateSize(first) - estimateUnionSize(first, second) + estimateSize(second); - } - - /** - * Estimates the number of items in the Bloom filter based on the shape and the number - * of bits that are enabled. - * - * @param filter the Bloom filter to estimate size for. - * @return an estimate of the number of items that were placed in the Bloom filter. - */ - public static long estimateSize(final BloomFilter filter) { - final Shape shape = filter.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - filter.cardinality() * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); - } - - /** - * Estimates the number of items in the union of the sets represented by two - * Bloom filters. - * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return an estimate of the size of the union between the two filters. - */ - public static long estimateUnionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final Shape shape = first.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - first.orCardinality(second) * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); - } - /** * Calculates the Hamming distance between two Bloom filters. 
* @@ -108,8 +170,7 @@ public static long estimateUnionSize(final BloomFilter first, final BloomFilter * @return the Hamming distance. */ public static int hammingDistance(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - return first.xorCardinality(second); + return xorCardinality(first.getShape(), first, second); } /** @@ -135,28 +196,14 @@ public static double jaccardDistance(final BloomFilter first, final BloomFilter * @return the Jaccard similarity. */ public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int orCard = first.orCardinality(second); + final int orCard = orCardinality(first.getShape(), first, second); // if the orCard is zero then the hamming distance will also be zero. return orCard == 0 ? 0 : hammingDistance(first, second) / (double) orCard; } - /** - * Verifies the Bloom filters have the same shape. - * - * @param first the first filter to check. - * @param second the second filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - private static void verifyShape(final BloomFilter first, final BloomFilter second) { - if (!first.getShape().equals(second.getShape())) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", - first.getShape(), second.getShape())); - } - } - /** * Do not instantiate. */ - private SetOperations() {} + private SetOperations() { + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java new file mode 100644 index 0000000000..fcaa971da8 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -0,0 +1,467 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; + +/** + * The definition of a Bloom filter shape. + * + *

This class contains the values for the filter configuration and is used to + * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are + * compatible. (i.e. can be compared or merged)

+ * + *

Interrelatedness of values

+ * + *
Number of Items ({@code n})
+ *
{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}
Probability of + * False Positives ({@code p})
{@code p = pow(1 - exp(-k / (m / n)), k)}
Number + * of Bits ({@code m})
+ *
{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
Number of + * Functions ({@code k})
{@code k = round((m / n) * ln(2))}
+ * + * @see Bloom Filter calculator + * @see Bloom filter + * [Wikipedia] + * @since 4.5 + */ +public final class Shape implements Comparable { + + /** + * Number of hash functions to create a filter ({@code k}). + */ + private final int numberOfHashFunctions; + + /** + * Number of bits in the filter ({@code m}). + */ + private final int numberOfBits; + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * bits ({@code m}). + * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. + * @param numberOfBits The number of bits in the filter + * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} + */ + public Shape(final int numberOfHashFunctions, final int numberOfBits) { + this.numberOfBits = checkNumberOfBits(numberOfBits); + this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); + } + + /** + * Check number of bits is strictly positive. + * + * @param numberOfBits the number of bits + * @return the number of bits + * @throws IllegalArgumentException if the number of bits is {@code < 1} + */ + private static int checkNumberOfBits(final int numberOfBits) { + if (numberOfBits < 1) { + throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); + } + return numberOfBits; + } + + /** + * Check number of hash functions is strictly positive + * + * @param numberOfHashFunctions the number of hash functions + * @return the number of hash functions + * @throws IllegalArgumentException if the number of hash functions is {@code < 1} + */ + private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException( + "Number of hash functions must be greater than 0: " + numberOfHashFunctions); + } + return numberOfHashFunctions; + } + + @Override + public int compareTo(Shape other) { + int i = Integer.compare(numberOfBits, other.numberOfBits); + return i == 0 ? Integer.compare(numberOfHashFunctions, other.numberOfHashFunctions) : i; + } + + @Override + public boolean equals(final Object o) { + return (o instanceof Shape) ? 
compareTo((Shape) o) == 0 : false; + } + + @Override + public int hashCode() { + return Objects.hash(numberOfBits, numberOfHashFunctions); + } + + /** + * Gets the number of bits in the Bloom filter. + * This is also known as {@code m}. + * + * @return the number of bits in the Bloom filter ({@code m}). + */ + public int getNumberOfBits() { + return numberOfBits; + } + + /** + * Gets the number of hash functions used to construct the filter. + * This is also known as {@code k}. + * + * @return the number of hash functions used to construct the filter ({@code k}). + */ + public int getNumberOfHashFunctions() { + return numberOfHashFunctions; + } + + /** + * Calculates the probability of false positives ({@code p}) given + * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). + *

p = pow(1 - exp(-k / (m / n)), k)
+ * + *

This is the probability that a Bloom filter will return true for the presence of an item + * when it does not contain the item.

+ * + *

The probability assumes that the Bloom filter is filled with the expected number of + * items. If the filter contains fewer items then the actual probability will be lower. + * Thus, this returns the worst-case false positive probability for a filter that has not + * exceeded its expected number of items.

+ * + * @param numberOfItems the number of items hashed into the Bloom filter. + * @return the probability of false positives. + */ + public double getProbability(int numberOfItems) { + if (numberOfItems < 0) { + throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); + } + if (numberOfItems == 0) { + return 0; + } + return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), + numberOfHashFunctions); + } + + @Override + public String toString() { + return String.format("Shape[ m=%s k=%s ]", numberOfBits, numberOfHashFunctions); + } + + /** + * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. + * + *

Note:

+ *
    + *
  • if hammingValue == numberOfBits, then result is infinity.
  • + *
  • if hammingValue > numberOfBits, then result is NaN.
  • + *
+ * + * @param hammingValue the number of enabled bits. + * @return An estimate of the number of items in the Bloom filter. + */ + public double estimateN(int hammingValue) { + double c = hammingValue; + double m = numberOfBits; + double k = numberOfHashFunctions; + return -(m / k) * Math.log(1.0 - (c / m)); + } + + /** + * The factory to assist in the creation of proper Shapes. + * + * In the methods of this factory the `from` names are appended with the standard variable + * names in the order expected: + * + *
+ *
{@code N}
The number of items to be placed in the Bloom filter
+ *
{@code M}
The number of bits in the Bloom filter
+ *
{@code K}
The number of hash functions for each item placed in the Bloom filter
+ *
{@code P}
The probability of a collision once N items have been placed in the Bloom filter
+ *
+ */ + public static class Factory { + + /** + * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. + */ + private static final double LN_2 = Math.log(2.0); + + /** + * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. + * + *

ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) + */ + private static final double DENOMINATOR = -LN_2 * LN_2; + + /** + * Do not instantiate. + */ + private Factory() { + + } + + /** + * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the + * specified number of bits ({@code m}) and hash functions ({@code k}). + * + *

The number of items ({@code n}) to be stored in the filter is computed. + *

n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
+ * + *

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated Bloom filter capacity + * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @param numberOfBits The number of bits in the filter + * @param numberOfHashFunctions The number of hash functions in the filter + * @return a valid Shape. + * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, + * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual + * probability is {@code >= 1.0} + */ + public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { + checkProbability(probability); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); + + // Number of items (n): + // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) + final double n = Math.ceil(numberOfBits + / (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); + + // log of probability is always < 0 + // number of hash functions is >= 1 + // e^x where x < 0 = [0,1) + // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 + // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 + // ceil( >0 ) >= 1 + // so we can not produce a negative value thus we don't check for it. + // + // similarly we can not produce a number greater than numberOfBits so we + // do not have to check for Integer.MAX_VALUE either. 
+ + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); + // check that probability is within range + checkCalculatedProbability(shape.getProbability((int) n)); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * desired false-positive probability ({@code p}). + * + *

The number of bits ({@code m}) for the filter is computed. + *

m = ceil(n * ln(p) / ln(1 / 2^ln(2)))
+ * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated number of bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @return a valid Shape + * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability + * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. + */ + public static Shape fromNP(final int numberOfItems, final double probability) { + checkNumberOfItems(numberOfItems); + checkProbability(probability); + + // Number of bits (m) + final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); + if (m > Integer.MAX_VALUE) { + throw new IllegalArgumentException( + "Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); + } + int numberOfBits = (int) m; + + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); + // check that probability is within range + checkCalculatedProbability(shape.getProbability(numberOfItems)); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * bits ({@code m}). + * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param numberOfBits The number of bits in the filter + * @return a valid Shape. + * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, + * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} + */ + public static Shape fromNM(final int numberOfItems, final int numberOfBits) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); + // check that probability is within range + checkCalculatedProbability(shape.getProbability(numberOfItems)); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items, bits + * and hash functions. + * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param numberOfBits The number of bits in the filter. + * @param numberOfHashFunctions The number of hash functions in the filter + * @return a valid Shape. + * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, + * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. + */ + public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); + // check that probability is within range + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); + // check that probability is within range + checkCalculatedProbability(shape.getProbability(numberOfItems)); + return shape; + } + + /** + * Check number of items is strictly positive. + * + * @param numberOfItems the number of items + * @return the number of items + * @throws IllegalArgumentException if the number of items is {@code < 1}. + */ + private static int checkNumberOfItems(final int numberOfItems) { + if (numberOfItems < 1) { + throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); + } + return numberOfItems; + } + + /** + * Check number of bits is strictly positive. + * + * @param numberOfBits the number of bits + * @return the number of bits + * @throws IllegalArgumentException if the number of bits is {@code < 1}. 
+ */ + private static int checkNumberOfBits(final int numberOfBits) { + if (numberOfBits < 1) { + throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); + } + return numberOfBits; + } + + /** + * Check number of hash functions is strictly positive + * + * @param numberOfHashFunctions the number of hash functions + * @return the number of hash functions + * @throws IllegalArgumentException if the number of hash functions is {@code < 1}. + */ + private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException( + "Number of hash functions must be greater than 0: " + numberOfHashFunctions); + } + return numberOfHashFunctions; + } + + /** + * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. + * + * @param probability the probability + * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} + */ + private static void checkProbability(final double probability) { + // Using the negation of within the desired range will catch NaN + if (!(probability > 0.0 && probability < 1.0)) { + throw new IllegalArgumentException( + "Probability must be greater than 0 and less than 1: " + probability); + } + } + + /** + * Check the calculated probability is {@code < 1.0}. + * + *

This function is used to verify that the dynamically calculated probability for the + * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon + * construction. + * + * @param probability the probability + * @throws IllegalArgumentException if the probability is {@code >= 1.0}. + */ + private static void checkCalculatedProbability(final double probability) { + // We do not need to check for p <= 0.0 since we only allow positive values for + // parameters and the closest we can come to exp(-kn/m) == 1 is + // exp(-1/Integer.MAX_VALUE) approx 0.9999999995343387 so Math.pow( x, y ) will + // always be called with 0 < x < 1 and y > 0 + if (probability >= 1.0) { + throw new IllegalArgumentException( + "Calculated probability is greater than or equal to 1: " + probability); + } + } + + /** + * Calculates the number of hash functions given numberOfItems and numberOfBits. + * This is a method so that the calculation is consistent across all constructors. + * + * @param numberOfItems the number of items in the filter. + * @param numberOfBits the number of bits in the filter. + * @return the optimal number of hash functions. + * @throws IllegalArgumentException if the calculated number of hash functions is {@code < 1} + */ + private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { + // k = round((m / n) * ln(2)) We change order so that we use real math rather + // than integer math. + final long k = Math.round(LN_2 * numberOfBits / numberOfItems); + if (k < 1) { + throw new IllegalArgumentException( + String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); + } + // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but + // since numberOfBits is at most Integer.MAX_VALUE the numerator of + // numberOfHashFunctions is at most ln(2) * Integer.MAX_VALUE (approx 1.49e9) so the + // value of k can not be above Integer.MAX_VALUE. 
+ return (int) k; + } + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java new file mode 100644 index 0000000000..0cb733e0bf --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * A bloom filter using an array of BitMaps to track enabled bits. This is a standard + * implementation and should work well for most Bloom filters. + * @since 4.5 + */ +public class SimpleBloomFilter implements BloomFilter { + + /** + * The array of BitMap longs that defines this Bloom filter. + */ + private long[] bitMap; + + /** + * The Shape of this Bloom filter + */ + private final Shape shape; + + /** + * The cardinality of this Bloom filter. 
+ */ + private int cardinality; + + /** + * Constructs an empty SimpleBloomFilter. + * + * @param shape The shape for the filter. + */ + public SimpleBloomFilter(Shape shape) { + Objects.requireNonNull(shape, "shape"); + this.shape = shape; + this.bitMap = new long[0]; + this.cardinality = 0; + } + + /** + * Constructor. + * @param shape The shape for the filter. + * @param hasher the Hasher to initialize the filter with. + */ + public SimpleBloomFilter(final Shape shape, Hasher hasher) { + Objects.requireNonNull(shape, "shape"); + Objects.requireNonNull(hasher, "hasher"); + this.shape = shape; + this.bitMap = new long[0]; + mergeInPlace(hasher); + } + + /** + * Constructor. + * @param shape The shape for the filter. + * @param producer the BitMap Producer to initialize the filter with. + * @throws IllegalArgumentException if the producer returns too many bit maps. + */ + public SimpleBloomFilter(final Shape shape, BitMapProducer producer) { + Objects.requireNonNull(shape, "shape"); + Objects.requireNonNull(producer, "producer"); + this.shape = shape; + + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); + try { + producer.forEachBitMap(builder); + this.bitMap = builder.getArray(); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( String.format("BitMapProducer should only send %s maps", + BitMap.numberOfBitMaps( shape.getNumberOfBits())), e); + } + this.cardinality = -1; + } + + @Override + public boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + Shape shape = getShape(); + + hasher.indices(shape).forEachIndex(idx -> { + int lidx = BitMap.getLongIndex(idx); + if (bitMap.length <= lidx) { + long[] newMap = new long[lidx + 1]; + System.arraycopy(bitMap, 0, newMap, 0, bitMap.length); + bitMap = newMap; + } + BitMap.set(bitMap, idx); + }); + this.cardinality = -1; + return true; + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + Objects.requireNonNull(other, 
"other"); + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape, this.bitMap); + other.forEachBitMap(builder); + this.bitMap = builder.getArray(); + this.cardinality = -1; + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean isSparse() { + return false; + } + + @Override + public int cardinality() { + if (this.cardinality == -1) { + synchronized (this) { + if (this.cardinality == -1) { + this.cardinality = 0; + forEachBitMap(w -> this.cardinality += Long.bitCount(w)); + } + } + } + return this.cardinality; + } + + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + IndexProducer.fromBitMapProducer(this).forEachIndex(consumer); + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + for (long l : bitMap) { + consumer.accept(l); + } + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex(idx -> { + if (!BitMap.contains(bitMap, idx)) { + throw new NoMatchException(); + } + }); + return true; + } catch (NoMatchException e) { + return false; + } + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + LongConsumer consumer = new LongConsumer() { + int i = 0; + + @Override + public void accept(long w) { + if (((i < bitMap.length ? bitMap[i++] : 0L) & w) != w) { // bitMap grows lazily: words past its end count as zero + throw new NoMatchException(); + } + } + }; + try { + bitMapProducer.forEachBitMap(consumer); + return true; + } catch (NoMatchException e) { + return false; + } + + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java new file mode 100644 index 0000000000..92ea2a2be2 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.List; +import java.util.Objects; +import java.util.TreeSet; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * A bloom filter using a TreeSet of integers to track enabled bits. This is a standard + * implementation and should work well for most low cardinality Bloom filters. + * @since 4.5 + */ +public class SparseBloomFilter implements BloomFilter { + + /** + * The sorted set of enabled bit indices that defines this BloomFilter. + */ + private final TreeSet<Integer> indices; + + /** + * The shape of this BloomFilter + */ + private final Shape shape; + + /** + * Constructs an empty SparseBloomFilter. + * + * @param shape The shape of the filter. + */ + public SparseBloomFilter(Shape shape) { + Objects.requireNonNull(shape, "shape"); + this.shape = shape; + this.indices = new TreeSet<>(); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape for the bloom filter. + * @param hasher the hasher to provide the initial data. 
+ */ + public SparseBloomFilter(final Shape shape, Hasher hasher) { + this(shape); + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this.indices::add); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param indices a list of indices to enable. + * @throws IllegalArgumentException if indices contains a negative value or a value greater than or equal to the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, List<Integer> indices) { + this(shape); + Objects.requireNonNull(indices, "indices"); + this.indices.addAll(indices); + if (! this.indices.isEmpty()) { + if (this.indices.last() >= shape.getNumberOfBits()) { + throw new IllegalArgumentException(String.format("Value in list %s is greater than maximum value (%s)", + this.indices.last(), shape.getNumberOfBits() - 1)); + } + if (this.indices.first() < 0 ) { + throw new IllegalArgumentException(String.format("Value in list %s is less than 0", + this.indices.first())); + } + } + } + + @Override + public boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this.indices::add); + return true; + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + Objects.requireNonNull(other, "other"); + other.forEachIndex(indices::add); + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean isSparse() { + return true; + } + + @Override + public int cardinality() { + return indices.size(); + } + + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + for (int value : indices) { + consumer.accept(value); + } + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + if (cardinality() == 0) { + return; + } + // because our indices are always in order we can + // shorten the time necessary to create the longs for the + // 
consumer + long bitMap = 0; + int idx = 0; + for (int i : indices) { + while (BitMap.getLongIndex(i) != idx) { + consumer.accept(bitMap); + bitMap = 0; + idx++; + } + bitMap |= BitMap.getLongBit(i); + } + if (bitMap != 0) { + consumer.accept(bitMap); + } + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex(idx -> { + if (!indices.contains(idx)) { + throw new NoMatchException(); + } + }); + return true; + } catch (NoMatchException e) { + return false; + } + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains(IndexProducer.fromBitMapProducer(bitMapProducer)); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java similarity index 55% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java rename to src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java index 9a2078d80c..b0efff37f4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java @@ -14,24 +14,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +package org.apache.commons.collections4.bloomfilter.exceptions; /** - * Tests for the {@link BitSetBloomFilter}. + * An exception to short circuit Bloom filter match functionality using producers. 
+ * + * @since 4.5 */ -public class BitSetBloomFilterTest extends AbstractBloomFilterTest { - @Override - protected BitSetBloomFilter createEmptyFilter(final Shape shape) { - return new BitSetBloomFilter(shape); - } +public class NoMatchException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = 1L; - @Override - protected BitSetBloomFilter createFilter(final Hasher hasher, final Shape shape) { - final BitSetBloomFilter testFilter = new BitSetBloomFilter(shape); - testFilter.merge( hasher ); - return testFilter; + /** + * Constructor. + */ + public NoMatchException() { } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java similarity index 77% rename from src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java rename to src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java index 95951ad7fe..4c00ea13e4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java @@ -14,11 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - /** - * Provides implementations of the Bloom filter - * {@link org.apache.commons.collections4.bloomfilter.hasher.HashFunction HashFunction} interface. - * - * @since 4.5 + * Exceptions specific to Bloom filter processing. 
*/ -package org.apache.commons.collections4.bloomfilter.hasher.function; +package org.apache.commons.collections4.bloomfilter.exceptions; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java deleted file mode 100644 index ab6b773d6c..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; - -/** - * The class that performs hashing on demand. - * @since 4.5 - */ -public class DynamicHasher implements Hasher { - - /** - * The builder for DynamicHashers. - * @since 4.5 - */ - public static class Builder implements Hasher.Builder { - - /** - * The list of items (each as a byte[]) that are to be hashed. - */ - private final List buffers; - - /** - * The function that the resulting DynamicHasher will use. 
- */ - private final HashFunction function; - - /** - * Constructs a DynamicHasher builder. - * - * @param function the function implementation. - */ - public Builder(final HashFunction function) { - this.function = function; - this.buffers = new ArrayList<>(); - } - - @Override - public DynamicHasher build() throws IllegalArgumentException { - // Assumes the hasher will create a copy of the buffers - final DynamicHasher hasher = new DynamicHasher(function, buffers); - // Reset for further use - buffers.clear(); - return hasher; - } - - @Override - public final DynamicHasher.Builder with(final byte[] property) { - buffers.add(property); - return this; - } - - @Override - public DynamicHasher.Builder with(final CharSequence item, final Charset charset) { - Hasher.Builder.super.with(item, charset); - return this; - } - - @Override - public DynamicHasher.Builder withUnencoded(final CharSequence item) { - Hasher.Builder.super.withUnencoded(item); - return this; - } - } - - /** - * The iterator of integers. - * - *

This assumes that the list of buffers is not empty. - */ - private class Iterator implements PrimitiveIterator.OfInt { - /** The number of hash functions per item. */ - private final int k; - /** The number of bits in the shape. */ - private final int m; - /** The current item. */ - private byte[] item; - /** The index of the next item. */ - private int nextItem; - /** The count of hash functions for the current item. */ - private int functionCount; - - /** - * Constructs iterator with the specified shape. - * - * @param shape - */ - private Iterator(final Shape shape) { - // Assumes that shape returns non-zero positive values for hash functions and bits - k = shape.getNumberOfHashFunctions(); - m = shape.getNumberOfBits(); - // Assume non-empty - item = buffers.get(0); - nextItem = 1; - } - - @Override - public boolean hasNext() { - if (functionCount != k) { - return true; - } - // Reached the number of hash functions for the current item. - // Try and advance to the next item. - if (nextItem != buffers.size()) { - item = buffers.get(nextItem++); - functionCount = 0; - return true; - } - // Finished. - // functionCount == shape.getNumberOfHashFunctions() - // nextItem == buffers.size() - return false; - } - - @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. - @Override - public int nextInt() { - if (hasNext()) { - return (int) Math.floorMod(function.apply(item, functionCount++), - // Cast to long to workaround a bug in animal-sniffer. - (long) m); - } - throw new NoSuchElementException(); - } - } - - /** - * An iterator of integers to use when there are no values. - */ - private static class NoValuesIterator implements PrimitiveIterator.OfInt { - /** The singleton instance. */ - private static final NoValuesIterator INSTANCE = new NoValuesIterator(); - - /** - * Empty constructor. 
- */ - private NoValuesIterator() {} - - @Override - public boolean hasNext() { - return false; - } - - @Override - public int nextInt() { - throw new NoSuchElementException(); - } - } - - /** - * The list of byte arrays that are to be hashed. - * Package private for access by the iterator. - */ - final List buffers; - - /** - * The function to hash the buffers. - * Package private for access by the iterator. - */ - final HashFunction function; - - /** - * Constructs a DynamicHasher. - * - * @param function the function to use. - * @param buffers the byte buffers that will be hashed. - */ - public DynamicHasher(final HashFunction function, final List buffers) { - this.buffers = new ArrayList<>(buffers); - this.function = function; - } - - @Override - public PrimitiveIterator.OfInt iterator(final Shape shape) { - HashFunctionValidator.checkAreEqual(getHashFunctionIdentity(), - shape.getHashFunctionIdentity()); - // Use optimised iterator for no values - return buffers.isEmpty() ? NoValuesIterator.INSTANCE : new Iterator(shape); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return function; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java deleted file mode 100644 index 0ff2edb8d4..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.nio.charset.StandardCharsets; -import java.util.Locale; - -/** - * Defines the hash function used by a {@link Hasher}. - * - * @since 4.5 - */ -public interface HashFunctionIdentity { - - /** - * Identifies the process type of this function. - * - *

- *
Iterative processes
- *
Call the underlying hash algorithm for each (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)}.
- *
Cyclic processes
- *
Call the underlying hash algorithm using a (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)} to initialize the state. Subsequent - * calls can generate hash values without calling the underlying algorithm.
- *
- */ - enum ProcessType { - /** - * Call the underlying hash algorithm for a (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)} when the state is uninitialized or - * the seed is zero. This initializes the state. Subsequent calls with a non-zero - * seed use the state to generate a new value. - */ - CYCLIC, - /** - * Call the underlying hash algorithm for each (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)}. - */ - ITERATIVE - } - - /** - * Identifies the signedness of the calculations for this function. - *

- * When the hash function executes it typically returns an array of bytes. - * That array is converted into one or more numerical values which will be provided - * as a {@code long} primitive type. - * The signedness identifies if those {@code long} values are signed or unsigned. - * For example a hash function that outputs only 32-bits can be unsigned if converted - * using {@link Integer#toUnsignedLong(int)}. A hash function that outputs more than - * 64-bits is typically signed. - *

- */ - enum Signedness { - /** - * The result of {@link HashFunction#apply(byte[], int)} is signed, - * thus the sign bit may be set. - * - *

- * The result can be used with {@code Math.floorMod(x, y)} to generate a positive - * value if y is positive. - *

- * - * @see Math#floorMod(int, int) - */ - SIGNED, - /** - * The result of {@link HashFunction#apply(byte[], int)} is unsigned, - * thus the sign bit is never set. - * - *

- * The result can be used with {@code x % y} to generate a positive - * value if y is positive. - *

- */ - UNSIGNED - } - - /** - * Gets a common formatted string for general display. - * - * @param identity the identity to format. - * @return the String representing the identity. - */ - static String asCommonString(final HashFunctionIdentity identity) { - return String.format("%s-%s-%s", identity.getName(), identity.getSignedness(), identity.getProcessType()); - } - - /** - * Gets a {@code byte[]} buffer for a HashFunctionIdentity to create a signature. The - * {@code byte[]} is composed using properties of the hash function as: - * - *

-     * String.format("%s-%s-%s",
-     *               getName().toUpperCase(Locale.ROOT), getSignedness(), getProcessType())
-     *       .getBytes("UTF-8");
-     * 
- * - * @param identity The HashFunctionIdentity to create the buffer for. - * @return the signature buffer for the identity - * @see #getSignature() - */ - static byte[] prepareSignatureBuffer(final HashFunctionIdentity identity) { - return String.format("%s-%s-%s", - identity.getName().toUpperCase(Locale.ROOT), identity.getSignedness(), - identity.getProcessType()).getBytes(StandardCharsets.UTF_8); - } - - /** - * Gets the name of this hash function. - *

- * Hash function should be the common name - * for the hash. This may include indications as to hash length - *

- *

- * Names are not case specific. Thus, "MD5" and "md5" should be considered as the same. - *

- * @return the Hash name - */ - String getName(); - - /** - * Gets the process type of this function. - * - * @return process type of this function. - */ - ProcessType getProcessType(); - - /** - * Gets the name of the provider of this hash function implementation. - *

- * Provider names are not case specific. Thus, "Apache Commons Collection" and - * "apache commons collection" should be considered as the same. - *

- * @return the name of the provider of this hash implementation. - */ - String getProvider(); - - /** - * Gets the signature of this function. The signature is the output of the hash function - * when applied to a set of bytes composed using properties of the hash function. - * - *

- * Implementations should define the method used to generate the signature. - *

- * - * @return the signature of this function. - * @see #prepareSignatureBuffer(HashFunctionIdentity) - */ - long getSignature(); - - /** - * Gets the signedness of this function. - * - * @return signedness of this function. - */ - Signedness getSignedness(); -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java deleted file mode 100644 index c75973a376..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -/** - * An instance of HashFunctionIdentity that is suitable for deserializing - * HashFunctionIdentity data from a stream or any other situation where the - * hash function is not available but the identify of the function is required. 
- * - * @since 4.5 - */ -public final class HashFunctionIdentityImpl implements HashFunctionIdentity { - private final String name; - private final String provider; - private final Signedness signedness; - private final ProcessType process; - private final long signature; - - /** - * Creates a copy of the HashFunctionIdentity. - * @param identity the identity to copy. - */ - public HashFunctionIdentityImpl(final HashFunctionIdentity identity) { - this.name = identity.getName(); - this.provider = identity.getProvider(); - this.signedness = identity.getSignedness(); - this.process = identity.getProcessType(); - this.signature = identity.getSignature(); - } - - /** - * Creates a HashFunctionIdentity from component values. - * @param provider the name of the provider. - * @param name the name of the hash function. - * @param signedness the signedness of the hash function. - * @param process the processes of the hash function. - * @param signature the signature for the hash function. - */ - public HashFunctionIdentityImpl(final String provider, final String name, final Signedness signedness, final ProcessType process, - final long signature) { - this.name = name; - this.provider = provider; - this.signedness = signedness; - this.process = process; - this.signature = signature; - } - - @Override - public String getName() { - return name; - } - - @Override - public ProcessType getProcessType() { - return process; - } - - @Override - public String getProvider() { - return provider; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return signedness; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java deleted file mode 100644 index 3ec0753e4a..0000000000 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Locale; -import java.util.Objects; - -/** - * Contains validation for hash functions. - */ -public final class HashFunctionValidator { - /** Do not instantiate. */ - private HashFunctionValidator() {} - - /** - * Generates a hash code for the identity of the hash function. The hash code is - * generated using the same properties as those tested in - * {@link #areEqual(HashFunctionIdentity, HashFunctionIdentity)}, that is the - * signedness, process type and name. The name is not case specific and is converted - * to lower-case using the {@link Locale#ROOT root locale}. - * - *

The generated value is suitable for use in generation of a hash code that satisfies - * the contract of {@link Object#hashCode()} if the {@link Object#equals(Object)} method - * is implemented using {@link #areEqual(HashFunctionIdentity, HashFunctionIdentity)}. That - * is two objects considered equal will have the same hash code. - * - *

If the hash function identity is a field within a larger object the generated hash code - * should be incorporated into the entire hash, for example using - * {@link Objects#hash(Object...)}. - * - * @param a hash function. - * @return hash code - * @see String#toLowerCase(Locale) - * @see Locale#ROOT - */ - static int hash(final HashFunctionIdentity a) { - return Objects.hash(a.getSignedness(), - a.getProcessType(), - a.getName().toLowerCase(Locale.ROOT)); - } - - /** - * Compares the identity of the two hash functions. The functions are considered - * equal if the signedness, process type and name are equal. The name is not - * case specific. - * - *

A pair of functions that are equal would be expected to produce the same - * hash output from the same input. - * - * @param a First hash function. - * @param b Second hash function. - * @return true, if successful - * @see String#equalsIgnoreCase(String) - */ - public static boolean areEqual(final HashFunctionIdentity a, final HashFunctionIdentity b) { - return (a.getSignedness() == b.getSignedness() && - a.getProcessType() == b.getProcessType() && - a.getName().equalsIgnoreCase(b.getName())); - } - - /** - * Compares the identity of the two hash functions and throws an exception if they - * are not equal. - * - * @param a First hash function. - * @param b Second hash function. - * @see #areEqual(HashFunctionIdentity, HashFunctionIdentity) - * @throws IllegalArgumentException if the hash functions are not equal - */ - public static void checkAreEqual(final HashFunctionIdentity a, final HashFunctionIdentity b) { - if (!areEqual(a, b)) { - throw new IllegalArgumentException(String.format("Hash functions are not equal: (%s) != (%s)", - HashFunctionIdentity.asCommonString(a), HashFunctionIdentity.asCommonString(b))); - } - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 3700567f1a..58b78d61ae 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -16,117 +16,127 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; -import java.nio.charset.Charset; -import java.util.PrimitiveIterator; +import org.apache.commons.collections4.bloomfilter.Shape; + +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.BitMap; +import org.apache.commons.collections4.bloomfilter.IndexProducer; /** - * A Hasher represents items of arbitrary byte size as a byte representation of - * fixed size 
(a hash). The hash representations can be used to create indexes - * for a Bloom filter. - * - *

The hash for each item is created using a hash function; use of different - * seeds allows generation of different hashes for the same item. The hashes can - * be dynamically converted into the bit index representation used by a Bloom - * filter. The shape of the Bloom filter defines the number of indexes per item - * and the range of the indexes. The hasher can generate the correct number of - * indexes in the range required by the Bloom filter for each item it - * represents. - * - *

Note that the process of generating hashes and mapping them to a Bloom - * filter shape may create duplicate indexes. The hasher may generate fewer than - * the required number of hash functions per item if duplicates have been - * removed. Implementations of {@code iterator()} may return duplicate values - * and may return values in a random order. See implementation javadoc notes as - * to the guarantees provided by the specific implementation. - * - *

Hashers have an identity based on the hashing algorithm used. + * A Hasher creates IndexProducer based on the hash implementation and the + * provided Shape. * * @since 4.5 */ public interface Hasher { /** - * A builder to build a hasher. + * Creates an IndexProducer for this hasher based on the Shape. * - *

A hasher represents one or more items of arbitrary byte size. The builder - * contains methods to collect byte representations of items. Each method to add - * to the builder will add an entire item to the final hasher created by the - * {@link #build()} method. + *

The {@code IndexProducer} will create indices within the range defined by the number of bits in + * the shape. The total number of indices will respect the number of hash functions per item + * defined by the shape. However the count of indices may not be a multiple of the number of + * hash functions once the implementation has removed duplicates.

* - * @since 4.5 + *

This IndexProducer must be deterministic in that it must return the same indices for the + * same Shape.

+ * + *

No guarantee is made as to order of indices.

+ *

Duplicate indices for a single item must be removed.

+ * + * @param shape the shape of the desired Bloom filter. + * @return the iterator of integers */ - interface Builder { + IndexProducer indices(Shape shape); - /** - * Builds the hasher from all the items. - * - *

This method will clear the builder for future use. - * - * @return the fully constructed hasher - */ - Hasher build(); + /** + * Gets the number of items that will be hashed by the {@code IndexProducer}. + * @return The number of items that will be hashed by the {@code IndexProducer}. + */ + int size(); - /** - * Adds a byte array item to the hasher. - * - * @param item the item to add - * @return a reference to this object - */ - Builder with(byte[] item); + /** + * Returns true if there are no items to be hashed. + * @return {@code true} if there are no items to be hashed. + */ + default boolean isEmpty() { + return size() == 0; + } + + /** + * A convenience class for Hasher implementations to filter out duplicate indices. + * + *

If the index is negative the behavior is not defined.

+ * + *

This is conceptually a unique filter implemented as a {@code Predicate}.

+ * @since 4.5 + */ + class Filter { + private long[] bits; + private int size; /** - * Adds a character sequence item to the hasher using the specified {@code charset} - * encoding. + * Constructor. * - * @param item the item to add - * @param charset the character set - * @return a reference to this object + * @param size The number of numbers to track. Values from 0 to size-1 will be tracked. */ - default Builder with(final CharSequence item, final Charset charset) { - return with(item.toString().getBytes(charset)); + public Filter(int size) { + bits = new long[BitMap.numberOfBitMaps(size)]; + this.size = size; } /** - * Adds a character sequence item to the hasher. Each 16-bit character is - * converted to 2 bytes using little-endian order. + * Test if the number has not been seen. + * + *

The first time a number is tested the method returns {@code true} and returns + * {@code false} for every time after that.

* - * @param item the item to add - * @return a reference to this object + *

If the input is not in the range [0,size) an IndexOutOfBoundsException is thrown.

+ * + * @param number the number to check. + * @return {@code true} if the number has not been seen, {@code false} otherwise. + * @see Hasher.Filter#Filter(int) */ - default Builder withUnencoded(final CharSequence item) { - final int length = item.length(); - final byte[] bytes = new byte[length * 2]; - for (int i = 0; i < length; i++) { - final char ch = item.charAt(i); - bytes[i * 2] = (byte) ch; - bytes[i * 2 + 1] = (byte) (ch >>> 8); + public boolean test(int number) { + BitMap.checkPositive(number); + if (number >= size) { + throw new IndexOutOfBoundsException(String.format("number to large %d >= %d", number, size)); } - return with(bytes); + boolean retval = !BitMap.contains(bits, number); + BitMap.set(bits, number); + return retval; } } /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. - * - *

The iterator will create indexes within the range defined by the number of bits in - * the shape. The total number of indexes will respect the number of hash functions per item - * defined by the shape. However the count of indexes may not be a multiple of the number of - * hash functions if the implementation has removed duplicates. + * Class to wrap an that an IntConsumer only receives an integer value once. * - *

No guarantee is made as to order of values. + *

If the index is negative the behavior is not defined.

* - * @param shape the shape of the desired Bloom filter - * @return the iterator of integers - * @throws IllegalArgumentException if the hasher cannot generate indexes for - * the specified @{@code shape} + * @since 4.5 */ - PrimitiveIterator.OfInt iterator(Shape shape); + class FilteredIntConsumer implements IntConsumer { + private Hasher.Filter filter; + private IntConsumer consumer; + + /** + * Constructor. + *

integers ouside the range [0,size) will throw an IndexOutOfBoundsException. + * @param size The number of integers to track. Values in the range [0,size) will be tracked. + * @param consumer to wrap. + */ + public FilteredIntConsumer(int size, IntConsumer consumer) { + this.filter = new Hasher.Filter(size); + this.consumer = consumer; + } + + @Override + public void accept(int value) { + if (filter.test(value)) { + consumer.accept(value); + } + } + } - /** - * Gets the identify of the hash function used by the the hasher. - * - * @return the identity of the hash function - */ - HashFunctionIdentity getHashFunctionIdentity(); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java new file mode 100644 index 0000000000..bc3ab940bd --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.function.IntConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + +/** + * A collection of Hashers. Useful when the generation of a Bloom filter depends upon + * multiple items. + * + * Hashers for each item are added to the HasherCollection and then + * the collection is used wherever a Hasher can be used in the API. + * + * @since 4.5 + */ +public class HasherCollection implements Hasher { + + /** + * The list of hashers to be used to generate the indices. + */ + private final List hashers; + + /** + * Constructs an empty HasherCollection. + */ + public HasherCollection() { + this.hashers = new ArrayList<>(); + } + + /** + * Constructs a HasherCollection from a collection of Hasher objects. + * + * @param hashers A collections of Hashers to build the indices with. + */ + public HasherCollection(final Collection hashers) { + Objects.requireNonNull(hashers, "hashers"); + this.hashers = new ArrayList<>(hashers); + } + + /** + * Constructor. + * + * @param hashers A list of Hashers to initialize the collection with. + */ + public HasherCollection(Hasher... hashers) { + this(Arrays.asList(hashers)); + } + + /** + * Adds a hasher to the collection. + * @param hasher The hasher to add. + */ + public void add(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + hashers.add(hasher); + } + + /** + * Add all the Hashers in a collection to this HasherCollection. + * @param hashers The hashers to add. 
+ */ + public void add(Collection hashers) { + Objects.requireNonNull(hashers, "hashers"); + this.hashers.addAll(hashers); + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + for (Hasher hasher : hashers) { + hasher.indices(shape).forEachIndex(consumer); + } + } + }; + } + + /** + * Allow child classes access to the hashers. + * @return hashers + */ + protected List getHashers() { + return hashers; + } + + @Override + public int size() { + int i = 0; + for (Hasher h : hashers) { + i += h.size(); + } + return i; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java similarity index 52% rename from src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java rename to src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java index d14fd3d830..0349b22c6d 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java @@ -16,34 +16,43 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; +import java.util.Objects; +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + /** - * Defines a hash function used by a {@link Hasher} . + * A Hasher that returns no values. + * * @since 4.5 */ -public interface HashFunction extends HashFunctionIdentity { +public final class NullHasher implements Hasher { /** - * Applies the hash function to the buffer. - * - * @param buffer the buffer to apply the hash function to. - * @param seed the seed for the hashing. - * @return the long value of the hash. 
+ * The instance of the Null Hasher. */ - long apply(byte[] buffer, int seed); + public static final NullHasher INSTANCE = new NullHasher(); + + + private static final IndexProducer PRODUCER = new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + // do nothing + } + }; + + private NullHasher() { + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + return PRODUCER; + } - /** - * Gets the signature of this function. - * - *

The signature of this function is calculated as: - *


-     * int seed = 0;
-     * apply(String.format("%s-%s-%s",
-     *                     getName().toUpperCase(Locale.ROOT), getSignedness(), getProcess())
-     *             .getBytes("UTF-8"), seed);
-     * 
- * - * @see HashFunctionIdentity#prepareSignatureBuffer(HashFunctionIdentity) - */ @Override - long getSignature(); + public int size() { + return 0; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java deleted file mode 100644 index a82586fe4e..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Objects; - -/** - * The definition of a Bloom filter shape. - * - *

This class contains the values for the filter configuration and is used to - * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are - * compatible. (i.e. can be compared or merged)

- * - *

Interrelatedness of values

- * - *
Number of Items ({@code n})
- *
{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}
Probability of - * False Positives ({@code p})
{@code p = pow(1 - exp(-k / (m / n)), k)}
Number - * of Bits ({@code m})
- *
{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
Number of - * Functions ({@code k})
{@code k = round((m / n) * ln(2))}
- * - *

Comparisons

For purposes of equality checking and hashCode - * calculations a {@code Shape} is defined by the hashing function identity, the number of - * bits ({@code m}), and the number of functions ({@code k}).

- * - * @see Bloom Filter calculator - * @see Bloom filter - * [Wikipedia] - * @since 4.5 - */ -public final class Shape { - - /** - * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. - */ - private static final double LN_2 = Math.log(2.0); - - /** - * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. - * - *

ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) - */ - private static final double DENOMINATOR = -LN_2 * LN_2; - - /** - * Number of items in the filter ({@code n}). - */ - private final int numberOfItems; - - /** - * Number of bits in the filter ({@code m}). - */ - private final int numberOfBits; - - /** - * Number of hash functions ({@code k}). - */ - private final int numberOfHashFunctions; - - /** - * The hash code for this filter. - */ - private final int hashCode; - - /** - * The identity of the hasher function. - */ - private final HashFunctionIdentity hashFunctionIdentity; - - /** - * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the - * specified number of bits ({@code m}) and hash functions ({@code k}). - * - *

The number of items ({@code n}) to be stored in the filter is computed. - *

n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
- * - *

The actual probability will be approximately equal to the - * desired probability but will be dependent upon the calculated Bloom filter capacity - * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @param numberOfBits The number of bits in the filter - * @param numberOfHashFunctions The number of hash functions in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; - * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual - * probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final double probability, final int numberOfBits, - final int numberOfHashFunctions) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - checkProbability(probability); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); - - // Number of items (n): - // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) - final double n = Math.ceil(numberOfBits / - (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); - - // log of probability is always < 0 - // number of hash functions is >= 1 - // e^x where x < 0 = [0,1) - // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 - // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 - // ceil( >0 ) >= 1 - // so we can not produce a negative value thus we don't check for it. 
- // - // similarly we can not produce a number greater than numberOfBits so we - // do not have to check for Integer.MAX_VALUE either. - this.numberOfItems = (int) n; - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } - - /** - * Constructs a filter configuration with the specified number of items ({@code n}) and - * desired false-positive probability ({@code p}). - * - *

The number of bits ({@code m}) for the filter is computed. - *

m = ceil(n * ln(p) / ln(1 / 2^ln(2)))
- * - *

The optimal number of hash functions ({@code k}) is computed. - *

k = round((m / n) * ln(2))
- * - *

The actual probability will be approximately equal to the - * desired probability but will be dependent upon the calculated number of bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param numberOfItems Number of items to be placed in the filter - * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability - * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final double probability) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - checkProbability(probability); - - // Number of bits (m) - final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); - if (m > Integer.MAX_VALUE) { - throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); - } - this.numberOfBits = (int) m; - - this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } - - /** - * Constructs a filter configuration with the specified number of items ({@code n}) and - * bits ({@code m}). - * - *

The optimal number of hash functions ({@code k}) is computed. - *

k = round((m / n) * ln(2))
- * - *

The false-positive probability is computed using the number of items, bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param numberOfItems Number of items to be placed in the filter - * @param numberOfBits The number of bits in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if the calculated number of hash function is {@code < 1}; - * or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } - - /** - * Constructs a filter configuration with the specified number of items, bits - * and hash functions. - * - *

The false-positive probability is computed using the number of items, bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param numberOfItems Number of items to be placed in the filter - * @param numberOfBits The number of bits in the filter. - * @param numberOfHashFunctions The number of hash functions in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits, - final int numberOfHashFunctions) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } - - /** - * Check number of items is strictly positive. - * - * @param numberOfItems the number of items - * @return the number of items - * @throws IllegalArgumentException if the number of items is {@code < 1} - */ - private static int checkNumberOfItems(final int numberOfItems) { - if (numberOfItems < 1) { - throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); - } - return numberOfItems; - } - - /** - * Check number of bits is strictly positive. 
- * - * @param numberOfBits the number of bits - * @return the number of bits - * @throws IllegalArgumentException if the number of bits is {@code < 1} - */ - private static int checkNumberOfBits(final int numberOfBits) { - if (numberOfBits < 1) { - throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); - } - return numberOfBits; - } - - /** - * Check number of hash functions is strictly positive - * - * @param numberOfHashFunctions the number of hash functions - * @return the number of hash functions - * @throws IllegalArgumentException if the number of hash functions is {@code < 1} - */ - private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { - if (numberOfHashFunctions < 1) { - throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); - } - return numberOfHashFunctions; - } - - /** - * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. - * - * @param probability the probability - * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} - */ - private static void checkProbability(final double probability) { - // Using the negation of within the desired range will catch NaN - if (!(probability > 0.0 && probability < 1.0)) { - throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); - } - } - - /** - * Check the calculated probability is {@code < 1.0}. - * - *

This function is used to verify that the dynamically calculated probability for the - * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon - * construction. - * - * @param probability the probability - * @throws IllegalArgumentException if the probability is {@code >= 1.0} - */ - private static void checkCalculatedProbability(final double probability) { - // We do not need to check for p <= 0.0 since we only allow positive values for - // parameters and the closest we can come to exp(-kn/m) == 1 is - // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will - // always be 00 - if (probability >= 1.0) { - throw new IllegalArgumentException( - String.format("Calculated probability is greater than or equal to 1: " + probability)); - } - } - - /** - * Calculates the number of hash functions given numberOfItems and numberofBits. - * This is a method so that the calculation is consistent across all constructors. - * - * @param numberOfItems the number of items in the filter. - * @param numberOfBits the number of bits in the filter. - * @return the optimal number of hash functions. - * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} - */ - private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { - // k = round((m / n) * ln(2)) We change order so that we use real math rather - // than integer math. - final long k = Math.round(LN_2 * numberOfBits / numberOfItems); - if (k < 1) { - throw new IllegalArgumentException( - String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); - } - // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but - // since numberOfBits is at most Integer.MAX_VALUE the numerator of - // numberofHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the - // value of k can not be above Integer.MAX_VALUE. 
- return (int) k; - } - - @Override - public boolean equals(final Object o) { - if (o instanceof Shape) { - final Shape other = (Shape) o; - return numberOfBits == other.numberOfBits && - numberOfHashFunctions == other.numberOfHashFunctions && - HashFunctionValidator.areEqual(hashFunctionIdentity, - other.hashFunctionIdentity); - } - return false; - } - - @Override - public int hashCode() { - return hashCode; - } - - private int generateHashCode() { - return Objects.hash(numberOfBits, numberOfHashFunctions, HashFunctionValidator.hash(hashFunctionIdentity)); - } - - /** - * Gets the HashFunctionIdentity of the hash function this shape uses. - * @return the HashFunctionIdentity of the hash function this shape uses. - */ - public HashFunctionIdentity getHashFunctionIdentity() { - return hashFunctionIdentity; - } - - /** - * Gets the number of bits in the Bloom filter. - * This is also known as {@code m}. - * - * @return the number of bits in the Bloom filter ({@code m}). - */ - public int getNumberOfBits() { - return numberOfBits; - } - - /** - * Gets the number of hash functions used to construct the filter. - * This is also known as {@code k}. - * - * @return the number of hash functions used to construct the filter ({@code k}). - */ - public int getNumberOfHashFunctions() { - return numberOfHashFunctions; - } - - /** - * Gets the number of items that are expected in the filter. - * This is also known as {@code n}. - * - * @return the number of items ({@code n}). - */ - public int getNumberOfItems() { - return numberOfItems; - } - - /** - * Calculates the probability of false positives ({@code p}) given - * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). - *

p = pow(1 - exp(-k / (m / n)), k)
- * - *

This is the probability that a Bloom filter will return true for the presence of an item - * when it does not contain the item. - * - *

The probability assumes that the Bloom filter is filled with the expected number of - * items. If the filter contains fewer items then the actual probability will be lower. - * Thus this returns the worst-case false positive probability for a filter that has not - * exceeded its expected number of items. - * - * @return the probability of false positives. - * @see #getNumberOfItems() - */ - public double getProbability() { - return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), - numberOfHashFunctions); - } - - @Override - public String toString() { - return String.format("Shape[ %s n=%s m=%s k=%s ]", - HashFunctionIdentity.asCommonString(hashFunctionIdentity), - numberOfItems, numberOfBits, numberOfHashFunctions); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java new file mode 100644 index 0000000000..64adb12c3b --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.Objects; +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + +/** + * A Hasher that implements combinatorial hashing as described by + * Kirsch and Mitzenmacher. + *

+ * Common use for this hasher is to generate a byte array as the output of a hashing + * or MessageDigest algorithm.

+ * + * @since 4.5 + */ +public final class SimpleHasher implements Hasher { + + /** + * The initial hash value. + */ + private final long initial; + + /** + * The value to increment the hash value by. + */ + private final long increment; + + /** + * Convert bytes to long. + * @param byteArray the byte array to extract the values from. + * @param offset the offset to start extraction from. + * @param len the length of the extraction, may be longer than 8. + * @return + */ + private static long toLong(byte[] byteArray, int offset, int len) { + long val = 0; + len = Math.min(len, Long.BYTES); + for (int i = 0; i < len; i++) { + val <<= 8; + val |= (byteArray[offset + i] & 0x00FF); + } + return val; + } + + /** + * Constructs the SimpleHasher from a byte array. + *

The byte array is split in 2 and each half is interpreted as a long value. + * Excess bytes are ignored. This simplifies the conversion from a Digest or hasher algorithm output + * to the two values used by the SimpleHasher.

+ * @param buffer the buffer to extract the longs from. + * @throws IllegalArgumentException is buffer length is zero. + */ + public SimpleHasher(byte[] buffer) { + if (buffer.length == 0) { + throw new IllegalArgumentException("buffer length must be greater than 0"); + } + int segment = buffer.length / 2; + this.initial = toLong(buffer, 0, segment); + this.increment = toLong(buffer, segment, buffer.length - segment); + } + + /** + * Constructs the SimpleHasher from 2 longs. The long values will be interpreted as unsigned values. + * @param initial The initial value for the hasher.. + * @param increment The value to increment the hash by on each iteration. + */ + public SimpleHasher(long initial, long increment) { + this.initial = initial; + this.increment = increment; + } + + /** + * Gets an IndexProducer that produces indices based on the shape. + * The iterator will not return the same value multiple + * times. Values will be returned in ascending order. + * + * @param shape {@inheritDoc} + * @return {@inheritDoc} + * @throws IllegalArgumentException {@inheritDoc} + */ + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + + return new IndexProducer() { + + /** The index of the next item. 
*/ + private long next = SimpleHasher.this.initial; + + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + FilteredIntConsumer filtered = new FilteredIntConsumer(shape.getNumberOfBits(), consumer); + for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { + int value = (int) Long.remainderUnsigned(next, shape.getNumberOfBits()); + filtered.accept(value); + next += SimpleHasher.this.increment; + } + } + }; + } + + @Override + public int size() { + return 1; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java new file mode 100644 index 0000000000..7de34276c4 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.Collection; +import java.util.Objects; +import java.util.function.IntConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + +/** + * A collection of Hashers that are combined to be a single item. This differs from + * the HasherCollection in that the HasherCollection counts each Hasher in the collection as + * a different item, or in the case of an enclosed HasherCollection multiple items. This collection + * assumes that all hashers are combined to make a single item. + * + * @since 4.5 + */ +public class SingleItemHasherCollection extends HasherCollection { + + /** + * Constructs an empty SingleItemHasherCollection. + */ + public SingleItemHasherCollection() { + super(); + } + + /** + * Constructs a SingleItemHasherCollection from a collection of Hasher objects. + * + * @param hashers A collections of Hashers to build the indices with. + */ + public SingleItemHasherCollection(Collection hashers) { + super(hashers); + } + + /** + * Constructor. + * + * @param hashers A list of Hashers to initialize the collection with. + */ + public SingleItemHasherCollection(Hasher... hashers) { + super(hashers); + } + + /** + * Produces unique indices. + * + *

+ * Specifically, this method creates an IndexProducer that will not return duplicate indices. The effect is + * to make the entire collection appear as one item. This is useful when working with complex Bloom filters like the + * CountingBloomFilter.

+ * + * @param shape The shape of the desired Bloom filter. + * @return an IndexProducer that only produces unique values. + */ + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + IndexProducer baseProducer = super.indices(shape); + + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + FilteredIntConsumer filtered = new FilteredIntConsumer(shape.getNumberOfBits() - 1, consumer); + baseProducer.forEachIndex(filtered); + } + }; + } + + @Override + public int size() { + for (Hasher hasher : getHashers()) { + if (hasher.size() > 0) { + return 1; + } + } + return 0; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java deleted file mode 100644 index 430f99b565..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.PrimitiveIterator.OfInt; -import java.util.Set; -import java.util.TreeSet; - -/** - * A Hasher implementation that contains the index for all enabled bits for a specific - * Shape. - * @since 4.5 - */ -public final class StaticHasher implements Hasher { - - /** - * The shape of this hasher - */ - private final Shape shape; - - /** - * The ordered set of values that this hasher will return. - */ - private final int[] values; - - /** - * Constructs the StaticHasher from a Hasher and a Shape. - * @param hasher the Hasher to read. - * @param shape the Shape for the resulting values. - * @throws IllegalArgumentException if the hasher function and the shape function are not the same. - */ - public StaticHasher(final Hasher hasher, final Shape shape) { - this(hasher.iterator(shape), shape); - HashFunctionValidator.checkAreEqual(hasher.getHashFunctionIdentity(), - shape.getHashFunctionIdentity()); - } - - /** - * Constructs a StaticHasher from an Iterator of Integers and a Shape. - * @param iter the Iterator of Integers. - * @param shape the Shape that the integers were generated for. 
- * @throws IllegalArgumentException if any Integer is outside the range [0,shape.getNumberOfBits()) - */ - public StaticHasher(final Iterator iter, final Shape shape) { - this.shape = shape; - final Set workingValues = new TreeSet<>(); - iter.forEachRemaining(idx -> { - if (idx >= this.shape.getNumberOfBits()) { - throw new IllegalArgumentException(String.format("Bit index (%s) is too big for %s", idx, shape)); - } - if (idx < 0) { - throw new IllegalArgumentException(String.format("Bit index (%s) may not be less than zero", idx)); - } - workingValues.add(idx); - }); - this.values = new int[workingValues.size()]; - int i = 0; - for (final Integer value : workingValues) { - values[i++] = value.intValue(); - } - } - - /** - * Constructs the StaticHasher from a StaticHasher and a Shape. - * @param hasher the StaticHasher to read. - * @param shape the Shape for the resulting values. - * @throws IllegalArgumentException if the shape of the hasher and the shape parameter are not the same. - */ - public StaticHasher(final StaticHasher hasher, final Shape shape) { - if (!hasher.shape.equals(shape)) { - throw new IllegalArgumentException(String.format("Hasher shape (%s) is not the same as shape (%s)", - hasher.getShape().toString(), shape.toString())); - } - this.shape = shape; - this.values = hasher.values; - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - - /** - * Gets the shape this static hasher was created with. - * - * @return the Shape of this hasher. - */ - public Shape getShape() { - return shape; - } - - /** - * Tests emptiness (size == 0). - * - * @return Whether or not this is empty. - */ - public boolean isEmpty() { - return size() == 0; - } - - /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. The iterator will not return the same value multiple - * times. Values will be returned in ascending order. 
- * - * @param shape {@inheritDoc} - * @return {@inheritDoc} - * @throws IllegalArgumentException {@inheritDoc} - */ - @Override - public OfInt iterator(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException( - String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); - } - return Arrays.stream(values).iterator(); - } - - /** - * Gets the the number of unique values in this hasher. - * @return the number of unique values. - */ - public int size() { - return values.length; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java deleted file mode 100644 index 8e07793b7f..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import java.nio.ByteBuffer; - -import java.nio.LongBuffer; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * performs MD5 hashing using a signed cyclic method. - * @since 4.5 - */ -public final class MD5Cyclic implements HashFunction { - - /** - * The name of this hash function. - */ - public static final String NAME = "MD5"; - - /** - * The MD5 digest implementation. - */ - private final MessageDigest messageDigest; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * The result from the digest 0 - */ - private final long[] result = new long[2]; - - /** - * Constructs the MD5 hashing function. - */ - public MD5Cyclic() { - try { - messageDigest = MessageDigest.getInstance(NAME); - } catch (final NoSuchAlgorithmException e) { - // This should not happen - throw new IllegalStateException("Missing the standard MD5 message digest algorithm", e); - } - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - - if (seed == 0) { - final byte[] hash; - synchronized (messageDigest) { - messageDigest.update(buffer); - hash = messageDigest.digest(); - messageDigest.reset(); - } - - final LongBuffer lb = ByteBuffer.wrap(hash).asLongBuffer(); - result[0] = lb.get(0); - result[1] = lb.get(1); - } else { - result[0] += result[1]; - } - return result[0]; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java deleted file mode 100644 index 99c27c8819..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.codec.digest.MurmurHash3; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * uses an underlying Murmur3 128-bit hash with a signed cyclic method. - * - *

Requires the optional Apache Commons Codec - * library which contains a Java port of the 128-bit hash function - * {@code MurmurHash3_x64_128} from Austin Applyby's original {@code c++} - * code in SMHasher.

- * - * @see SMHasher - * @since 4.5 - */ -public final class Murmur128x64Cyclic implements HashFunction { - - /** - * The name of this hash method. - */ - public static final String NAME = "Murmur3_x64_128"; - - /** - * The result of the hash 0 call. - */ - private long[] parts; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * Constructs a Murmur3 x64 128 hash. - */ - public Murmur128x64Cyclic() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - if (parts == null || seed == 0) { - parts = MurmurHash3.hash128x64(buffer, 0, buffer.length, 0); - } else { - parts[0] += parts[1]; - } - return parts[0]; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java deleted file mode 100644 index 982ef5c869..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.codec.digest.MurmurHash3; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * uses an underlying Murmur3 32-bit hash with a signed iterative method. - * - *

Requires the optional Apache Commons Codec - * library which contains a Java port of the 32-bit hash function - * {@code MurmurHash3_x86_32} from Austin Applyby's original {@code c++} - * code in SMHasher.

- * - * @see Apache Commons Codec - * @see SMHasher - * @since 4.5 - */ -public final class Murmur32x86Iterative implements HashFunction { - - /** - * The name of this hash function. - */ - public static final String NAME = "Murmur3_x86_32"; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * Constructs a Murmur3 x86 32 hash - */ - public Murmur32x86Iterative() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - return MurmurHash3.hash32x86(buffer, 0, buffer.length, seed); - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.ITERATIVE; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java deleted file mode 100644 index da0fc2c2db..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * performs {@code Objects.hash} hashing using a signed iterative method. - *

- * Except in the case of seed 0, the value of the previous hash is - * used as a seed for the next hash. Hashes are seeded by calling - * {@code Arrays.deepHashCode( new Object[]{seed, buffer} )}. - *

- * @since 4.5 - */ -public final class ObjectsHashIterative implements HashFunction { - - /** - * The name of the hash function. - */ - public static final String NAME = "Objects32"; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * The value of the last hash. - */ - private long last; - - /** - * Constructs a hash that uses the Objects.hash method to has values. - */ - public ObjectsHashIterative() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - if (seed == 0) { - last = 0; - } - // Effectively: - // result = Arrays.deepHashCode(new Object[] { last, buffer }); - // The method loops over items starting with result=1 - // for i in items: - // result = 31 * result + hashCode(i) - // Here we unroll the computation to 2 iterations. - // The computation is done using 32-bit integers then cast to a long - final long result = 31 * (31 + Long.hashCode(last)) + Arrays.hashCode(buffer); - last += result; - return result; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.ITERATIVE; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java deleted file mode 100644 index b7f35ac051..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; - -/** - * Allow computation of HashFunction signatures. - * @since 4.5 - */ -final class Signatures { - - /** No instances. */ - private Signatures() {} - - /** - * Gets the standard signature for the hash function. The signature is prepared as: - *


-     * int seed = 0;
-     * return hashFunction.apply(HashFunctionIdentity.prepareSignatureBuffer(hashFunction), seed);
-     * 
- * - * @param hashFunction the hash function - * @return the signature - * @see HashFunctionIdentity#prepareSignatureBuffer(HashFunctionIdentity) - * @see HashFunction#apply(byte[], int) - */ - static long getSignature(final HashFunction hashFunction) { - return hashFunction.apply(HashFunctionIdentity.prepareSignatureBuffer(hashFunction), 0); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index b73675ed28..2922477edc 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -16,10 +16,64 @@ */ /** - * Provides classes and interfaces to define the shape of a Bloom filter and the conversion - * of generic bytes to a hash of bit indexes to be used with a Bloom filter. + * Hasher implementations and tools. * + * + *

Hasher

+ * + *

A Hasher converts one or more items into an {@code IndexProducer} series of integers based on a {@code Shape}. + * + * + * The base Hasher implementations + * are as follows:

+ * + *

SimpleHasher

+ * + *

The SimpleHasher represents one item being added to the Bloom filter. It utilizes the combinatorial strategy + * as described by Kirsch and Mitzenmacher. + * Generally, a hash value is created by hashing together multiple properties of the item being added. The hash value is + * then used to create a SimpleHasher.

+ * + *

This hasher represents a single item and thus does not return duplicate indices.

+ * + *

HasherCollection

+ * + *

The HasherCollection is a collection of Hashers that implements the Hasher interface. Each hasher within the collection + * represents a single item or, in the case of a nested HasherCollection, multiple items.

+ * + *

This hasher represents multiple items and thus may return duplicate indices.

+ * + *

SingleItemHasherCollection

+ * + *

A collection of Hashers that are combined to represent a single item. Like the HasherCollection this Hasher is composed + * of multiple Hashers. Unlike the HasherCollection, this hasher reports that it is only one item.

+ * + * + *

This hasher represents a single item and thus does not return duplicate indices.

+ * + *

Other Implementations

+ * + *

Other implementations of the Hasher are easy to implement. Hashers that represent single items should make use of the + * {@code Hasher.Filter} and/or {@code Hasher.FilteredIntConsumer} classes to filter out duplicate indices.

+ * + *

With the exception of the HasherCollection, a Hasher represents an item of arbitrary + * byte size as multiple byte representations of fixed size (multiple hashes). The hashers + * are used to create indices for a Bloom filter.

+ * + *

Hashers create {@code IndexProducer} instances for hashed items based + * on a {@code Shape}.

+ * + *

The method used to generate the multiple hashes is dependent upon the Hasher + * implementation. The SimpleHasher uses a combinatorial strategy to create the + * multiple hashes from a single starting hash.

+ * + *

Note that the process of generating hashes and mapping them to a Bloom + * filter shape may create duplicate indexes. The Hasher implementation is required to + * remove all duplicate values for a single item. Thus the hasher may generate fewer + * than the required number of hash values per item after duplicates have been + * removed.

+ * + * @see org.apache.commons.collections4.bloomfilter.IndexProducer * @since 4.5 */ package org.apache.commons.collections4.bloomfilter.hasher; - diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index bfc3d67abe..50a8f723b7 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -18,101 +18,78 @@ /** * A collection of extensible Bloom filter classes and interfaces. * - *

- * Background:

- *

- * A Bloom filter is conceptually a bit vector. It is used to - * tell you where things are not. Basically, you create a Bloom filter by creating hashes - * and converting those to enabled bits in a vector. You can merge the Bloom filters - * together with logical "or" (call this filter "B"). You can then check to see if filter - * "A" was "or"ed into "B" by testing A & B == A. if the statement is false then "A" was - * not merged into "B", otherwise it _might_ have. They are generally used where hash - * tables would be too large or as a filter front end for longer processes. For example + *

Background:

+ * + *

The Bloom filter is a probabilistic data structure that indicates where things are not. + * Conceptually it is a bit vector. You create a Bloom filter by creating hashes + * and converting those to enabled bits in the vector. Multiple Bloom filters may be merged + * together into one Bloom filter. It is possible to test if a filter {@code B} was merged into + * another filter {@code A} by verifying that {@code (A & B) == B}.

+ * + *

Bloom filters are generally used where hash + * tables would be too large, or as a filter front end for longer processes. For example * most browsers have a Bloom filter that is built from all known bad URLs (ones that * serve up malware). When you enter a URL the browser builds a Bloom filter and checks to * see if it is "in" the bad URL filter. If not the URL is good, if it matches, then the * expensive lookup on a remote system is made to see if it actually is in the list. There * are lots of other uses, and in most cases the reason is to perform a fast check as a * gateway for a longer operation.

- *

- * BloomFilter

- *

- * The bloom filter code is - * an abstract class that requires implementation of 4 methods:

    - *
  • - * getBits() which - * returns the set bits as a buffer encoded into an array of long.
  • - *
  • - * getHasher() - * which returns a list of integers that are indexes of the bits that are enabled. These - * are returned in a Hasher construct.
  • - *
  • - * merge( BloomFilter ) to merge another - * Bloom filter into this one.
  • - *
  • - * merge( Hasher ) to merge the values in a hasher - * into this Bloom filter.
  • - *
- * There are 3 implementations of Bloom filter - * provided:
    - *
  • - * BitSetBloomFilter - based on the Java BitSet class.
  • - *
  • - * - * CountingBloomFilter - uses a sparse array of integers (Map) to implement a counting - * Bloom filter. This filter also implements remove() methods as that is the great - * advantage of a counting Bloom filter.
  • - *
  • - * HasherBloomFilter - implements bloom - * filter on a Hasher. A rather slow implementation but convenient in some - * situations.
  • - *
- * - *

- * Shape

- *

- * Describes the Bloom filter using the - * standard number of bits, number of hash functions and number of items along with a - * description of the HashFunction. It is this description that has caused the most issues - * of late.

- *

- * Hasher

- *

- * converts byte buffers into an iterator if int based - * on a Shape. There are 2 implementations of Hasher provided

    - *
  • - * Dynamic - calls - * the HashFunction for each value required in the Bloom filter.
  • - *
  • - * Static - based - * on a pre-calculated list of Bloom filter index values. It is also limited to generating - * values for a specific Shape.
  • - *
- * - *

- * Hash Functions

- *

- * Hash - * functions generate individual index values for the filter from a byte buffer. There are - * four implementations provided.

- *

- * HashFunctionIdentity

- *

- * The - * HashFunctionIdentity is the base interface for the HashFunction. It tracks three (3) - * properties:

    - *
  • - * The Hashing algorithm
  • - *
  • - * Whether the contents of the - * resulting hash buffer are read as signed or unsigned values.
  • - *
  • - * Whether the hash - * function uses an iterative or cyclic method. In traditional iterative methods this is - * done by calling the selected hash function with a different seed for each hash - * required. The second method described by Adam Kirsch and Micheal Mitzenmacher[1] has - * become more common and is used in applications like Cassandra[2].
  • - *
+ * + *

BloomFilter

+ * + *

The Bloom filter architecture here is designed so that the implementation of the storage of bits is abstracted. + * Programs that utilize the Bloom filters may use the {@code BitMapProducer} or {@code IndexProducer} to retrieve a + * representation of the internal structure. Additional methods are available in the {@code BitMap} to assist in + * manipulation of the representations.

+ * + *

The bloom filter code is an interface that requires implementation of 6 methods:

+ *
    + *
  • {@code cardinality()} + * returns the number of bits enabled in the Bloom filter.
  • + * + *
  • {@code contains(BitMapProducer)} which + * returns true if the bits specified by the BitMaps generated by the BitMapProducer are enabled in the Bloom filter.
  • + * + *
  • {@code contains(IndexProducer)} which + * returns true if the bits specified by the Indices generated by IndexProducer are enabled in the Bloom filter.
  • + * + *
  • {@code getShape()} which + * returns the shape the Bloom filter was created with.
  • + + *
  • {@code isSparse()} which + * returns true if the implementation tracks indices natively, false if BitMaps are used. In cases where + * neither is used the {@code isSparse} return value should reflect which is faster to produce.
  • + * + *
  • {@code mergeInPlace(BloomFilter)} which + * utilizes either the {@code BitMapProducer} or {@code IndexProducer} from the argument to enable extra bits + * in the internal representation of the Bloom filter.
  • + *
+ * + *

Other methods should be implemented where they can be done so more efficiently than the default implementations. + *

+ * + *

CountingBloomFilter

+ * + *

The counting bloom filter extends the Bloom filter by counting the number of times a specific bit has been + * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional + * overhead.

+ * + *

Shape

+ * + *

The Shape describes the Bloom filter using the number of bits and the number of hash functions.

+ * + *

Hasher

+ * + *

A Hasher converts bytes into a series of integers based on a Shape. With the exception of the HasherCollection, + * each hasher represents one item being added to the Bloom filter. The HasherCollection represents the + * number of items as the sum of the number of items represented by the Hashers in the collection.

+ * + *

The SimpleHasher uses a combinatorial generation technique to create the integers. It is easily + * initialized by using a standard {@code MessageDigest} or other Hash function to hash the item to insert and + * then splitting the hash bytes in half and considering each as a long value.

+ * + *

Other implementations of the Hasher are easy to implement, and should make use of the {@code Hasher.Filter} + * and/or {@code Hasher.FilteredIntConsumer} classes to filter out duplicate indices.

* *

References

* diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 2a1faa18ea..b5d26c6e15 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -16,214 +16,173 @@ */ package org.apache.commons.collections4.bloomfilter; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.util.List; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.BiFunction; -import java.util.function.IntConsumer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; +import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; /** * Test standard methods in the {@link BloomFilter} interface. 
*/ -public abstract class AbstractBloomFilterTest { +public abstract class AbstractBloomFilterTest { + + protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final long from1Value = 0x3FFFEL; + protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final long from11Value = 0xFFFF800L; + protected final HasherCollection bigHasher = new HasherCollection(from1, from11); + protected final long bigHashValue = 0xFFFFFFEL; + protected final HasherCollection fullHasher = new HasherCollection(new SimpleHasher(0, 1)/* 0-16 */, + new SimpleHasher(17, 1)/* 17-33 */, new SimpleHasher(33, 1)/* 33-49 */, new SimpleHasher(50, 1)/* 50-66 */, + new SimpleHasher(67, 1)/* 67-83 */ + ); + protected final long[] fullHashValue = { 0xFFFFFFFFFFFFFFFFL, 0xFFFFFL }; /** - * An implementation of BloomFilter that is used to test merge and cardinality - * operations with a filter type that does not match the type of the filter - * being tested. + * The shape of the Bloom filters for testing */ - private static class TestBloomFilter extends AbstractBloomFilter { - /** The bits. */ - final BitSet bits; - - protected TestBloomFilter(final Shape shape, final BitSet bits) { - super(shape); - this.bits = bits; - } - - @Override - public long[] getBits() { - return bits.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bits.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean merge(final Hasher hasher) { - throw new UnsupportedOperationException(); - } - } + protected Shape shape = new Shape(17, 72); /** - * A HashFunctionIdentity for testing. + * Create an empty version of the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @return a BloomFilter implementation. 
*/ - protected HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; + protected abstract T createEmptyFilter(Shape shape); /** - * A second HashFunctionIdentity for testing. + * Create the BloomFilter implementation we are testing. + * + * @param hasher the hasher to use to create the filter. + * @param shape the shape of the filter. + * @return a BloomFilter implementation. */ - protected HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { + protected abstract T createFilter(Shape shape, Hasher hasher); - @Override - public String getName() { - return "Test FunctionX"; + @Test + public void asIndexArrayTest() { + final BloomFilter bf = createFilter( shape, from1 ); + int[] ary = BloomFilter.asIndexArray( bf ); + assertEquals( 17, ary.length ); + for (int i=0; i filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + @Test + public void estimateIntersectionTest() { - final BloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createFilter(shape, bigHasher); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + assertEquals(1, bf.estimateIntersection(bf2)); + assertEquals(1, bf2.estimateIntersection(bf)); + } - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + @Test + public void estimateIntersectionTest_empty() { + final 
BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createEmptyFilter(shape); - assertEquals(7, bf.andCardinality(bf2)); + assertEquals(0, bf.estimateIntersection(bf2)); + assertEquals(0, bf2.estimateIntersection(bf)); } /** - * Tests that the andCardinality calculations are correct when there are more than Long.LENGTH bits. + * Tests that the andCardinality calculations are correct. + * + * @param filterFactory the factory function to create the filter */ @Test - public final void andCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public void estimateUnionTest() { + final BloomFilter bf = createFilter(shape, from1); - final BloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf2 = createFilter(shape, from11); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - - final BloomFilter bf2 = createFilter(hasher2, shape); - - assertEquals(7, bf.andCardinality(bf2)); - assertEquals(7, bf2.andCardinality(bf)); + assertEquals(2, bf.estimateUnion(bf2)); + assertEquals(2, bf2.estimateUnion(bf)); } - /** - * Compare 2 static hashers to verify they have the same bits enabled. - * - * @param hasher1 the first static hasher. - * @param hasher2 the second static hasher. 
- */ - private void assertSameBits(final StaticHasher hasher1, final StaticHasher hasher2) { - final OfInt iter1 = hasher1.iterator(shape); - final OfInt iter2 = hasher2.iterator(shape); + @Test + public void estimateUnionTest_empty() { + final BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createEmptyFilter(shape); - while (iter1.hasNext()) { - assertTrue(iter2.hasNext(), "Not enough data in second hasher"); - assertEquals(iter1.nextInt(), iter2.nextInt()); - } - assertFalse(iter2.hasNext(), "Too much data in second hasher"); + assertEquals(1, bf.estimateUnion(bf2)); + assertEquals(1, bf2.estimateUnion(bf)); } /** - * Tests that cardinality is correct. + * Tests that the size estimate is correctly calculated. */ @Test - public final void cardinalityTest() { + public void estimateNTest() { + // build a filter + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + assertEquals(1, filter1.estimateN()); - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + // the data provided above do not generate an estimate that is equivalent to the + // actual. 
+ filter1.mergeInPlace(new SimpleHasher(4, 1)); - final BloomFilter bf = createFilter(hasher, shape); - assertEquals(17, bf.cardinality()); + assertEquals(1, filter1.estimateN()); + + filter1.mergeInPlace(new SimpleHasher(17, 1)); + + assertEquals(3, filter1.estimateN()); } /** @@ -233,7 +192,7 @@ public final void cardinalityTest() { public final void constructorTest_Empty() { final BloomFilter bf = createEmptyFilter(shape); - final long[] lb = bf.getBits(); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(0, lb.length); } @@ -242,171 +201,28 @@ public final void constructorTest_Empty() { */ @Test public final void constructorTest_Hasher() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + Hasher hasher = new SimpleHasher(0, 1); - final BloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); + final BloomFilter bf = createFilter(shape, hasher); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(0x1FFFF, lb[0]); assertEquals(1, lb.length); } - /** - * Tests that creating a Bloom filter with a Static hasher that has one shape and a - * different specified shape fails. - */ - @Test - public final void constructorTest_WrongShape() { - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), anotherShape); - try { - createFilter(hasher, shape); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that contains() with a Bloom filter argument returns the proper results. 
- */ - @Test - public final void containsTest_BloomFilter() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - final BloomFilter bf2 = createFilter(hasher2, shape); - assertTrue(bf.contains(bf2)); - assertFalse(bf2.contains(bf)); - } - - /** - * Tests that contains() fails properly if the other Bloom filter is not of the proper shape. - */ - @Test - public final void containsTest_BloomFilter_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final Hasher hasher2 = new StaticHasher(lst.iterator(), anotherShape); - final BloomFilter bf2 = createFilter(hasher2, anotherShape); - try { - bf.contains(bf2); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that contains() with a Hasher argument returns the proper results. 
- */ - @Test - public final void containsTest_Hasher() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - assertTrue(bf.contains(hasher2)); - - lst2 = Arrays.asList(17, 18, 19, 20); - hasher2 = new StaticHasher(lst2.iterator(), shape); - assertFalse(bf.contains(hasher2)); - - lst2 = Arrays.asList(10, 11, 12, 17, 18, 19, 20); - hasher2 = new StaticHasher(lst2.iterator(), shape); - assertFalse(bf.contains(hasher2)); - } - - /** - * Tests that contains() fails properly if the hasher is not of the proper shape. - */ - @Test - public final void containsTest_Hasher_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - - final List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - try { - bf.contains(hasher2); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Create an empty version of the BloomFilter implementation we are testing. - * - * @param shape the shape of the filter. - * @return a BloomFilter implementation. - */ - protected abstract AbstractBloomFilter createEmptyFilter(Shape shape); - - /** - * Create the BloomFilter implementation we are testing. - * - * @param hasher the hasher to use to create the filter. - * @param shape the shape of the filter. - * @return a BloomFilter implementation. 
- */ - protected abstract AbstractBloomFilter createFilter(Hasher hasher, Shape shape); - - /** - * Create a generic BloomFilter implementation. - * - * @param hasher the hasher to use to create the filter. - * @param shape the shape of the filter. - * @return a BloomFilter implementation. - */ - private AbstractBloomFilter createGenericFilter(final Hasher hasher, final Shape shape) { - final BitSet bits = new BitSet(); - hasher.iterator(shape).forEachRemaining((IntConsumer) bits::set); - return new TestBloomFilter(shape, bits); - } - /** * Tests that getBits() works correctly when multiple long values are returned. */ @Test public final void getBitsTest_SpanLong() { - final List lst = Arrays.asList(63, 64); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); + + final SimpleHasher hasher = new SimpleHasher(63, 1); + final BloomFilter bf = createFilter(new Shape(2, 72), hasher); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(2, lb.length); assertEquals(0x8000000000000000L, lb[0]); assertEquals(0x1, lb[1]); } - /** - * Tests that the the hasher returned from getHasher() works correctly. - */ - @Test - public final void getHasherTest() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final StaticHasher hasher2 = bf.getHasher(); - - assertEquals(shape, hasher2.getShape()); - assertSameBits(hasher, hasher2); - } - /** * Tests that isFull() returns the proper values. 
*/ @@ -414,228 +230,80 @@ public final void getHasherTest() { public final void isFullTest() { // create empty filter - AbstractBloomFilter filter = createEmptyFilter(shape); - assertFalse(filter.isFull()); - - final List values = new ArrayList<>(shape.getNumberOfBits()); - for (int i = 0; i < shape.getNumberOfBits(); i++) { - values.add(i); - } - - StaticHasher hasher2 = new StaticHasher(values.iterator(), shape); - filter = createFilter(hasher2, shape); + BloomFilter filter = createEmptyFilter(shape); + assertFalse("Should not be full", filter.isFull()); - assertTrue(filter.isFull()); + filter = createFilter(shape, fullHasher); + assertTrue("Should be full", filter.isFull()); - final int mid = shape.getNumberOfBits() / 2; - values.remove(Integer.valueOf(mid)); - hasher2 = new StaticHasher(values.iterator(), shape); - filter = createFilter(hasher2, shape); - assertFalse(filter.isFull()); - } - - /** - * Tests that merging bloom filters works as expected. - */ - @Test - public final void mergeTest_BloomFilter() { - mergeTest_BloomFilter(this::createFilter); + filter = createFilter(shape, new SimpleHasher(1, 3)); + assertFalse("Should not be full", filter.isFull()); } /** * Tests that merging bloom filters works as expected with a generic BloomFilter. */ @Test - public final void mergeTest_GenericBloomFilter() { - mergeTest_BloomFilter(this::createGenericFilter); - } + public final void mergeTest_Bloomfilter() { - /** - * Tests that merging bloom filters works as expected. 
- * - * @param filterFactory the factory function to create the filter - */ - private void mergeTest_BloomFilter(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf1 = createFilter(shape, from1); - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + final BloomFilter bf2 = createFilter(shape, from11); - assertTrue(bf.merge(bf2), "Merge should not fail"); - assertEquals(27, bf.cardinality()); - } + final BloomFilter bf3 = bf1.merge(bf2); + assertTrue("Should contain", bf3.contains(bf1)); + assertTrue("Should contain", bf3.contains(bf2)); - /** - * Tests that merging bloom filters with different shapes fails properly - */ - @Test - public final void mergeTest_BloomFilter_WrongShape() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - final BloomFilter bf2 = createFilter(hasher2, anotherShape); - - try { - bf.merge(bf2); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. 
- } + final BloomFilter bf4 = bf2.merge(bf1); + assertTrue("Should contain", bf4.contains(bf1)); + assertTrue("Should contain", bf4.contains(bf2)); + assertTrue("Should contain", bf4.contains(bf3)); + assertTrue("Should contain", bf3.contains(bf4)); } - /** - * Tests that merging a hasher into a Bloom filter works as expected - */ @Test public final void mergeTest_Hasher() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf1 = createFilter(shape, from1); + final BloomFilter bf2 = createFilter(shape, from11); - assertTrue(bf.merge(hasher2), "Merge should not fail"); - assertEquals(27, bf.cardinality()); + final BloomFilter bf3 = bf1.merge(from11); + assertTrue("Should contain", bf3.contains(bf1)); + assertTrue("Should contain", bf3.contains(bf2)); } /** - * Tests that merging a static hasher with the wrong shape into a Bloom filter fails as expected - */ - @Test - public final void mergeTest_Hasher_WrongShape() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - - try { - bf.merge(hasher2); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that the orCardinality calculations are correct. 
+ * Tests that merging bloom filters works as expected with a generic BloomFilter. */ @Test - public final void orCardinalityTest() { - orCardinalityTest(this::createFilter); - } + public final void mergeInPlaceTest_Bloomfilter() { - /** - * Tests that the orCardinality calculations are correct with a generic BloomFilter. - */ - @Test - public final void orCardinalityTest_GenericBloomFilter() { - orCardinalityTest(this::createGenericFilter); - } + final BloomFilter bf1 = createFilter(shape, from1); - /** - * Tests that the andCardinality calculations are correct. - * - * @param filterFactory the factory function to create the filter - */ - private void orCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final BloomFilter bf2 = createFilter(shape, from11); - final AbstractBloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf3 = bf1.merge(bf2); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + bf1.mergeInPlace(bf2); - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + assertTrue("Should contain", bf1.contains(bf2)); + assertTrue("Should contain", bf1.contains(bf3)); - assertEquals(27, bf.orCardinality(bf2)); } - /** - * Tests that the orCardinality calculations are correct when there are more than Long.LENGTH bits. 
- */ @Test - public final void orCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public final void mergeInPlaceTest_Hasher() { - final AbstractBloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf1 = createFilter(shape, from1); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf2 = createFilter(shape, from11); - final AbstractBloomFilter bf2 = createFilter(hasher2, shape); + final BloomFilter bf3 = bf1.merge(bf2); - assertEquals(27, bf.orCardinality(bf2)); - assertEquals(27, bf2.orCardinality(bf)); - } + bf1.mergeInPlace(from11); - /** - * Tests that the xorCardinality calculations are correct. - */ - @Test - public final void xorCardinalityTest() { - xorCardinalityTest(this::createFilter); - } - - /** - * Tests that the xorCardinality calculations are correct with a generic BloomFilter. - */ - @Test - public final void xorCardinalityTest_GenericBloomFilter() { - xorCardinalityTest(this::createGenericFilter); + assertTrue("Should contain Bf2", bf1.contains(bf2)); + assertTrue("Should contain Bf3", bf1.contains(bf3)); } - /** - * Tests that the andCardinality calculations are correct. 
- * - * @param filterFactory the factory function to create the filter - */ - private void xorCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); - - assertEquals(20, bf.xorCardinality(bf2)); - } - - /** - * Tests that the xorCardinality calculations are correct when there are more than Long.LENGTH bits. - */ - @Test - public final void xorCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - final BloomFilter bf2 = createFilter(hasher2, shape); - - assertEquals(20, bf.xorCardinality(bf2)); - assertEquals(20, bf2.xorCardinality(bf)); - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java new file mode 100644 index 0000000000..95b54f2c02 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.HashMap; +import java.util.Map; +import org.junit.jupiter.api.Test; + +/** + * Common tests for {@link CountingBloomFilter} implementations. + */ +public abstract class AbstractCountingBloomFilterTest + extends AbstractBloomFilterTest { + protected int[] from1Counts = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + protected int[] from11Counts = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + protected int[] bigHashCounts = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + + protected final BitCountProducer maximumValueProducer = new BitCountProducer() { + + @Override + public void forEachCount(BitCountProducer.BitCountConsumer consumer) { + for (int i = 1; i < 18; i++) { + consumer.accept(i, Integer.MAX_VALUE); + } + } + }; + + /** + * Assert the counts match the expected values. Values are for indices starting + * at 0. Assert the cardinality equals the number of non-zero counts. 
+ * + * @param bf the bloom filter + * @param expected the expected counts + */ + private static void assertCounts(final CountingBloomFilter bf, final int[] expected) { + final Map<Integer, Integer> m = new HashMap<>(); + bf.forEachCount(m::put); + int zeros = 0; + for (int i = 0; i < expected.length; i++) { + if (m.get(i) == null) { + assertEquals(expected[i], 0, "Wrong value for " + i); + zeros++; + } else { + assertEquals(expected[i], m.get(i).intValue(), "Wrong value for " + i); + } + } + assertEquals(expected.length - zeros, bf.cardinality()); + } + + /** + * Tests that counts are correct when a hasher with duplicates is used in the + * constructor. + */ + @Test + public void constructorTest_Hasher_Duplicates() { + // from1 and from11 overlap at indices 11 through 17, so those counts become 2 + final CountingBloomFilter bf = createFilter(shape, from1); + bf.add(BitCountProducer.from(from11.indices(shape))); + + final long[] lb = BloomFilter.asBitMapArray(bf); + assertEquals(1, lb.length); + assertEquals(bigHashValue, lb[0]); + + assertCounts(bf, bigHashCounts); + } + + @Override + @Test + public void containsTest() { + final BloomFilter bf = new SimpleBloomFilter(shape, from1); + final CountingBloomFilter bf2 = createFilter(shape, bigHasher); + + assertTrue("BF Should contain itself", bf.contains(bf)); + assertTrue("BF2 Should contain itself", bf2.contains(bf2)); + assertFalse("BF should not contain BF2", bf.contains(bf2)); + assertTrue("BF2 should contain BF", bf2.contains(bf)); + BitMapProducer producer = bf2; + assertTrue("BF2 should contain BF bitMapProducer", bf2.contains(producer) ); + + } + + + /** + * Tests that merging bloom filters works as expected with a generic BloomFilter. 
+ */ + @Test + public final void mergeTest_Mixed() { + final BloomFilter bf1 = createFilter(shape, from1); + + final BloomFilter bf2 = new SimpleBloomFilter(shape, from11); + + final BloomFilter bf3 = bf1.merge(bf2); + assertTrue("Should contain", bf3.contains(bf1)); + assertTrue("Should contain", bf3.contains(bf2)); + + final BloomFilter bf4 = bf2.merge(bf1); + assertTrue("Should contain", bf4.contains(bf1)); + assertTrue("Should contain", bf4.contains(bf2)); + assertTrue("Should contain", bf4.contains(bf3)); + assertTrue("Should contain", bf3.contains(bf4)); + } + + /** + * Tests that add correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void addTest() { + final CountingBloomFilter bf1 = createFilter(shape, from1); + assertTrue("Add should work", bf1.add(createFilter(shape, from11))); + assertTrue("Should contain", bf1.contains(from1)); + assertTrue("Should contain", bf1.contains(from11)); + assertCounts(bf1, bigHashCounts); + + } + + @Test + public void addTest_overflow() { + + final CountingBloomFilter bf1 = createEmptyFilter(shape); + assertTrue("Should add to empty", bf1.add(maximumValueProducer)); + assertTrue("Should be valid", bf1.isValid()); + + assertFalse("Should not add", bf1.add(createFilter(shape, from1))); + assertFalse("Should not be valid", bf1.isValid()); + } + + /** + * Tests that subtract correctly updates the counts when a CountingBloomFilter is + * passed. 
+ */ + @Test + public void subtractTest() { + final CountingBloomFilter bf1 = createFilter(shape, from1); + bf1.add(BitCountProducer.from(from11.indices(shape))); + + final CountingBloomFilter bf2 = createFilter(shape, from11); + + assertTrue("Subtract should work", bf1.subtract(bf2)); + assertFalse("Should not contain bitHasher", bf1.contains(bigHasher)); + assertTrue("Should contain from1", bf1.contains(from1)); + + assertCounts(bf1, from1Counts); + + } + + /** + * Tests that subtract fails and leaves the filter invalid when the counts + * underflow. + */ + @Test + public void subtractTest_underflow() { + final CountingBloomFilter bf1 = createFilter(shape, from1); + + final CountingBloomFilter bf2 = createFilter(shape, from11); + + assertFalse("Subtract should not work", bf1.subtract(bf2)); + assertFalse("isValid should return false", bf1.isValid()); + assertFalse("Should not contain", bf1.contains(from1)); + assertFalse("Should not contain", bf1.contains(bf2)); + + assertCounts(bf1, new int[] { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }); + + } + + /** + * Tests that remove correctly updates the counts when a BloomFilter is + * passed. + */ + @Test + public void removeTest() { + final CountingBloomFilter bf1 = createFilter(shape, from1); + bf1.add(BitCountProducer.from(from11.indices(shape))); + + assertTrue("Remove should work", bf1.remove(new SimpleBloomFilter(shape, from11))); + assertFalse("Should not contain", bf1.contains(from11)); + assertTrue("Should contain", bf1.contains(from1)); + + assertCounts(bf1, from1Counts); + + } + + /** + * Tests that remove correctly updates the counts when a Hasher is + * passed. 
+ */ + @Test + public void removeTest_hasher() { + final CountingBloomFilter bf1 = createFilter(shape, from1); + bf1.add(BitCountProducer.from(from11.indices(shape))); + + assertTrue("Remove should work", bf1.remove(from11)); + assertFalse("Should not contain", bf1.contains(from11)); + assertTrue("Should contain", bf1.contains(from1)); + + assertCounts(bf1, from1Counts); + + } + + /** + * Tests that remove fails and leaves the filter invalid when the counts + * underflow. + */ + @Test + public void removeTest_underflow() { + final CountingBloomFilter bf1 = createFilter(shape, from1); + + final BloomFilter bf2 = new SimpleBloomFilter(shape, from11); + + assertFalse("Remove should not work", bf1.remove(bf2)); + assertFalse("isValid should return false", bf1.isValid()); + assertFalse("Should not contain", bf1.contains(from1)); + assertFalse("Should not contain", bf1.contains(bf2)); + + assertCounts(bf1, new int[] { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); + + } + + @Test + public void mergeTest_overflow() { + + final CountingBloomFilter bf1 = createEmptyFilter(shape); + assertTrue("Should add to empty", bf1.add(maximumValueProducer)); + assertTrue("Should be valid", bf1.isValid()); + + CountingBloomFilter bf2 = bf1.merge(new SimpleBloomFilter(shape, from1)); + assertFalse("Should not be valid", bf2.isValid()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index a661f93fde..117194b6a1 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -16,520 +16,23 @@ */ package org.apache.commons.collections4.bloomfilter; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static 
org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.ThreadLocalRandom; -import java.util.function.BiPredicate; -import java.util.function.Function; -import java.util.function.ToIntBiFunction; - import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.junit.jupiter.api.Test; /** * Tests for the {@link ArrayCountingBloomFilter}. */ -public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { - - /** - * Function to convert int arrays to BloomFilters for testing. - */ - private final Function converter = counts -> { - final BloomFilter testingFilter = new BitSetBloomFilter(shape); - testingFilter.merge(new FixedIndexesTestHasher(shape, counts)); - return testingFilter; - }; +public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTest { @Override - protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { + protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { return new ArrayCountingBloomFilter(shape); } @Override - protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final Shape shape) { - final ArrayCountingBloomFilter result = new ArrayCountingBloomFilter(shape); - result.merge( hasher ); - return result; - } - - private ArrayCountingBloomFilter createFromCounts(final int[] counts) { - // Use a dummy filter to add the counts to an empty filter - final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) { - @Override - public void forEachCount(final BitCountConsumer action) { - for (int i = 0; i < counts.length; i++) { - action.accept(i, counts[i]); - } - } - }; - final ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(shape); - bf.add(dummy); - return bf; - } - - /** - * Assert the counts match the expected values. Values are for indices starting - * at 0. 
Assert the cardinality equals the number of non-zero counts. - * - * @param bf the bloom filter - * @param expected the expected counts - */ - private static void assertCounts(final CountingBloomFilter bf, final int[] expected) { - final Map m = new HashMap<>(); - bf.forEachCount(m::put); - int zeros = 0; - for (int i = 0; i < expected.length; i++) { - if (m.get(i) == null) { - assertEquals(expected[i], 0, "Wrong value for " + i); - zeros++; - } else { - assertEquals(expected[i], m.get(i).intValue(), "Wrong value for " + i); - } - } - assertEquals(expected.length - zeros, bf.cardinality()); - } - - /** - * Tests that counts are correct when a hasher with duplicates is used in the - * constructor. - */ - @Test - public void constructorTest_Hasher_Duplicates() { - final int[] expected = {0, 1, 1, 0, 0, 1}; - // Some indexes with duplicates - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 2, 5); - - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); - assertEquals(1, lb.length); - assertEquals(0b100110L, lb[0]); - - assertCounts(bf, expected); - } - - /** - * Test the contains function with a standard Bloom filter. - * The contains function is tested using a counting Bloom filter in the parent test class. - */ - @Test - public void contains_BloomFilter() { - // Some indexes with duplicates - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 5); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - BitSetBloomFilter testingFilter = new BitSetBloomFilter(shape); - testingFilter.merge( new FixedIndexesTestHasher(shape, 3, 4)); - assertFalse(bf.contains(testingFilter)); - testingFilter = new BitSetBloomFilter(shape); - testingFilter.merge( new FixedIndexesTestHasher(shape, 2, 5)); - assertTrue(bf.contains(testingFilter)); - } - - /** - * Tests that merge correctly updates the counts when a CountingBloomFilter is - * passed. 
- */ - @Test - public void mergeTest_Counts_CountingBloomFilter() { - assertMerge(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), - BloomFilter::merge); - } - - /** - * Tests that merge correctly updates the counts when a BloomFilter is passed. - */ - @Test - public void mergeTest_Counts_BloomFilter() { - assertMerge(converter, BloomFilter::merge); - } - - /** - * Test that merge correctly updates the counts when a Hasher is passed. - */ - @Test - public void mergeTest_Counts_Hasher() { - assertMerge(counts -> new FixedIndexesTestHasher(shape, counts), - BloomFilter::merge); - } - - /** - * Test that merge correctly updates the counts when a Hasher is passed with duplicates. - */ - @Test - public void mergeTest_Counts_Hasher_Duplicates() { - assertMerge(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), - BloomFilter::merge); - } - - /** - * Tests that remove correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void removeTest_Counts_CountingBloomFilter() { - assertRemove(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), - CountingBloomFilter::remove); - } - - /** - * Tests that remove correctly updates the counts when a BloomFilter is passed. - */ - @Test - public void removeTest_Counts_BloomFilter() { - assertRemove(converter, CountingBloomFilter::remove); - } - - /** - * Test that remove correctly updates the counts when a Hasher is passed. - */ - @Test - public void removeTest_Counts_Hasher() { - assertRemove(counts -> new FixedIndexesTestHasher(shape, counts), - CountingBloomFilter::remove); - } - - /** - * Test that remove correctly updates the counts when a Hasher is passed with duplicates. - */ - @Test - public void removeTest_Counts_Hasher_Duplicates() { - assertRemove(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), - CountingBloomFilter::remove); - } - - /** - * Creates duplicates in the counts. 
- * - * @param counts the counts - * @return the new counts - */ - private static int[] createDuplicates(final int[] counts) { - // Duplicate some values randomly - final int length = counts.length; - final int[] countsWithDuplicates = Arrays.copyOf(counts, 2 * length); - for (int i = length; i < countsWithDuplicates.length; i++) { - // Copy a random value from the counts into the end position - countsWithDuplicates[i] = countsWithDuplicates[ThreadLocalRandom.current().nextInt(i)]; - } - return countsWithDuplicates; - } - - /** - * Assert a merge operation. The converter should construct a suitable object - * to remove the indices from the provided Bloom filter with the remove operation. - * - * @param the type of the filter - * @param converter the converter - * @param merge the merge operation - */ - private void assertMerge(final Function converter, - final BiPredicate merge) { - final int[] indexes1 = { 1, 2, 4, 5, 6}; - final int[] indexes2 = { 3, 4, 6}; - final int[] expected = {0, 1, 1, 1, 2, 1, 2}; - assertOperation(indexes1, indexes2, converter, merge, true, expected); - } - - /** - * Assert a remove operation. The converter should construct a suitable object - * to remove the indices from the provided Bloom filter with the remove operation. - * - * @param the type of the filter - * @param converter the converter - * @param remove the remove operation - */ - private void assertRemove(final Function converter, - final BiPredicate remove) { - final int[] indexes1 = { 1, 2, 4, 5, 6}; - final int[] indexes2 = { 2, 5, 6}; - final int[] expected = {0, 1, 0, 0, 1, 0, 0}; - assertOperation(indexes1, indexes2, converter, remove, true, expected); - } - - /** - * Assert a counting operation. The first set of indexes is used to create the - * CountingBloomFilter. The second set of indices is passed to the converter to - * construct a suitable object to combine with the counting Bloom filter. 
The counts - * of the first Bloom filter are checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param the type of the filter - * @param indexes1 the first set of indexes - * @param indexes2 the second set of indexes - * @param converter the converter - * @param operation the operation - * @param isValid the expected value for the operation result - * @param expected the expected counts after the operation - */ - private void assertOperation(final int[] indexes1, final int[] indexes2, - final Function converter, - final BiPredicate operation, - final boolean isValid, final int[] expected) { - final Hasher hasher = new FixedIndexesTestHasher(shape, indexes1); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final F filter = converter.apply(indexes2); - final boolean result = operation.test(bf, filter); - assertEquals(isValid, result); - assertEquals(isValid, bf.isValid()); - assertCounts(bf, expected); - } - - /** - * Tests that merge errors when the counts overflow the maximum integer value. - */ - @Test - public void mergeTest_Overflow() { - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - - final ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, Integer.MAX_VALUE}); - - // Small + 1 = OK - // should not fail as the counts are ignored - assertTrue(bf.merge(bf2)); - assertTrue(bf.isValid()); - assertCounts(bf, new int[] {0, 1, 2, 1}); - - // Big + 1 = Overflow - assertTrue(bf2.isValid()); - assertFalse(bf2.merge(bf)); - assertFalse(bf2.isValid(), "Merge should overflow and the filter is invalid"); - - // The counts are not clipped to max. They have simply overflowed. - // Note that this is a merge and the count is only incremented by 1 - // and not the actual count at each index. So it is not 2 + Integer.MAX_VALUE. - assertCounts(bf2, new int[] {0, 1, 1 + Integer.MAX_VALUE, 1}); - } - - /** - * Tests that removal errors when the counts become negative. 
- */ - @Test - public void removeTest_Negative() { - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - - final Hasher hasher2 = new FixedIndexesTestHasher(shape, 2); - final ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape); - - // More - Less = OK - bf.remove(bf2); - assertTrue(bf.isValid()); - assertCounts(bf, new int[] {0, 1, 0, 1}); - - // Less - More = Negative - assertTrue(bf2.isValid()); - bf2.remove(bf); - assertFalse(bf2.isValid(), "Remove should create negative counts and the filter is invalid"); - - // The counts are not clipped to zero. They have been left as negative. - assertCounts(bf2, new int[] {0, -1, 1, -1}); - } - - /** - * Tests that counts can be added to a new instance. - * - *

Note: This test ensures the CountingBloomFilter - * can be created with whatever counts are required for other tests. - */ - @Test - public void addTest_NewInstance() { - for (final int[] counts : new int[][] { - { /* empty */}, - {0, 0, 1}, - {0, 1, 2}, - {2, 3, 4}, - {66, 77, 0, 99}, - {Integer.MAX_VALUE, 42}, - }) { - assertCounts(createFromCounts(counts), counts); - } - } - - /** - * Test that add correctly ignores an empty CountingBloomFilter. - */ - @Test - public void addTest_Empty() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[0], - CountingBloomFilter::add, - true, - new int[] {5, 2, 1}); - } - - /** - * Test that add correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void addTest_Counts() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, 4, 1}, - CountingBloomFilter::add, - true, - new int[] {5, 8, 5, 1}); - } - - /** - * Test that add correctly updates the isValid state when a CountingBloomFilter is - * passed and an integer overflow occurs. - */ - @Test - public void addTest_Overflow() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, Integer.MAX_VALUE}, - CountingBloomFilter::add, - false, - new int[] {5, 8, 1 + Integer.MAX_VALUE}); - } - - /** - * Test that subtract correctly ignores an empty CountingBloomFilter. - */ - @Test - public void subtractTest_Empty() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[0], - CountingBloomFilter::subtract, - true, - new int[] {5, 2, 1}); - } - - /** - * Test that subtract correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void subtractTest_Counts() { - assertCountingOperation(new int[] {5, 9, 1, 1}, - new int[] {0, 2, 1}, - CountingBloomFilter::subtract, - true, - new int[] {5, 7, 0, 1}); - } - - /** - * Test that subtract correctly updates the isValid state when a CountingBloomFilter is - * passed and the counts become negative. 
- */ - @Test - public void subtractTest_Negative() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, 1}, - CountingBloomFilter::subtract, - false, - new int[] {5, -4, 0}); + protected ArrayCountingBloomFilter createFilter(Shape shape, Hasher hasher) { + ArrayCountingBloomFilter filter = createEmptyFilter(shape); + filter.add(BitCountProducer.from(hasher.indices(shape))); + return filter; } - /** - * Assert a counting operation. Two CountingBloomFilters are created from the - * two sets of counts. The operation is applied and the counts of the first - * Bloom filter is checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param counts1 the first set counts - * @param counts2 the first set counts - * @param operation the operation - * @param isValid the expected value for the operation result - * @param expected the expected counts after the operation - */ - private void assertCountingOperation(final int[] counts1, final int[] counts2, - final BiPredicate operation, - final boolean isValid, final int[] expected) { - final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); - final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); - final boolean result = operation.test(bf1, bf2); - assertEquals(isValid, result); - assertEquals(isValid, bf1.isValid()); - assertCounts(bf1, expected); - } - - /** - * Tests that the andCardinality calculation executes correctly when using a - * CountingBloomFilter argument. - */ - @Test - public void andCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::andCardinality, - 2); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::andCardinality, - 2); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::andCardinality, - 0); - } - - /** - * Tests that the orCardinality calculation executes correctly when using a - * CountingBloomFilter argument. - */ - @Test - public void orCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::orCardinality, - 2); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::orCardinality, - 6); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::orCardinality, - 5); - } - - /** - * Tests that the xorCardinality calculation executes correctly when using a - * CountingBloomFilter argument. 
- */ - @Test - public void xorCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::xorCardinality, - 0); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::xorCardinality, - 4); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::xorCardinality, - 5); - } - - /** - * Assert a cardinality operation. Two CountingBloomFilters are created from the - * two sets of counts. The operation is applied and the counts of the first - * Bloom filter is checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param counts1 the first set counts - * @param counts2 the first set counts - * @param operation the operation - * @param expected the expected cardinality - */ - private void assertCardinalityOperation(final int[] counts1, final int[] counts2, - final ToIntBiFunction operation, - final int expected) { - final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); - final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); - assertEquals(expected, operation.applyAsInt(bf1, bf2)); - assertEquals(expected, operation.applyAsInt(bf2, bf1)); - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java new file mode 100644 index 0000000000..e4a377b5a6 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.IntConsumer; + +import org.junit.Test; + +public class BitCountProducerTest { + + @Test + public void fromIndexProducer() { + IndexProducer iProducer = new IndexProducer() { + + @Override + public void forEachIndex(IntConsumer consumer) { + consumer.accept(0); + consumer.accept(1); + consumer.accept(63); + consumer.accept(64); + consumer.accept(127); + consumer.accept(128); + } + }; + BitCountProducer producer = BitCountProducer.from(iProducer); + Map<Integer, Integer> m = new HashMap<>(); + + producer.forEachCount((i, v) -> m.put(i, v)); + + assertEquals(6, m.size()); + assertEquals(Integer.valueOf(1), m.get(0)); + assertEquals(Integer.valueOf(1), m.get(1)); + assertEquals(Integer.valueOf(1), m.get(63)); + assertEquals(Integer.valueOf(1), m.get(64)); + assertEquals(Integer.valueOf(1), m.get(127)); + assertEquals(Integer.valueOf(1), m.get(128)); + + } + + @Test + public void forEachIndexTest() { + BitCountProducer producer = new BitCountProducer() { + + @Override + public void forEachCount(BitCountConsumer consumer) { + consumer.accept(1, 11); + consumer.accept(3, 13); + } + }; + + List<Integer> lst = new ArrayList<>(); + producer.forEachIndex( lst::add ); + assertEquals( 2, lst.size() ); + assertEquals( Integer.valueOf(1), lst.get(0) ); + assertEquals( Integer.valueOf(3), lst.get(1) ); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java new file mode 100644 index 0000000000..2cbff7c8b6 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.IntConsumer; + +import org.junit.Test; + +public class BitMapProducerTest { + + @Test + public void fromIndexProducer() { + IndexProducer iProducer = new IndexProducer() { + + @Override + public void forEachIndex(IntConsumer consumer) { + consumer.accept(0); + consumer.accept(1); + consumer.accept(63); + consumer.accept(64); + consumer.accept(127); + consumer.accept(128); + } + }; + BitMapProducer producer = BitMapProducer.fromIndexProducer(iProducer, new Shape(1, 200)); + List<Long> lst = new ArrayList<>(); + producer.forEachBitMap(lst::add); + long[] buckets = lst.stream().mapToLong(l -> l.longValue()).toArray(); + assertTrue(BitMap.contains(buckets, 0)); + assertTrue(BitMap.contains(buckets, 1)); + assertTrue(BitMap.contains(buckets, 63)); + assertTrue(BitMap.contains(buckets, 64)); + assertTrue(BitMap.contains(buckets, 127)); + assertTrue(BitMap.contains(buckets, 128)); + } + + @Test + public void fromLongArrayTest() { + long[] ary = new long[] {1L, 2L, 3L, 4L, 5L}; + BitMapProducer producer = 
BitMapProducer.fromLongArray( ary ); + List lst = new ArrayList(); + producer.forEachBitMap( lst::add ); + assertEquals( Long.valueOf(1), lst.get(0) ); + assertEquals( Long.valueOf(2), lst.get(1) ); + assertEquals( Long.valueOf(3), lst.get(2) ); + assertEquals( Long.valueOf(4), lst.get(3) ); + assertEquals( Long.valueOf(5), lst.get(4) ); + + } + + @Test + public void arrayBuilderTest() { + try { + new BitMapProducer.ArrayBuilder( new Shape( 1, 4 ), new long[] {1L, 2L, 3L, 4L, 5L }); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java new file mode 100644 index 0000000000..145a28aa7f --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Test; + +public class BitMapTest { + + @Test + public void checkPositiveTest() { + BitMap.checkPositive(0); + BitMap.checkPositive(0); + try { + BitMap.checkPositive(-1); + + } catch (IndexOutOfBoundsException expected) { + // do nothing + } + } + + @Test + public void getLongBitTest() { + assertEquals(1, BitMap.getLongBit(0)); + assertEquals(0x8000000000000000L, BitMap.getLongBit(63)); + assertEquals(1, BitMap.getLongBit(64)); + assertEquals(0x8000000000000000L, BitMap.getLongBit(127)); + assertEquals(1, BitMap.getLongBit(128)); + } + + @Test + public void getLongIndexTest() { + assertEquals(0, BitMap.getLongIndex(0)); + assertEquals(0, BitMap.getLongIndex(63)); + assertEquals(1, BitMap.getLongIndex(64)); + assertEquals(1, BitMap.getLongIndex(127)); + assertEquals(2, BitMap.getLongIndex(128)); + } + + @Test + public void isSparseTest() { + Shape shape = new Shape(17, 64); + assertTrue(BitMap.isSparse(0, shape)); + assertTrue(BitMap.isSparse(1, shape)); + assertTrue(BitMap.isSparse(2, shape)); + assertFalse(BitMap.isSparse(3, shape)); + + shape = new Shape(17, 64 * 3); + + for (int i = 0; i < 7; i++) { + assertTrue(BitMap.isSparse(i, shape)); + } + assertFalse(BitMap.isSparse(7, shape)); + } + + @Test + public void numberOfBitMapsTest() { + assertEquals("Number of bits 0", 0, BitMap.numberOfBitMaps(0)); + for (int i = 1; i < 65; i++) { + assertEquals(String.format("Number of bits %d", i), 1, BitMap.numberOfBitMaps(i)); + } + for (int i = 65; i < 129; i++) { + assertEquals(String.format("Number of bits %d", i), 2, BitMap.numberOfBitMaps(i)); + } + assertEquals("Number of bits 129", 3, BitMap.numberOfBitMaps(129)); + + } + + @Test + public void setTest() { + long[] bitMaps = new long[BitMap.numberOfBitMaps(129)]; + 
for (int i = 0; i < 129; i++) { + BitMap.set(bitMaps, i); + assertTrue(String.format("Failed at index: %d", i), BitMap.contains(bitMaps, i)); + } + assertEquals(0xFFFFFFFFFFFFFFFFL, bitMaps[0]); + assertEquals(0xFFFFFFFFFFFFFFFFL, bitMaps[1]); + assertEquals(1L, bitMaps[2]); + } + + @Test + public void containsTest() { + long[] bitMaps = new long[1]; + + for (int i = 0; i < 64; i++) { + bitMaps[0] = 0L; + BitMap.set(bitMaps, i); + for (int j = 0; j < 64; j++) { + if (j == i) { + assertTrue(String.format("Failed at index: %d for %d", i, j), BitMap.contains(bitMaps, j)); + } else { + assertFalse(String.format("Failed at index %d for %d", i, j), BitMap.contains(bitMaps, j)); + } + } + + } + } + + @Test + public void contains_boundaryConditionTest() { + long[] ary = new long[1]; + + assertFalse(BitMap.contains(ary, 0)); + ary[0] = 0x01; + assertTrue(BitMap.contains(ary, 0)); + + assertFalse(BitMap.contains(ary, 63)); + ary[0] = (1L << 63); + assertTrue(BitMap.contains(ary, 63)); + + ary = new long[2]; + assertFalse(BitMap.contains(ary, 64)); + ary[1] = 1; + assertTrue(BitMap.contains(ary, 64)); + + } + + @Test + public void checkRangeTest() { + try { + BitMap.checkRange( 1, Long.SIZE + 1); + fail( "Should have thrown IndexOutOfBoundsException" ); + } catch (IndexOutOfBoundsException expected) { + // + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java deleted file mode 100644 index ffd2d0d8c5..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.ArrayList; -import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; - -import static org.junit.jupiter.api.Assertions.assertThrows; - -/** - * Tests for the {@link BloomFilterIndexer}. - */ -public class BloomFilterIndexerTest { - - @Test - public void testCheckPositiveThrows() { - assertThrows(IndexOutOfBoundsException.class, () -> BloomFilterIndexer.checkPositive(-1)); - } - - @Test - public void testGetLongIndex() { - assertEquals(0, BloomFilterIndexer.getLongIndex(0)); - - for (final int index : getIndexes()) { - // getLongIndex is expected to identify a block of 64-bits (starting from zero) - assertEquals(index / Long.SIZE, BloomFilterIndexer.getLongIndex(index)); - - // Verify the behavior for negatives. It should produce a negative (invalid) - // as a simple trip for incorrect usage. 
- assertTrue(BloomFilterIndexer.getLongIndex(-index) < 0); - - // If index is not zero then when negated this is what a signed shift - // of 6-bits actually does - assertEquals(((1 - index) / Long.SIZE) - 1, - BloomFilterIndexer.getLongIndex(-index)); - } - } - - @Test - public void testGetLongBit() { - assertEquals(1L, BloomFilterIndexer.getLongBit(0)); - - for (final int index : getIndexes()) { - // getLongBit is expected to identify a single bit in a 64-bit block - assertEquals(1L << (index % Long.SIZE), BloomFilterIndexer.getLongBit(index)); - - // Verify the behavior for negatives - assertEquals(1L << (64 - (index & 0x3f)), BloomFilterIndexer.getLongBit(-index)); - } - } - - /** - * Gets non-zero positive indexes for testing. - * - * @return the indices - */ - private static int[] getIndexes() { - final Random rng = ThreadLocalRandom.current(); - final ArrayList indexes = new ArrayList<>(40); - for (int i = 0; i < 10; i++) { - // random positive numbers - indexes.add(rng.nextInt() >>> 1); - indexes.add(rng.nextInt(23647826)); - indexes.add(rng.nextInt(245)); - } - // Quickly remove zeros (as these cannot be negated) - indexes.removeIf(i -> i == 0); - // Add edge cases here - indexes.add(1); - indexes.add(2); - indexes.add(63); - indexes.add(64); - indexes.add(Integer.MAX_VALUE); - return indexes.stream().mapToInt(Integer::intValue).toArray(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java deleted file mode 100644 index 0d6443355c..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.BitSet; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; - -/** - * Test all the default implementations of the BloomFilter in {@link AbstractBloomFilter}. - */ -public class DefaultBloomFilterMethodsTest extends AbstractBloomFilterTest { - - /** - * A testing class that implements only the abstract methods from BloomFilter. - * - */ - private static class BF extends AbstractBloomFilter { - - /** - * The bits for this BloomFilter. - */ - private final BitSet bitSet; - - /** - * Constructs a BitSetBloomFilter from a hasher and a shape. - * - * @param hasher the Hasher to use. - * @param shape the desired shape of the filter. - */ - BF(final Hasher hasher, final Shape shape) { - this(shape); - verifyHasher(hasher); - hasher.iterator(shape).forEachRemaining((IntConsumer) bitSet::set); - } - - /** - * Constructs an empty BitSetBloomFilter. - * - * @param shape the desired shape of the filter. 
- */ - BF(final Shape shape) { - super(shape); - this.bitSet = new BitSet(); - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bitSet.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - verifyShape(other); - bitSet.or(BitSet.valueOf(other.getBits())); - return true; - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) bitSet::set); - return true; - } - } - - @Override - protected AbstractBloomFilter createEmptyFilter(final Shape shape) { - return new BF(shape); - } - - @Override - protected AbstractBloomFilter createFilter(final Hasher hasher, final Shape shape) { - return new BF(hasher, shape); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java new file mode 100644 index 0000000000..9d615eb6fa --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.TreeSet; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * Tests for the {@link BloomFilter}. + */ +public class DefaultBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected DefaultBloomFilter createEmptyFilter(final Shape shape) { + return new DefaultBloomFilter(shape); + } + + @Override + protected DefaultBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new DefaultBloomFilter(shape, hasher); + } + + public class DefaultBloomFilter implements BloomFilter { + private Shape shape; + private TreeSet indices; + + DefaultBloomFilter(Shape shape) { + this.shape = shape; + this.indices = new TreeSet(); + } + + DefaultBloomFilter(Shape shape, Hasher hasher) { + this( shape ); + hasher.indices(shape).forEachIndex( indices::add ); + } + + @Override + public void forEachIndex(IntConsumer consumer) { + indices.forEach( i -> consumer.accept( i.intValue() ) ); + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + BitMapProducer.fromIndexProducer(this, shape).forEachBitMap(consumer); + } + + @Override + public boolean isSparse() { + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex( i -> { + if (!indices.contains( i )) { + throw new NoMatchException(); + } + } ); + return true; + } catch (NoMatchException e) { + return false; + } + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains( IndexProducer.fromBitMapProducer(bitMapProducer) ); + } + + @Override + public boolean 
mergeInPlace(BloomFilter other) { + other.forEachIndex( indices::add ); + return true; + } + + @Override + public int cardinality() { + return indices.size(); + } + + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java deleted file mode 100644 index ec4886294c..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -import java.util.Arrays; -import java.util.PrimitiveIterator.OfInt; - -/** - * A Hasher implementation to return fixed indexes. Duplicates are allowed. - * The shape is ignored when generating the indexes. - * - *

This is not a real hasher and is used for testing only. - */ -class FixedIndexesTestHasher implements Hasher { - /** The shape. */ - private final Shape shape; - /** The indexes. */ - private final int[] indexes; - - /** - * Create an instance. - * - * @param shape the shape - * @param indexes the indexes - */ - FixedIndexesTestHasher(final Shape shape, final int... indexes) { - this.shape = shape; - this.indexes = indexes; - } - - @Override - public OfInt iterator(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException( - String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); - } - return Arrays.stream(indexes).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java deleted file mode 100644 index a10df81643..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.DynamicHasher; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; -import org.junit.jupiter.api.Test; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.PrimitiveIterator.OfInt; - -/** - * Tests the {@link HasherBloomFilter}. - */ -public class HasherBloomFilterTest extends AbstractBloomFilterTest { - - /** - * Tests that the constructor works correctly. - */ - @Test - public void constructorTest_NonStatic() { - final Shape shape = new Shape(new MD5Cyclic(), 3, 72, 17); - final DynamicHasher hasher = new DynamicHasher.Builder(new MD5Cyclic()).with("Hello", StandardCharsets.UTF_8).build(); - final HasherBloomFilter filter = createFilter(hasher, shape); - final long[] lb = filter.getBits(); - assertEquals(2, lb.length); - assertEquals(0x6203101001888c44L, lb[0]); - assertEquals(0x60L, lb[1]); - } - - @Override - protected AbstractBloomFilter createEmptyFilter(final Shape shape) { - return new HasherBloomFilter(shape); - } - - @Override - protected HasherBloomFilter createFilter(final Hasher hasher, final Shape shape) { - return new HasherBloomFilter(hasher, shape); - } - - /** - * Test the edge case where the filter is empty and the getBits() function returns a - * zero length array. 
- */ - @Test - public void getBitsTest_Empty() { - final BloomFilter filter = createEmptyFilter(shape); - assertArrayEquals(new long[0], filter.getBits()); - } - - /** - * Test the edge case where the filter has only 1 bit in the lowest index and the getBits() - * function returns an array of length 1. - */ - @Test - public void getBitsTest_LowestBitOnly() { - final BloomFilter filter = createEmptyFilter(shape); - // Set the lowest bit index only. - filter.merge(new Hasher() { - @Override - public OfInt iterator(final Shape shape) { - return Arrays.stream(new int[] {0}).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - }); - assertArrayEquals(new long[] {1L}, filter.getBits()); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java deleted file mode 100644 index c6c6a03b2e..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentityImpl; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Set; -import java.util.function.IntConsumer; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -/** - * Tests for the {@link IndexFilters}. - */ -public class IndexFilterTest { - - /** - * The shape of the dummy Bloom filter. - * This is used as an argument to a Hasher that just returns fixed indexes - * so the parameters do not matter. 
- */ - private final Shape shape = new Shape(new HashFunctionIdentityImpl( - "Apache Commons Collections", "Dummy", Signedness.SIGNED, ProcessType.CYCLIC, 0L), - 50, 3000, 4); - - @Test - public void testApplyThrowsWithNullArguments() { - final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final Shape shape = this.shape; - final ArrayList actual = new ArrayList<>(); - final IntConsumer consumer = actual::add; - - try { - IndexFilters.distinctIndexes(null, shape, consumer); - fail("null hasher"); - } catch (final NullPointerException expected) { - // Ignore - } - - try { - IndexFilters.distinctIndexes(hasher, null, consumer); - fail("null shape"); - } catch (final NullPointerException expected) { - // Ignore - } - - try { - IndexFilters.distinctIndexes(hasher, shape, null); - fail("null consumer"); - } catch (final NullPointerException expected) { - // Ignore - } - - // All OK together - IndexFilters.distinctIndexes(hasher, shape, consumer); - } - - @Test - public void testApply() { - assertFilter(1, 4, 6, 7, 9); - } - - @Test - public void testApplyWithDuplicates() { - assertFilter(1, 4, 4, 6, 7, 7, 7, 7, 7, 9); - } - - private void assertFilter(final int... indexes) { - final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, indexes); - final Set expected = Arrays.stream(indexes).boxed().collect(Collectors.toSet()); - final ArrayList actual = new ArrayList<>(); - - IndexFilters.distinctIndexes(hasher, shape, actual::add); - - assertEquals(expected.size(), actual.size()); - // Check the array has all the values. - // We do not currently check the order of indexes from the - // hasher.iterator() function. 
- for (final Integer index : actual) { - assertTrue(expected.contains(index)); - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java new file mode 100644 index 0000000000..7fd7b81512 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.LongConsumer; + +import org.junit.jupiter.api.Test; + +public class IndexProducerTest { + + @Test + public void fromBitMapProducerTest() { + TestingBitMapProducer producer = new TestingBitMapProducer(new long[] { 1L, 2L, 3L }); + IndexProducer underTest = IndexProducer.fromBitMapProducer(producer); + List lst = new ArrayList(); + + underTest.forEachIndex(lst::add); + assertEquals(4, lst.size()); + assertEquals(Integer.valueOf(0), lst.get(0)); + assertEquals(Integer.valueOf(1 + 64), lst.get(1)); + assertEquals(Integer.valueOf(0 + 128), lst.get(2)); + assertEquals(Integer.valueOf(1 + 128), lst.get(3)); + + producer = new TestingBitMapProducer(new long[] { 0xFFFFFFFFFFFFFFFFL }); + underTest = IndexProducer.fromBitMapProducer(producer); + lst = new ArrayList(); + + underTest.forEachIndex(lst::add); + + assertEquals(64, lst.size()); + for (int i = 0; i < 64; i++) { + assertEquals(Integer.valueOf(i), lst.get(i)); + } + + } + + private class TestingBitMapProducer implements BitMapProducer { + long[] values; + + TestingBitMapProducer(long[] values) { + this.values = values; + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + for (long l : values) { + consumer.accept(l); + } + } + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 541428989f..5c9b7cd405 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -17,14 +17,11 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.fail; -import 
java.util.List; import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; + +import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; +import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; /** @@ -32,88 +29,38 @@ */ public class SetOperationsTest { - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final Shape shape = new Shape(testFunction, 3, 72, 17); - - @Test - public void testDifferentShapesThrows() { - final List lst = Arrays.asList(1, 2); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - final Shape shape2 = new Shape(testFunction, 3, 72, 18); - final List lst2 = Arrays.asList(2, 3); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape2); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape2); - - try { - SetOperations.cosineDistance(filter1, filter2); - fail("Expected an IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // Ignore - } - } + protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final long from1Value = 0x3FFFEL; + protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final long from11Value = 0xFFFF800L; 
+ protected final HasherCollection bigHasher = new HasherCollection(from1, from11); + protected final long bigHashValue = 0xFFFFFFEL; + private final Shape shape = new Shape(17, 72); /** * Tests that the Cosine similarity is correctly calculated. */ @Test public final void cosineDistanceTest() { - List lst = Arrays.asList(1, 2); - Hasher hasher = new StaticHasher(lst.iterator(), shape); - BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(2, 3); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); - assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); - - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher = new StaticHasher(lst.iterator(), shape); - filter1 = new HasherBloomFilter(hasher, shape); - lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + Shape shape2 = new Shape(2, 72); + filter1 = new SimpleBloomFilter(shape2, from1); + filter2 = new SimpleBloomFilter(shape2, new SimpleHasher(2, 1)); - assertEquals(0.514928749927334, SetOperations.cosineDistance(filter1, filter2), 0.000000000000001); - assertEquals(0.514928749927334, SetOperations.cosineDistance(filter2, filter1), 0.000000000000001); + assertEquals(0.5, 
SetOperations.cosineDistance(filter1, filter2), 0.0001); + assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); + + filter1 = new SimpleBloomFilter(shape, from1); + filter2 = new SimpleBloomFilter(shape, from11); + + assertEquals(0.58823529, SetOperations.cosineDistance(filter1, filter2), 0.00000001); + assertEquals(0.58823529, SetOperations.cosineDistance(filter2, filter1), 0.00000001); } /** @@ -122,17 +69,14 @@ public final void cosineDistanceTest() { */ @Test public final void cosineDistanceTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape); + BloomFilter filter3 = new SimpleBloomFilter(shape); assertEquals(1.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter1, filter3), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter3, filter1), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter2, filter3), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter3, filter2), 0.0001); } /** @@ -140,23 +84,16 @@ public final void cosineDistanceTest_NoValues() { */ @Test public final void cosineSimilarityTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = 
new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(1.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter1, filter2), 0.000000000000001); - assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter2, filter1), 0.000000000000001); + assertEquals(0.41176470, SetOperations.cosineSimilarity(filter1, filter2), 0.00000001); + assertEquals(0.41176470, SetOperations.cosineSimilarity(filter2, filter1), 0.00000001); } /** @@ -165,12 +102,10 @@ public final void cosineSimilarityTest() { */ @Test public final void cosineSimilarityTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape); + final BloomFilter filter2 = new SimpleBloomFilter(shape); // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + final BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); @@ -178,92 +113,21 @@ public final void cosineSimilarityTest_NoValues() { assertEquals(0.0, SetOperations.cosineSimilarity(filter3, filter1), 0.0001); 
} - /** - * Tests that the intersection size estimate is correctly calculated. - */ - @Test - public final void estimateIntersectionSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - lst = Arrays.asList(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - final long estimate = SetOperations.estimateIntersectionSize(filter1, filter2); - assertEquals(1, estimate); - } - - /** - * Tests that the size estimate is correctly calculated. - */ - @Test - public final void estimateSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher = new StaticHasher(lst.iterator(), shape); - BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - assertEquals(1, SetOperations.estimateSize(filter1)); - - // the data provided above do not generate an estimate that is equivalent to the - // actual. - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); - hasher = new StaticHasher(lst.iterator(), shape); - filter1 = new HasherBloomFilter(hasher, shape); - assertEquals(1, SetOperations.estimateSize(filter1)); - - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 33); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(3, SetOperations.estimateSize(filter2)); - } - - /** - * Tests that the union size estimate is correctly calculated. 
- */ - @Test - public final void estimateUnionSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - lst = Arrays.asList(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - final long estimate = SetOperations.estimateUnionSize(filter1, filter2); - assertEquals(3, estimate); - } - /** * Tests that the Hamming distance is correctly calculated. */ @Test public final void hammingDistanceTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(0, SetOperations.hammingDistance(filter1, filter2)); assertEquals(0, SetOperations.hammingDistance(filter2, filter1)); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(17, SetOperations.hammingDistance(filter1, filter2)); - assertEquals(17, SetOperations.hammingDistance(filter2, filter1)); + assertEquals(20, SetOperations.hammingDistance(filter1, filter2)); + assertEquals(20, SetOperations.hammingDistance(filter2, 
filter1)); } /** @@ -271,23 +135,16 @@ public final void hammingDistanceTest() { */ @Test public final void jaccardDistanceTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(0.32, SetOperations.jaccardDistance(filter1, filter2), 0.001); - assertEquals(0.32, SetOperations.jaccardDistance(filter2, filter1), 0.001); + assertEquals(0.26, SetOperations.jaccardDistance(filter1, filter2), 0.001); + assertEquals(0.26, SetOperations.jaccardDistance(filter2, filter1), 0.001); } /** @@ -296,12 +153,9 @@ public final void jaccardDistanceTest() { */ @Test public final void jaccardDistanceTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape); + final BloomFilter filter2 = new 
SimpleBloomFilter(shape); + final BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); @@ -314,23 +168,16 @@ public final void jaccardDistanceTest_NoValues() { */ @Test public final void jaccardSimilarityTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(0.68, SetOperations.jaccardSimilarity(filter1, filter2), 0.001); - assertEquals(0.68, SetOperations.jaccardSimilarity(filter2, filter1), 0.001); + assertEquals(0.74, SetOperations.jaccardSimilarity(filter1, filter2), 0.001); + assertEquals(0.74, SetOperations.jaccardSimilarity(filter2, filter1), 0.001); } /** @@ -339,16 +186,72 @@ public final void jaccardSimilarityTest() { */ @Test public final void jaccardSimilarityTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape); + final BloomFilter filter2 = new SimpleBloomFilter(shape); + final BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); assertEquals(1.0, SetOperations.jaccardSimilarity(filter1, filter3), 0.0001); assertEquals(1.0, SetOperations.jaccardSimilarity(filter3, filter1), 0.0001); } + + @Test + public final void orCardinalityTest() { + Shape shape = new Shape(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(5, SetOperations.orCardinality(shape, filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(5, SetOperations.orCardinality(shape, filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(5, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(4, SetOperations.orCardinality(shape, filter1, filter2)); + assertEquals(4, SetOperations.orCardinality(shape, filter2, filter1)); + } + + @Test + public final void andCardinalityTest() { + Shape shape = new Shape(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(1, SetOperations.andCardinality(shape, filter1, filter2)); + assertEquals(1, 
SetOperations.andCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(0, SetOperations.andCardinality(shape, filter1, filter2)); + assertEquals(0, SetOperations.andCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(5, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(1, SetOperations.andCardinality(shape, filter1, filter2)); + assertEquals(1, SetOperations.andCardinality(shape, filter2, filter1)); + + } + + @Test + public final void xorCardinalityTest() { + Shape shape = new Shape(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(4, SetOperations.xorCardinality(shape, filter1, filter2)); + assertEquals(4, SetOperations.xorCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(5, SetOperations.xorCardinality(shape, filter1, filter2)); + assertEquals(5, SetOperations.xorCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(5, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(3, SetOperations.xorCardinality(shape, filter1, filter2)); + assertEquals(3, SetOperations.xorCardinality(shape, filter2, filter1)); + + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java new file mode 100644 index 0000000000..5e8c6ed1d2 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape.Factory} class. + */ +public class ShapeFactoryTest { + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + /** + * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. 
+ */ + @Test + public void badNumberOfItemsTest() { + try { + Shape.Factory.fromNM(0, 24); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK(0, 24, 5); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(0, 0.02); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of bits is less than 1 an exception is thrown + */ + @Test + public void badNumberOfBitsTest() { + try { + Shape.Factory.fromNM(5, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK(5, 0, 7); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK(0.035, 0, 7); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. 
+ */ + @Test + public void badNumberOfHashFunctionsTest() { + try { + Shape.Factory.fromNMK(5, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK(0.35, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown + */ + @Test + public void badProbabilityTest() { + try { + Shape.Factory.fromNMK(4000, 8, 1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(10, 0.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP(10, 1.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP(10, Double.NaN); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that when the number of items, number of bits and number of hash functions is passed the values are + * calculated correctly. + */ + @Test + public void fromNMK_test() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + */ + final Shape filterConfig = Shape.Factory.fromNMK(5, 24, 4); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(4, filterConfig.getNumberOfHashFunctions()); + assertEquals(0.102194782, filterConfig.getProbability(5), 0.000001); + } + + /** + * Tests that when the number of items and number of bits are passed the other values are calculated correctly. 
+ */ + @Test + public void fromNM_Test() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24 + */ + final Shape filterConfig = Shape.Factory.fromNM(5, 24); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(3, filterConfig.getNumberOfHashFunctions()); + assertEquals(0.100375138, filterConfig.getProbability(5), 0.000001); + } + + /** + * Tests that if the calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. + */ + @Test + public void numberOfBitsOverflowTest() { + try { + Shape.Factory.fromNP(Integer.MAX_VALUE, 0.1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that the probability is calculated correctly. + */ + @Test + public void probabilityTest() { + Shape shape = Shape.Factory.fromNMK(5, 24, 3); + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + } + + /** + * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash + * functions. + */ + @Test + public void fromPMK_test() { + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 + */ + final Shape shape = Shape.Factory.fromPMK(0.1, 24, 3); + + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + } + + /** + * Tests the calculated values of calling the factory with the number of items and the desired + * probability. 
+ */ + @Test + public void fromNP_test() { + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 + */ + final double probability = 1.0/2000000; + final Shape shape = Shape.Factory.fromNP(10, probability ); + + assertEquals(302, shape.getNumberOfBits()); + assertEquals(21, shape.getNumberOfHashFunctions()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java new file mode 100644 index 0000000000..67c7e53cfd --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. 
+ */ +public class ShapeTest { + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + private final Shape shape = new Shape(3, 24); + + /** + * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void constructor_items_bits_BadNumberOfBitsTest() { + try { + new Shape(5, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. + */ + @Test + public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { + try { + new Shape(0, 5); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Test equality of shape. + */ + @Test + public void equalsTest() { + + assertEquals(shape, shape); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(24, shape.getNumberOfBits()); + assertEquals(shape.hashCode(), new Shape(3, 24).hashCode()); + assertNotEquals(shape, null); + assertNotEquals(shape, new Shape(3, 25)); + assertNotEquals(shape, new Shape(4, 24)); + } + + @Test + public void estimateNTest() { + double[] expected = { 0.0, 0.3404769153503671, 0.6960910159170385, 1.068251140996181, 1.4585724543516367, + 1.8689188094520417, 2.301456579614247, 2.758723890333837, 3.243720864865314, 3.7600290339658846, + 4.311972005861497, 4.90483578309127, 5.545177444479562, 6.2412684603966, 7.003749898831201, + 7.8466340240938095, 8.788898309344876, 9.85714945034106, 11.090354888959125, 12.54892734331076, + 14.334075753824441, 16.635532333438686, 19.879253198304, 25.424430642783573 }; + for (int i = 0; i < 24; i++) { + assertEquals(expected[i], shape.estimateN(i), 0.00000000000000001); + } + } + + @Test + public void getProbabilityTest() { + double[] expected 
= { 0.0, 0.0016223626694561954, 0.010823077182670957, 0.030579354491777785, + 0.06091618422799686, 0.1003751381786711, 0.14689159766038104, 0.19829601428155866, 0.25258045782764715, + 0.3080221532988778, 0.3632228594351169, 0.4171013016177174, 0.4688617281200601, 0.5179525036637239, + 0.5640228015164387, 0.6068817738972262, 0.6464623147796981, 0.6827901771310362, 0.7159584363083427, + 0.7461068849672469, 0.7734057607554121, 0.7980431551369204, 0.8202154721379679, 0.8401203636727712 }; + for (int i = 0; i < 24; i++) { + assertEquals(expected[i], shape.getProbability(i), 0.000000000000001); + } + + assertEquals( 0.0, shape.getProbability(0), 0.0 ); + + try { + shape.getProbability( -1 ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expect) { + // do nothing + } + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java new file mode 100644 index 0000000000..5c0ef45082 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.junit.Test; + +/** + * Tests for the {@link SimpleBloomFilter}. + */ +public class SimpleBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected SimpleBloomFilter createEmptyFilter(final Shape shape) { + return new SimpleBloomFilter(shape); + } + + @Override + protected SimpleBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SimpleBloomFilter(shape, hasher); + } + + @Test + public void constructorTest() { + + SimpleBloomFilter filter = new SimpleBloomFilter( shape, BitMapProducer.fromLongArray( new long[] { 500L }) ); + List lst = new ArrayList(); + filter.forEachBitMap( lst::add ); + assertEquals( 1, lst.size() ); + assertEquals( 500L, lst.get(0).intValue() ); + + try { + filter = new SimpleBloomFilter( shape, + BitMapProducer.fromLongArray( new long[] { 500L, 400L, 300L }) ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java new file mode 100644 index 0000000000..e8f1845322 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.junit.Test; + +/** + * Tests for the {@link SparseBloomFilter}. + */ +public class SparseBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected SparseBloomFilter createEmptyFilter(final Shape shape) { + return new SparseBloomFilter(shape); + } + + @Override + protected SparseBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SparseBloomFilter(shape, hasher); + } + + @Test + public void constructor_indexOutOfRange() { + Shape shape = new Shape( 1, 5 ); + List lst = new ArrayList(); + lst.add( 5 ); + try { + new SparseBloomFilter( shape, lst ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing; + } + lst.clear(); + lst.add( -1 ); + try { + new SparseBloomFilter( shape, lst ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing; + } + } + + @Test + public void constructor_noValues() { + Shape shape = new Shape( 1, 5 ); + List lst = new ArrayList(); + new SparseBloomFilter( shape, lst ); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java deleted file mode 100644 index 
afbd6d8b0f..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.nio.charset.StandardCharsets; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator.OfInt; - -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** - * {@link DynamicHasher.Builder} tests. - */ -public class DynamicHasherBuilderTest { - - private DynamicHasher.Builder builder; - private final HashFunction hf = new MD5Cyclic(); - private final Shape shape = new Shape(hf, 1, 345, 1); - private final String testString = HasherBuilderTest.getExtendedString(); - - /** - * Tests that hashing a byte array works as expected. 
- */ - @Test - public void buildTest_byteArray() { - final byte[] bytes = testString.getBytes(); - final DynamicHasher hasher = builder.with(bytes).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that an empty hasher works as expected. - */ - @Test - public void buildTest_Empty() { - final DynamicHasher hasher = builder.build(); - - final OfInt iter = hasher.iterator(shape); - - assertFalse(iter.hasNext()); - try { - iter.nextInt(); - fail("Should have thrown NoSuchElementException"); - } catch (final NoSuchElementException ignore) { - // do nothing - } - } - - /** - * Tests that hashing a string works as expected. - */ - @Test - public void buildTest_String() { - final byte[] bytes = testString.getBytes(StandardCharsets.UTF_8); - final DynamicHasher hasher = builder.with(testString, StandardCharsets.UTF_8).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that hashing a string works as expected. - */ - @Test - public void buildTest_UnencodedString() { - final byte[] bytes = testString.getBytes(StandardCharsets.UTF_16LE); - final DynamicHasher hasher = builder.withUnencoded(testString).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that build resets the builder. 
- */ - @Test - public void buildResetTest() { - builder.with(new byte[] {123}); - final OfInt iter = builder.build().iterator(shape); - - assertTrue(iter.hasNext()); - iter.next(); - assertFalse(iter.hasNext()); - - // Nothing added since last build so it should be an empty hasher - final OfInt iter2 = builder.build().iterator(shape); - assertFalse(iter2.hasNext()); - } - - /** - * Sets up the builder for testing. - */ - @BeforeEach - public void setup() { - builder = new DynamicHasher.Builder(hf); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java deleted file mode 100644 index 7b2bbba3e8..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.nio.charset.StandardCharsets; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator.OfInt; - -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link DynamicHasher}. - */ -public class DynamicHasherTest { - private DynamicHasher.Builder builder; - private Shape shape; - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - /** - * Sets up the DynamicHasher. - */ - @BeforeEach - public void setup() { - builder = new DynamicHasher.Builder(new MD5Cyclic()); - shape = new Shape(new MD5Cyclic(), 3, 72, 17); - } - - /** - * Tests that the expected bits are returned from hashing. - */ - @Test - public void testGetBits() { - - final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62}; - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); - - final OfInt iter = hasher.iterator(shape); - - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that bits from multiple hashes are returned correctly. 
- */ - @Test - public void testGetBits_MultipleHashes() { - final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, - 59, 49, 39, 13, 3, 65, 55, 45, 35, 25}; - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).with("World", StandardCharsets.UTF_8).build(); - - final OfInt iter = hasher.iterator(shape); - - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - try { - iter.next(); - fail("Should have thrown NoSuchElementException"); - } catch (final NoSuchElementException ignore) { - // do nothing - } - } - - /** - * Tests that retrieving bits for the wrong shape throws an exception. - */ - @Test - public void testGetBits_WrongShape() { - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); - - try { - hasher.iterator(new Shape(testFunction, 3, 72, 17)); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java deleted file mode 100644 index 479cfa5188..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.junit.jupiter.api.Test; - -/** - * Tests the HashFunctionIdentity implementation ({@link HashFunctionIdentityImpl}).. - */ -public class HashFunctionIdentityImplTest { - - /** - * Tests a copy constructor of the HashFunctionIdentity. - */ - @Test - public void copyConstructorTest() { - final HashFunctionIdentity identity = new HashFunctionIdentity() { - - @Override - public String getName() { - return "NAME"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Provider"; - } - - @Override - public long getSignature() { - return -1L; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - final HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl(identity); - assertEquals("NAME", impl.getName()); - assertEquals("Provider", impl.getProvider()); - assertEquals(Signedness.SIGNED, impl.getSignedness()); - assertEquals(ProcessType.CYCLIC, impl.getProcessType()); - assertEquals(-1L, impl.getSignature()); - } - - /** - * Test the constructor from component values. 
- */ - @Test - public void valuesConstructorTest() { - final HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl("Provider", "NAME", Signedness.UNSIGNED, - ProcessType.ITERATIVE, -2L); - assertEquals("NAME", impl.getName()); - assertEquals("Provider", impl.getProvider()); - assertEquals(Signedness.UNSIGNED, impl.getSignedness()); - assertEquals(ProcessType.ITERATIVE, impl.getProcessType()); - assertEquals(-2L, impl.getSignature()); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java deleted file mode 100644 index e68df55b26..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.junit.jupiter.api.Test; - -/** - * Tests of the {@link HashFunctionValidator}. - */ -public class HashFunctionValidatorTest { - - /** - * Tests that name is used in the equality check. - */ - @Test - public void testName() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that name is not affected by case. - */ - @Test - public void testNameIsCaseInsensitive() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "IMPL1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl2)); - } - - /** - * Tests that process type is used in the equality check. 
- */ - @Test - public void testProcessType() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.ITERATIVE, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that provider is not used in the equality check. - */ - @Test - public void testProviderIsNotUsedInEqualityCheck() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertTrue(HashFunctionValidator.areEqual(impl1, impl2)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that signedness is used in the equality check. - */ - @Test - public void testSignedness() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Test the check method throws when the two hash functions are not equal. 
- */ - @Test - public void testCheckThrows() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, - ProcessType.CYCLIC, 300L); - assertThrows(IllegalArgumentException.class, () -> HashFunctionValidator.checkAreEqual(impl1, impl2)); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java deleted file mode 100644 index 303034053a..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder; -import org.apache.commons.lang3.NotImplementedException; -import org.junit.jupiter.api.Test; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -/** - * Tests the - * {@link org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder Hasher.Builder}. - */ -public class HasherBuilderTest { - - /** - * Simple class to collect byte[] items added to the builder. - */ - private static class TestBuilder implements Hasher.Builder { - ArrayList items = new ArrayList<>(); - - @Override - public Hasher build() { - throw new NotImplementedException("Not required"); - } - - @Override - public Builder with(final byte[] item) { - items.add(item); - return this; - } - } - - /** - * Tests that adding CharSequence items works correctly. - */ - @Test - public void withCharSequenceTest() { - final String ascii = "plain"; - final String extended = getExtendedString(); - for (final String s : new String[] {ascii, extended}) { - for (final Charset cs : new Charset[] { - StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8, StandardCharsets.UTF_16 - }) { - final TestBuilder builder = new TestBuilder(); - builder.with(s, cs); - assertArrayEquals(s.getBytes(cs), builder.items.get(0)); - } - } - } - - /** - * Tests that adding unencoded CharSequence items works correctly. 
- */ - @Test - public void withUnencodedCharSequenceTest() { - final String ascii = "plain"; - final String extended = getExtendedString(); - for (final String s : new String[] {ascii, extended}) { - final TestBuilder builder = new TestBuilder(); - builder.withUnencoded(s); - final byte[] encoded = builder.items.get(0); - final char[] original = s.toCharArray(); - // Should be twice the length - assertEquals(original.length * 2, encoded.length); - // Should be little endian (lower bits first) - final CharBuffer buffer = ByteBuffer.wrap(encoded) - .order(ByteOrder.LITTLE_ENDIAN).asCharBuffer(); - for (int i = 0; i < original.length; i++) { - assertEquals(original[i], buffer.get(i)); - } - } - } - - /** - * Gets a string with non-standard characters. - * - * @return the extended string - */ - static String getExtendedString() { - final char[] data = {'e', 'x', 't', 'e', 'n', 'd', 'e', 'd', ' ', - // Add some characters that are non standard - // non-ascii - 0xCA98, - // UTF-16 surrogate pair - 0xD803, 0xDE6D - // Add other cases here ... - }; - return String.valueOf(data); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java new file mode 100644 index 0000000000..bbcc91a359 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link HasherCollection}. + */ +public class HasherCollectionTest { + + private SimpleHasher hasher1 = new SimpleHasher(1, 1); + private SimpleHasher hasher2 = new SimpleHasher(2, 2); + + @Test + public void sizeTest() { + HasherCollection hasher = new HasherCollection(hasher1, hasher2); + assertEquals(2, hasher.size()); + HasherCollection hasher3 = new HasherCollection(hasher, new SimpleHasher(3, 3)); + assertEquals(3, hasher3.size()); + } + + @Test + public void isEmptyTest() { + HasherCollection hasher = new HasherCollection(); + assertTrue( hasher.isEmpty() ); + hasher.add( hasher1 ); + assertFalse( hasher.isEmpty() ); + } + + @Test + public void testIndices() { + HasherCollection hasher = new HasherCollection(hasher1, hasher2); + assertEquals(2, hasher.size()); + Shape shape = new Shape(5, 10); + Integer[] expected = { 1, 2, 3, 4, 5, 2, 4, 6, 8, 0 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < 
expected.length; i++) { + assertEquals(String.format("error at position %d", i), expected[i], lst.get(i)); + } + } + + @Test + public void testAdd_collection() { + HasherCollection hasher = new HasherCollection(); + hasher.add( Arrays.asList( hasher1, hasher2)); + assertEquals(2, hasher.size()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java new file mode 100644 index 0000000000..ce6d1aa7da --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Test; + +/** + * Tests the {@link Hasher.Filter}. 
+ */ +public class HasherFilterTest { + + @Test + public void testBasicFiltering() { + Hasher.Filter filter = new Hasher.Filter(10); + + for (int i = 0; i < 10; i++) { + assertTrue(filter.test(i)); + } + + for (int i = 0; i < 10; i++) { + assertFalse(filter.test(i)); + } + + try { + filter.test(10); + fail("Should have thrown IndexOutOfBounds exception"); + } catch (IndexOutOfBoundsException expected) { + // do nothing. + } + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java similarity index 52% rename from src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java index 9b0d9a83e1..d92b178883 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java @@ -14,38 +14,41 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.commons.collections4.bloomfilter.hasher.function; +package org.apache.commons.collections4.bloomfilter.hasher; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; import org.junit.jupiter.api.Test; /** - * Tests the MD5 cyclic hash function. + * Tests the {@link NullHasher}. 
*/ -public class MD5CyclicTest extends AbstractHashFunctionTest { +public class NullHasherTest { + + private Hasher hasher = NullHasher.INSTANCE; - /** - * Test that the apply function returns the proper values. - */ @Test - public void applyTest() { - final MD5Cyclic md5 = new MD5Cyclic(); - final long l1 = 0x8b1a9953c4611296L; - final long l2 = 0xa827abf8c47804d7L; - final byte[] buffer = "Hello".getBytes(); + public void sizeTest() { + assertEquals(0, hasher.size()); + } - long l = md5.apply(buffer, 0); - assertEquals(l1, l); - l = md5.apply(buffer, 1); - assertEquals(l1 + l2, l); - l = md5.apply(buffer, 2); - assertEquals(l1 + l2 + l2, l); + @Test + public void testIterator() { + Shape shape = new Shape(5, 10); + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(0, lst.size()); } - @Override - protected HashFunction createHashFunction() { - return new MD5Cyclic(); + @Test + public void isEmptyTest() { + assertTrue( hasher.isEmpty() ); } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java deleted file mode 100644 index 90f3808d8e..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.fail; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; - -import java.util.ArrayList; - -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link Shape} class. - */ -public class ShapeTest { - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - /* - * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= - * - * n = 5 - * - * p = 0.100375138 (1 in 10) - * - * m = 24 (3B) - * - * k = 3 - */ - - private final Shape shape = new Shape(testFunction, 5, 0.1); - - /** - * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. 
- */ - @Test - public void constructor_items_bits_BadNumberOfBitsTest() { - try { - new Shape(testFunction, 5, 0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_bits_BadNumberOfHashFunctionsTest() { - try { - new Shape(testFunction, 16, 8); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_bits_BadNumberOfItemsTest() { - try { - new Shape(testFunction, 0, 24); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of bits is less than 1 an exception is thrown - */ - @Test - public void constructor_items_bits_hash_BadNumberOfBitsTest() { - try { - new Shape(testFunction, 5, 0, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of hash functions is less than 1 an exception is thrown. - */ - @Test - public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { - try { - new Shape(testFunction, 5, 24, 0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of items is less than 1 an exception is thrown. 
- */ - @Test - public void constructor_items_bits_hash_BadNumberOfItemsTest() { - try { - new Shape(testFunction, 0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown - */ - @Test - public void constructor_items_bits_hash_BadProbabilityTest() { - try { - new Shape(testFunction, 4000, 8, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that when the number of items, number of bits and number of hash functions is passed the values are - * calculated correctly. - */ - @Test - public void constructor_items_bits_hashTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 - */ - final Shape filterConfig = new Shape(testFunction, 5, 24, 4); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(4, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); - } - - /** - * Tests that the number of items and number of bits is passed the other values are calculated correctly. - */ - @Test - public void constructor_items_bitsTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&m=24 - */ - final Shape filterConfig = new Shape(testFunction, 5, 24); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(3, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - } - - /** - * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. 
- */ - @Test - public void constructor_items_probability_BadNumberOfItemsTest() { - try { - new Shape(testFunction, 0, 1.0 / 10); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_BadProbabilityTest() { - try { - new Shape(testFunction, 10, 0.0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - try { - new Shape(testFunction, 10, 1.0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - try { - new Shape(testFunction, 10, Double.NaN); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_NumberOfBitsOverflowTest() { - try { - new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests the the probability is calculated correctly. - */ - @Test - public void constructor_items_probability_Test() { - - assertEquals(24, shape.getNumberOfBits()); - assertEquals(3, shape.getNumberOfHashFunctions()); - assertEquals(5, shape.getNumberOfItems()); - assertEquals(0.100375138, shape.getProbability(), 0.000001); - } - - /** - * Tests that the constructor with a null name, number of items and size of filter fails. 
- */ - @Test - public void constructor_nm_noName() { - try { - new Shape(null, 5, 72); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. - */ - @Test - public void constructor_nmk_noName() { - try { - new Shape(null, 5, 72, 17); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that the constructor with a null name, number of items, and probability fails. - */ - @Test - public void constructor_np_noName() { - try { - new Shape(null, 5, 0.1); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. - */ - @Test - public void constructor_pmk_noName() { - try { - new Shape(null, 0.1, 72, 17); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that if the number of bits is less than 1 an exception is thrown - */ - @Test - public void constructor_probability_bits_hash_BadNumberOfBitsTest() { - try { - new Shape(testFunction, 0.5, 0, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of functions is less than 1 an exception is thrown - */ - @Test - public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { - try { - new Shape(testFunction, 0.5, 24, 0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
- */ - @Test - public void constructor_probability_bits_hash_BadProbabilityTest() { - // probability should not be 0 - try { - new Shape(testFunction, 0.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be = -1 - try { - new Shape(testFunction, -1.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be < -1 - try { - new Shape(testFunction, -1.5, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be = 1 - try { - new Shape(testFunction, 1.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be > 1 - try { - new Shape(testFunction, 2.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash - * functions. - */ - @Test - public void constructor_probability_bits_hashTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 - */ - final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(3, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - } - - /** - * Test equality of shape. 
- */ - @Test - public void equalsTest() { - - assertEquals(shape, shape); - assertEquals(shape, new Shape(testFunction, 5, 1.0 / 10)); - assertNotEquals(shape, null); - assertNotEquals(shape, new Shape(testFunction, 5, 1.0 / 11)); - assertNotEquals(shape, new Shape(testFunction, 4, 1.0 / 10)); - // Number of bits does not change equality, - // only the number of bits and the number of hash functions - final int numberOfBits = 10000; - final int numberOfItems = 15; - final int numberOfHashFunctions = 4; - assertEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems + 1, numberOfBits, numberOfHashFunctions)); - assertNotEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems, numberOfBits + 1, numberOfHashFunctions)); - assertNotEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions + 1)); - - final HashFunctionIdentity testFunction2 = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function2"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - assertNotEquals(shape, new Shape(testFunction2, 4, 1.0 / 10)); - } - - /** - * Test that hashCode satisfies the contract between {@link Object#hashCode()} and - * {@link Object#equals(Object)}. Equal shapes must have the same hash code. 
- */ - @Test - public void hashCodeTest() { - // Hash function equality is based on process type, signedness and name (case insensitive) - final ArrayList list = new ArrayList<>(); - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Provider changes - list.add(new HashFunctionIdentityImpl("PROVIDER", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider2", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Name changes - list.add(new HashFunctionIdentityImpl("Provider", "name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider", "NAME", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider", "Other", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Signedness changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.UNSIGNED, ProcessType.ITERATIVE, 0L)); - // ProcessType changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.CYCLIC, 0L)); - // Signature changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 1L)); - - // Create shapes that only differ in the hash function. 
- final int numberOfItems = 30; - final int numberOfBits = 3000; - final int numberOfHashFunctions = 10; - final Shape shape1 = new Shape(list.get(0), numberOfItems, numberOfBits, numberOfHashFunctions); - assertEquals(shape1, shape1); - - // Try variations - for (int i = 1; i < list.size(); i++) { - final Shape shape2 = new Shape(list.get(i), numberOfItems, numberOfBits, numberOfHashFunctions); - assertEquals(shape2, shape2); - - // Equal shapes must have the same hash code - if (shape1.equals(shape2)) { - assertEquals(shape1.hashCode(), shape2.hashCode()); - } - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java new file mode 100644 index 0000000000..aef6190cf4 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.fail; +import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link SimpleHasher}. + */ +public class SimpleHasherTest { + + private SimpleHasher hasher = new SimpleHasher(1, 1); + + @Test + public void constructor_byteTest() { + try { + hasher = new SimpleHasher( new byte[0] ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. + } + } + @Test + public void sizeTest() { + assertEquals(1, hasher.size()); + } + + @Test + public void isEmptyTest() { + assertFalse( hasher.isEmpty() ); + } + + @Test + public void testIterator() { + Shape shape = new Shape(5, 10); + Integer[] expected = { 1, 2, 3, 4, 5 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + } + + @Test + public void constructorBufferTest() { + Shape shape = new Shape(5, 10); + byte[] buffer = { 1, 1 }; + SimpleHasher hasher = new SimpleHasher(buffer); + Integer[] expected = { 1, 2, 3, 4, 5 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + + buffer = new byte[] { 1 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 0, 1, 2, 3, 4 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + 
producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + + buffer = new byte[] { 1, 0, 1 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + + buffer = new byte[] { 0, 1, 0, 1 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + + buffer = new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + + buffer = new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + + buffer = new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i 
< expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java new file mode 100644 index 0000000000..81c19f8d60 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link SingleItemHasherCollection}. 
+ */ +public class SingleItemHasherCollectionTest { + + private SimpleHasher hasher1 = new SimpleHasher(1, 1); + private SimpleHasher hasher2 = new SimpleHasher(2, 2); + + + @Test + public void sizeTest() { + SingleItemHasherCollection hasher = new SingleItemHasherCollection(); + assertEquals(0, hasher.size() ); + hasher.add( NullHasher.INSTANCE ); + assertEquals(0, hasher.size()); + hasher.add( hasher1 ); + hasher.add( hasher2 ); + assertEquals(1, hasher.size()); + HasherCollection hasher3 = new SingleItemHasherCollection(hasher, new SimpleHasher(3, 3)); + assertEquals(1, hasher3.size()); + + } + + @Test + public void isEmptyTest() { + SingleItemHasherCollection hasher = new SingleItemHasherCollection(); + assertTrue( hasher.isEmpty() ); + hasher.add( NullHasher.INSTANCE ); + assertTrue( hasher.isEmpty() ); + hasher.add( hasher1 ); + assertFalse( hasher.isEmpty() ); + } + + + @Test + public void testIndices() { + HasherCollection hasher = new SingleItemHasherCollection(hasher1, hasher2); + Shape shape = new Shape(5, 10); + Integer[] expected = { 1, 2, 3, 4, 5, 6, 8, 0 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + } + + @Test + public void testAdd_collection() { + HasherCollection hasher = new SingleItemHasherCollection(); + hasher.add( Arrays.asList( hasher1, hasher2)); + assertEquals(1, hasher.size()); + Integer[] expected = { 1, 2, 3, 4, 5, 6, 8, 0 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(new Shape(5, 10)); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java 
b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java deleted file mode 100644 index c3d7c5c51e..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.PrimitiveIterator.OfInt; - -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link StaticHasher}. 
- */ -public class StaticHasherTest { - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test FunctionX"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final Shape shape = new Shape(testFunction, 3, 72, 17); - - /** - * Compare 2 static hashers to verify they have the same bits enabled. - * - * @param hasher1 the first static hasher. - * @param hasher2 the second static hasher. - */ - private void assertSameBits(final StaticHasher hasher1, final StaticHasher hasher2) { - final OfInt iter1 = hasher1.iterator(shape); - final OfInt iter2 = hasher2.iterator(shape); - - while (iter1.hasNext()) { - assertTrue(iter2.hasNext(), "Not enough data in second hasher"); - assertEquals(iter1.nextInt(), iter2.nextInt()); - } - assertFalse(iter2.hasNext(), "Too much data in second hasher"); - } - - /** - * Tests that passing a hasher other than a StaticHasher to the constructor works as - * expected. 
- */ - @Test - public void testConstructor_Hasher() { - final int[] expected = {1, 3, 5, 7, 9}; - - final Hasher testHasher = new Hasher() { - - @Override - public OfInt iterator(final Shape shape) { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - return Arrays.stream(values).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return testFunction; - } - }; - - final StaticHasher hasher = new StaticHasher(testHasher, shape); - final OfInt iter = hasher.iterator(shape); - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that passing a hasher other than a StaticHasher and the wrong Shape to the - * constructor throws an IllegalArgumentException. - */ - @Test - public void testConstructor_Hasher_WrongShape() { - final Hasher testHasher = new Hasher() { - - @Override - public OfInt iterator(final Shape shape) { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - return Arrays.stream(values).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return testFunctionX; - } - }; - - try { - new StaticHasher(testHasher, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Test that the iterator based constructor works correctly and removes duplicates. 
- */ - @Test - public void testConstructor_Iterator() { - - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, shape); - - assertEquals(5, hasher.size()); - assertEquals(shape, hasher.getShape()); - // All function properties are equal - assertEquals(testFunction.getName(), hasher.getHashFunctionIdentity().getName()); - assertEquals(testFunction.getProcessType(), hasher.getHashFunctionIdentity().getProcessType()); - assertEquals(testFunction.getProvider(), hasher.getHashFunctionIdentity().getProvider()); - assertEquals(testFunction.getSignedness(), hasher.getHashFunctionIdentity().getSignedness()); - - iter = hasher.iterator(shape); - int idx = 0; - while (iter.hasNext()) { - assertEquals(Integer.valueOf(values[idx]), iter.next(), "Error at idx " + idx); - idx++; - } - assertEquals(5, idx); - } - - /** - * Tests that if the iterator passed to the constructor contains a value greater than - * or equal to Shape.numberOfBits() an exception is thrown. - */ - @Test - public void testConstructor_Iterator_ValueTooBig() { - - final int[] values = {shape.getNumberOfBits(), 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - try { - new StaticHasher(iter, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Tests that if the iterator passed to the constructor contains a value less than 0 - * (zero) an exception is thrown. 
- */ - @Test - public void testConstructor_Iterator_ValueTooSmall() { - - final int[] values = {-1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - try { - new StaticHasher(iter, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Tests that the constructor that accepts a static hasher properly builds the hasher. - */ - @Test - public void testConstructor_StaticHasher() { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, shape); - - final StaticHasher hasher2 = new StaticHasher(hasher, shape); - assertEquals(shape, hasher2.getShape()); - assertSameBits(hasher, hasher2); - } - - /** - * Tests that calling the constructor with a hasher and the wrong shape throws an - * IllegalArgumentException. - */ - @Test - public void testConstructor_StaticHasher_WrongShape() { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, new Shape(testFunctionX, 3, 72, 17)); - - try { - new StaticHasher(hasher, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Tests that iterator returns the proper values. - */ - @Test - public void testGetBits() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - assertEquals(17, hasher.size()); - final OfInt iter = hasher.iterator(shape); - for (int i = 0; i < 17; i++) { - assertTrue(iter.hasNext()); - assertEquals(i, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that iterator does not return duplicates and orders the indices. 
- */ - @Test - public void testGetBits_DuplicateValues() { - final int[] input = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, 59, - 49, 39, 13, 3, 65, 55, 45, 35, 25}; - final int[] expected = {1, 2, 3, 6, 7, 10, 11, 13, 15, 17, 19, 23, 24, 25, 35, 36, 39, 43, 44, 45, 48, 49, 53, 55, 57, - 59, 61, 62, 63, 65, 69, 70}; - - final StaticHasher hasher = new StaticHasher(Arrays.stream(input).iterator(), shape); - - final OfInt iter = hasher.iterator(shape); - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that gitBits is called with the wrong shape an exception is thrown. - */ - @Test - public void testGetBits_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - - try { - hasher.iterator(new Shape(testFunctionX, 3, 72, 17)); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java deleted file mode 100644 index 5498d699cb..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.junit.jupiter.api.Test; - -/** - * Tests the signature of a hash function. - */ -public abstract class AbstractHashFunctionTest { - - /** - * Test that the signature is properly generated. - */ - @Test - public void signatureTest() { - final HashFunction hf = createHashFunction(); - final long expected = hf.apply(HashFunctionIdentity.prepareSignatureBuffer(hf), 0); - assertEquals(expected, hf.getSignature()); - // Should be repeatable - final long expected2 = hf.apply(HashFunctionIdentity.prepareSignatureBuffer(hf), 0); - assertEquals(expected, expected2); - assertEquals("Apache Commons Collections", hf.getProvider()); - } - - /** - * Creates the hash function. 
- * - * @return the hash function - */ - protected abstract HashFunction createHashFunction(); -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java deleted file mode 100644 index 9e17c2ec89..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Test that the Murmur3 128 x64 hash function works correctly. - */ -public class Murmur128x64CyclicTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. 
- */ - @Test - public void applyTest() { - final Murmur128x64Cyclic murmur = new Murmur128x64Cyclic(); - - final long l1 = 0xe7eb60dabb386407L; - final long l2 = 0xc3ca49f691f73056L; - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = murmur.apply(buffer, 0); - assertEquals(l1, l); - l = murmur.apply(buffer, 1); - assertEquals(l1 + l2, l); - l = murmur.apply(buffer, 2); - assertEquals(l1 + l2 + l2, l); - } - - @Override - protected HashFunction createHashFunction() { - return new Murmur128x64Cyclic(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java deleted file mode 100644 index bca60c1e4b..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Test that the Murmur3 32 x86 hash function works correctly. - */ -public class Murmur32x86IterativeTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final Murmur32x86Iterative murmur = new Murmur32x86Iterative(); - - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = murmur.apply(buffer, 0); - assertEquals(82674681, l); - l = murmur.apply(buffer, 1); - assertEquals(-1475490736, l); - l = murmur.apply(buffer, 2); - assertEquals(-1561435247, l); - } - - @Override - protected HashFunction createHashFunction() { - return new Murmur32x86Iterative(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java deleted file mode 100644 index 5595efdc77..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Tests that the Objects hash works correctly. - */ -public class ObjectsHashIterativeTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final ObjectsHashIterative obj = new ObjectsHashIterative(); - - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = obj.apply(buffer, 0); - long prev = 0; - assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); - for (int i = 1; i <= 5; i++) { - prev += l; - l = obj.apply(buffer, i); - assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); - } - } - - @Override - protected HashFunction createHashFunction() { - return new ObjectsHashIterative(); - } -}