From 3e9b850c317572c001094b3e4044ec6ef028e516 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Thu, 30 Sep 2021 18:52:27 +0100 Subject: [PATCH 01/27] Fixed some unit tests --- .../bloomfilter/AbstractBloomFilter.java | 200 ------ .../bloomfilter/ArrayCountingBloomFilter.java | 52 +- .../bloomfilter/BitSetBloomFilter.java | 126 ---- .../collections4/bloomfilter/BloomFilter.java | 311 ++++++++- .../bloomfilter/BloomFilterIndexer.java | 84 --- .../bloomfilter/CountingBloomFilter.java | 32 +- .../bloomfilter/HasherBloomFilter.java | 155 ----- .../bloomfilter/SetOperations.java | 162 ----- .../bloomfilter/SimpleBloomFilter.java | 97 +++ .../bloomfilter/SparseBloomFilter.java | 124 ++++ .../bloomfilter/hasher/DynamicHasher.java | 204 ------ .../bloomfilter/hasher/HashFunction.java | 49 -- .../hasher/HashFunctionIdentity.java | 173 ----- .../hasher/HashFunctionIdentityImpl.java | 86 --- .../hasher/HashFunctionValidator.java | 90 --- .../bloomfilter/hasher/Hasher.java | 70 +-- .../bloomfilter/hasher/HasherCollection.java | 139 +++++ .../bloomfilter/hasher/Shape.java | 270 +------- .../bloomfilter/hasher/SimpleHasher.java | 113 ++++ .../bloomfilter/hasher/StaticHasher.java | 142 ----- .../hasher/function/MD5Cyclic.java | 112 ---- .../hasher/function/Murmur128x64Cyclic.java | 94 --- .../hasher/function/Murmur32x86Iterative.java | 85 --- .../hasher/function/ObjectsHashIterative.java | 99 --- .../hasher/function/Signatures.java | 46 -- .../bloomfilter/AbstractBloomFilterTest.java | 590 ++++-------------- .../ArrayCountingBloomFilterTest.java | 6 +- .../collections4/bloomfilter/BitMaptTest.java | 90 +++ .../DefaultBloomFilterMethodsTest.java | 6 +- .../bloomfilter/HasherBloomFilterTest.java | 92 --- .../bloomfilter/SetOperationsTest.java | 62 +- ...erTest.java => SimpleBloomFilterTest.java} | 14 +- .../bloomfilter/SparseBloomFilterTest.java} | 21 +- .../hasher/DynamicHasherBuilderTest.java | 14 +- .../bloomfilter/hasher/DynamicHasherTest.java | 6 +- 
.../bloomfilter/hasher/StaticHasherTest.java | 28 +- .../function/AbstractHashFunctionTest.java | 50 -- .../hasher/function/MD5CyclicTest.java | 51 -- .../function/Murmur128x64CyclicTest.java | 54 -- .../function/Murmur32x86IterativeTest.java | 52 -- .../function/ObjectsHashIterativeTest.java | 55 -- 41 files changed, 1088 insertions(+), 3218 deletions(-) delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java delete mode 100644 
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitSetBloomFilterTest.java => SimpleBloomFilterTest.java} (70%) rename src/{main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java => test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java} (57%) delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java deleted file mode 100644 index 18e1fee029..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilter.java 
+++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.PrimitiveIterator.OfInt; -import java.util.function.LongBinaryOperator; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -/** - * An abstract Bloom filter providing default implementations for most Bloom filter - * functions. Specific implementations are encouraged to override the methods that can be - * more efficiently implemented. - *

- * This abstract class provides additional functionality not declared in the interface. - * Specifically: - *

- * - * @since 4.5 - */ -public abstract class AbstractBloomFilter implements BloomFilter { - - /** - * The shape used by this BloomFilter - */ - private final Shape shape; - - /** - * Construct a Bloom filter with the specified shape. - * - * @param shape The shape. - */ - protected AbstractBloomFilter(final Shape shape) { - this.shape = shape; - } - - @Override - public int andCardinality(final BloomFilter other) { - verifyShape(other); - final long[] mine = getBits(); - final long[] theirs = other.getBits(); - final int limit = Integer.min(mine.length, theirs.length); - int count = 0; - for (int i = 0; i < limit; i++) { - count += Long.bitCount(mine[i] & theirs[i]); - } - return count; - } - - @Override - public int cardinality() { - int count = 0; - for (final long bits : getBits()) { - count += Long.bitCount(bits); - } - return count; - } - - @Override - public boolean contains(final BloomFilter other) { - verifyShape(other); - return other.cardinality() == andCardinality(other); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final long[] buff = getBits(); - - final OfInt iter = hasher.iterator(shape); - while (iter.hasNext()) { - final int idx = iter.nextInt(); - BloomFilterIndexer.checkPositive(idx); - final int buffIdx = BloomFilterIndexer.getLongIndex(idx); - final long buffOffset = BloomFilterIndexer.getLongBit(idx); - if ((buff[buffIdx] & buffOffset) == 0) { - return false; - } - } - return true; - } - - @Override - public final Shape getShape() { - return shape; - } - - /** - * Determines if the bloom filter is "full". Full is defined as having no unset - * bits. - * - * @return true if the filter is full. - */ - public final boolean isFull() { - return cardinality() == getShape().getNumberOfBits(); - } - - @Override - public int orCardinality(final BloomFilter other) { - // Logical OR - return opCardinality(other, (a, b) -> a | b); - } - - /** - * Verifies that the hasher has the same name as the shape. 
- * - * @param hasher the Hasher to check - */ - protected void verifyHasher(final Hasher hasher) { - // It is assumed that the filter and hasher have been constructed using the - // same hash function. Use the signature for a fast check the hash function is equal. - // Collisions will occur at a rate of 1 in 2^64. - if (shape.getHashFunctionIdentity().getSignature() != hasher.getHashFunctionIdentity().getSignature()) { - throw new IllegalArgumentException( - String.format("Hasher (%s) is not the hasher for shape (%s)", - HashFunctionIdentity.asCommonString(hasher.getHashFunctionIdentity()), - shape.toString())); - } - } - - /** - * Verify the other Bloom filter has the same shape as this Bloom filter. - * - * @param other the other filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - protected void verifyShape(final BloomFilter other) { - verifyShape(other.getShape()); - } - - /** - * Verify the specified shape has the same shape as this Bloom filter. - * - * @param shape the other shape to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - protected void verifyShape(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", shape, this.shape)); - } - } - - @Override - public int xorCardinality(final BloomFilter other) { - // Logical XOR - return opCardinality(other, (a, b) -> a ^ b); - } - - /** - * Perform the operation on the matched longs from this filter and the other filter - * and count the cardinality. - * - *

The remaining unmatched longs from the larger filter are always counted. This - * method is suitable for OR and XOR cardinality. - * - * @param other the other Bloom filter. - * @param operation the operation (e.g. OR, XOR) - * @return the cardinality - */ - private int opCardinality(final BloomFilter other, final LongBinaryOperator operation) { - verifyShape(other); - final long[] mine = getBits(); - final long[] theirs = other.getBits(); - final long[] small; - final long[] big; - if (mine.length > theirs.length) { - big = mine; - small = theirs; - } else { - small = mine; - big = theirs; - } - int count = 0; - for (int i = 0; i < small.length; i++) { - count += Long.bitCount(operation.applyAsLong(small[i], big[i])); - } - for (int i = small.length; i < big.length; i++) { - count += Long.bitCount(big[i]); - } - return count; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 0722b92576..c7afc71cae 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -21,10 +21,10 @@ import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; import java.util.function.IntConsumer; +import java.util.stream.IntStream; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; /** * A counting Bloom filter using an array to track counts for each enabled bit @@ -53,7 +53,9 @@ * @see Shape * @since 4.5 */ -public class ArrayCountingBloomFilter extends AbstractBloomFilter implements CountingBloomFilter { +public class ArrayCountingBloomFilter implements CountingBloomFilter { + + private final Shape shape; /** * The count of each bit 
index in the filter. @@ -136,19 +138,18 @@ public int nextInt() { * @param shape the shape of the filter */ public ArrayCountingBloomFilter(final Shape shape) { - super(shape); + this.shape = shape; counts = new int[shape.getNumberOfBits()]; } + @Override + public boolean isSparse() { + return BitMap.isSparse( cardinality(), shape); + } + @Override public int cardinality() { - int size = 0; - for (final int c : counts) { - if (c != 0) { - size++; - } - } - return size; + return (int) IntStream.range( 0, counts.length ).filter( i -> counts[i] > 0 ).count(); } @Override @@ -158,20 +159,13 @@ public boolean contains(final BloomFilter other) { // Ideally we use an iterator of bit indexes to allow fail-fast on the // first bit index that is zero. if (other instanceof ArrayCountingBloomFilter) { - verifyShape(other); return contains(((ArrayCountingBloomFilter) other).iterator()); } - - // Note: - // This currently creates a StaticHasher which stores all the indexes. - // It would greatly benefit from direct generation of the index iterator - // avoiding the intermediate storage. - return contains(other.getHasher()); + return CountingBloomFilter.super.contains(other); } @Override public boolean contains(final Hasher hasher) { - verifyHasher(hasher); return contains(hasher.iterator(getShape())); } @@ -201,11 +195,6 @@ public long[] getBits() { return bs.toLongArray(); } - @Override - public StaticHasher getHasher() { - return new StaticHasher(iterator(), getShape()); - } - /** * Returns an iterator over the enabled indexes in this filter. * Any index with a non-zero count is considered enabled. 
@@ -218,13 +207,13 @@ private PrimitiveIterator.OfInt iterator() { } @Override - public boolean merge(final BloomFilter other) { + public boolean mergeInPlace(final BloomFilter other) { applyAsBloomFilter(other, this::increment); return isValid(); } @Override - public boolean merge(final Hasher hasher) { + public boolean mergeInPlace(final Hasher hasher) { applyAsHasher(hasher, this::increment); return isValid(); } @@ -285,7 +274,6 @@ public void forEachCount(final BitCountConsumer action) { * Apply the action for each index in the Bloom filter. */ private void applyAsBloomFilter(final BloomFilter other, final IntConsumer action) { - verifyShape(other); if (other instanceof ArrayCountingBloomFilter) { // Only use the presence of non-zero and not the counts final int[] counts2 = ((ArrayCountingBloomFilter) other).counts; @@ -303,7 +291,6 @@ private void applyAsBloomFilter(final BloomFilter other, final IntConsumer actio * Apply the action for each index in the hasher. */ private void applyAsHasher(final Hasher hasher, final IntConsumer action) { - verifyHasher(hasher); // We do not naturally handle duplicates so filter them. IndexFilters.distinctIndexes(hasher, getShape(), action); } @@ -312,7 +299,6 @@ private void applyAsHasher(final Hasher hasher, final IntConsumer action) { * Apply the action for each index in the Bloom filter. 
*/ private void applyAsCountingBloomFilter(final CountingBloomFilter other, final BitCountConsumer action) { - verifyShape(other); other.forEachCount(action); } @@ -361,4 +347,14 @@ private void subtract(final int idx, final int subtrahend) { state |= updated; counts[idx] = updated; } + + @Override + public int[] getIndices() { + return IntStream.range( 0, counts.length ).filter( i -> counts[i] > 0 ).toArray(); + } + + @Override + public Shape getShape() { + return shape; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java deleted file mode 100644 index de55cbe93d..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilter.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.BitSet; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; - -/** - * A bloom filter using a Java BitSet to track enabled bits. This is a standard - * implementation and should work well for most Bloom filters. - * @since 4.5 - */ -public class BitSetBloomFilter extends AbstractBloomFilter { - - /** - * The bitSet that defines this BloomFilter. - */ - private final BitSet bitSet; - - /** - * Constructs an empty BitSetBloomFilter. - * - * @param shape the desired shape of the filter. - */ - public BitSetBloomFilter(final Shape shape) { - super(shape); - this.bitSet = new BitSet(); - } - - @Override - public int andCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.and(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.andCardinality(other); - } - - @Override - public int cardinality() { - return bitSet.cardinality(); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final OfInt iter = hasher.iterator(getShape()); - while (iter.hasNext()) { - if (!bitSet.get(iter.nextInt())) { - return false; - } - } - return true; - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bitSet.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - verifyShape(other); - if (other instanceof BitSetBloomFilter) { - bitSet.or(((BitSetBloomFilter) other).bitSet); - } else { - bitSet.or(BitSet.valueOf(other.getBits())); - } - return true; - } 
- - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) bitSet::set); - return true; - } - - @Override - public int orCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.or(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.orCardinality(other); - } - - @Override - public int xorCardinality(final BloomFilter other) { - if (other instanceof BitSetBloomFilter) { - verifyShape(other); - final BitSet result = (BitSet) bitSet.clone(); - result.xor(((BitSetBloomFilter) other).bitSet); - return result.cardinality(); - } - return super.xorCardinality(other); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index af43ddd51e..defb01f424 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -16,9 +16,12 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Arrays; +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; + import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; /** * The interface that describes a Bloom filter. @@ -29,11 +32,12 @@ public interface BloomFilter { // Query Operations /** - * Gets the shape of this filter. - * - * @return the shape of this filter + * This method is used to determine the best method for matching. For `sparse` implementations the `getIndices()` + * method is more efficient. 
Implementers should determine if it is easier for the implementation to return an array of + * Indices (sparse) or a bit map as an array of unsigned longs. + * @return {@code true} if the filter is sparse, i.e. {@code getIndices()} is the more efficient representation. */ - Shape getShape(); + boolean isSparse(); /** * Gets an array of little-endian long values representing the bits of this filter. @@ -48,12 +52,17 @@ public interface BloomFilter { long[] getBits(); /** - * Creates a StaticHasher that contains the indexes of the bits that are on in this - * filter. - * - * @return a StaticHasher for that produces this Bloom filter + * Gets an array of indices of bits that are enabled. + * Array must be in sorted order. + * @return an array of indices for bits that are enabled in the filter. */ - StaticHasher getHasher(); + int[] getIndices(); + + /** + * Gets the shape that was used when the filter was built. + * @return The shape the filter was built with. + */ + Shape getShape(); /** * Returns {@code true} if this filter contains the specified filter. Specifically this @@ -62,11 +71,48 @@ public interface BloomFilter { * effectively {@code (this AND other) == other}. * * @param other the other Bloom filter - * @return true if this filter is enabled for all enabled bits in the other filter - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return true if all enabled bits in the other filter are enabled in this filter. 
*/ - boolean contains(BloomFilter other); + default boolean contains(BloomFilter other) { + if (isSparse()) { + int[] myIndicies = getIndices(); + if (other.isSparse()) { + int[] otherIndicies = other.getIndices(); + if (otherIndicies.length > myIndicies.length) { + return false; + } + return Arrays.stream( otherIndicies ).allMatch( i -> Arrays.binarySearch( myIndicies, i) >= 0); + } else { + BitIterator iter = new BitIterator( other.getBits() ); + while (iter.hasNext()) + { + if (Arrays.binarySearch( myIndicies, iter.next()) < 0) { + return false; + } + } + return true; + } + } else { + long[] myBits = getBits(); + if (other.isSparse()) { + return Arrays.stream( other.getIndices() ).allMatch( i -> BitMap.contains( myBits, i )); + } else { + long[] otherBits = other.getBits(); + if (myBits.length != otherBits.length) + { + return false; + } + for (int i=0;i=64) + { + offset=0; + bucket++; + } + if (bucket < bits.length && 0 != (bits[bucket] & (1L << offset))) { + next = (bucket*64)+offset; + } + } + return next >= 0; + } + + + @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. + @Override + public int nextInt() { + if (hasNext()) { + try { + return next; + } finally { + next = -1; + } + } + throw new NoSuchElementException(); + } + } + + /** + * Contains functions to convert {@code int} indices into Bloom filter bit positions. + */ + class BitMap { + /** A bit shift to apply to an integer to divided by 64 (2^6). */ + private static final int DIVIDE_BY_64 = 6; + + /** Do not instantiate. */ + private BitMap() {} + + /** + * Calculates the number of buckets required for the numberOfBits parameter. + * @param numberOfBits the number of bits to store in the array of buckets. + * @return the number of buckets necessary. + */ + public static int numberOfBuckets( int numberOfBits ) { + int bucket = numberOfBits >> DIVIDE_BY_64; + return bucket+1; + } + + /** + * Checks if the specified index bit is enabled in the array of bit buckets. 
+ * @param buckets The array of bit buckets + * @param idx the index of the bit to locate. + * @return {@code true} if the bit is enabled, {@code false} otherwise. + */ + public static boolean contains( long[] buckets, int idx ) { + return (buckets[ getLongIndex( idx )] & getLongBit( idx )) != 0; + } + + /** + * Check the index is positive. + * + * @param bitIndex the bit index + * @throws IndexOutOfBoundsException if the index is not positive + */ + public static void checkPositive(final int bitIndex) { + if (bitIndex < 0) { + throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); + } + } + + + /** + * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs + * to store bits starting at index 0. + * + *

The index is assumed to be positive. For a positive index the result will match + * {@code bitIndex / 64}. + * + *

The divide is performed using bit shifts. If the input is negative the behavior + * is not defined. + * + * @param bitIndex the bit index (assumed to be positive) + * @return the filter index + * @see #checkPositive(int) + */ + public static int getLongIndex(final int bitIndex) { + // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. + // We do not explicitly check for a negative here. Instead we use a + // a signed shift. Any negative index will produce a negative value + // by sign-extension and if used as an index into an array it will throw an exception. + return bitIndex >> DIVIDE_BY_64; + } + + /** + * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit + * longs to store bits starting at index 0. The returned value is a {@code long} with only + * 1 bit set. + * + *

The index is assumed to be positive. For a positive index the result will match + * {@code 1L << (bitIndex % 64)}. + * + *

If the input is negative the behavior is not defined. + * + * @param bitIndex the bit index (assumed to be positive) + * @return the filter bit + * @see #checkPositive(int) + */ + public static long getLongBit(final int bitIndex) { + // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this + // using 0x3f (63) or compute bitIndex % 64. + // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and + // this will identify an incorrect bit. + return 1L << bitIndex; + } + + /** + * Determines if a cardinality is sparse for the shape. + * Since the size of a bucket is a long and the size of an index is an int, there can be + * 2 indexes for each bucket. Since indexes are evenly distributed, sparse is defined as + * {@code numberOfBuckets*2 >= cardinality} + * @param cardinality the cardinality to check. + * @param shape the Shape to check against + * @return true if the cardinality is sparse within the bucket. + */ + public static boolean isSparse( int cardinality, Shape shape ) { + return numberOfBuckets(shape.getNumberOfBits()-1)*2 >= cardinality; + } + + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java deleted file mode 100644 index fe9b1161a9..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexer.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -/** - * Contains functions to convert {@code int} indices into Bloom filter bit positions. - */ -public final class BloomFilterIndexer { - /** A bit shift to apply to an integer to divided by 64 (2^6). */ - private static final int DIVIDE_BY_64 = 6; - - /** Do not instantiate. */ - private BloomFilterIndexer() {} - - /** - * Check the index is positive. - * - * @param bitIndex the bit index - * @throws IndexOutOfBoundsException if the index is not positive - */ - public static void checkPositive(final int bitIndex) { - if (bitIndex < 0) { - throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); - } - } - - /** - * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs - * to store bits starting at index 0. - * - *

The index is assumed to be positive. For a positive index the result will match - * {@code bitIndex / 64}. - * - *

The divide is performed using bit shifts. If the input is negative the behavior - * is not defined. - * - * @param bitIndex the bit index (assumed to be positive) - * @return the filter index - * @see #checkPositive(int) - */ - public static int getLongIndex(final int bitIndex) { - // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. - // We do not explicitly check for a negative here. Instead we use a - // a signed shift. Any negative index will produce a negative value - // by sign-extension and if used as an index into an array it will throw an exception. - return bitIndex >> DIVIDE_BY_64; - } - - /** - * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit - * longs to store bits starting at index 0. The returned value is a {@code long} with only - * 1 bit set. - * - *

The index is assumed to be positive. For a positive index the result will match - * {@code 1L << (bitIndex % 64)}. - * - *

If the input is negative the behavior is not defined. - * - * @param bitIndex the bit index (assumed to be positive) - * @return the filter bit - * @see #checkPositive(int) - */ - public static long getLongBit(final int bitIndex) { - // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this - // using 0x3f (63) or compute bitIndex % 64. - // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and - // this will identify an incorrect bit. - return 1L << bitIndex; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 0c414ebe93..40b71e471c 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -104,37 +104,6 @@ interface BitCountConsumer { // Modification Operations - /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all counts for - * indexes that are enabled in the {@code other} filter will be incremented by 1. - * - *

Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used. - * - *

This method will return true if the filter is valid after the operation. - * - * @param other {@inheritDoc} - * @return true if the merge was successful and the state is valid - * @throws IllegalArgumentException {@inheritDoc} - * @see #isValid() - */ - @Override - boolean merge(BloomFilter other); - - /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all - * counts for the distinct indexes that are identified by the {@code hasher} will - * be incremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. - * - *

This method will return true if the filter is valid after the operation. - * - * @param hasher {@inheritDoc} - * @return true if the merge was successful and the state is valid - * @throws IllegalArgumentException {@inheritDoc} - * @see #isValid() - */ - @Override - boolean merge(Hasher hasher); /** * Removes the specified Bloom filter from this Bloom filter. Specifically @@ -169,6 +138,7 @@ interface BitCountConsumer { */ boolean remove(Hasher hasher); + /** * Adds the specified counting Bloom filter to this Bloom filter. Specifically * all counts for the indexes identified by the {@code other} filter will be incremented diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java deleted file mode 100644 index 71272e65c4..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilter.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.Arrays; -import java.util.Set; -import java.util.TreeSet; -import java.util.PrimitiveIterator.OfInt; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; -import org.apache.commons.collections4.iterators.EmptyIterator; -import org.apache.commons.collections4.iterators.IteratorChain; - -/** - * A Bloom filter built on a single hasher. This filter type should only be used for small - * filters (few on bits). While this implementation correctly supports the merge() methods - * it is recommended that if merges are expected that one of the other Bloom filter - * implementations be used. - * @since 4.5 - */ -public class HasherBloomFilter extends AbstractBloomFilter { - /** The bit representation for an empty Bloom filter. */ - private static final long[] EMPTY = new long[0]; - - /** - * The internal hasher representation. - */ - private StaticHasher hasher; - - /** - * Constructs a HasherBloomFilter from a hasher and a shape. - * - * @param hasher the hasher to use. - * @param shape the shape of the Bloom filter. - */ - public HasherBloomFilter(final Hasher hasher, final Shape shape) { - super(shape); - verifyHasher(hasher); - if (hasher instanceof StaticHasher) { - this.hasher = (StaticHasher) hasher; - verifyShape(this.hasher.getShape()); - } else { - this.hasher = new StaticHasher(hasher, shape); - } - } - - /** - * Constructs an empty HasherBloomFilter from a shape. - * - * @param shape the shape of the Bloom filter. 
- */ - public HasherBloomFilter(final Shape shape) { - super(shape); - this.hasher = new StaticHasher(EmptyIterator.emptyIterator(), shape); - } - - @Override - public int cardinality() { - return hasher.size(); - } - - @Override - public boolean contains(final Hasher hasher) { - verifyHasher(hasher); - final Set set = new TreeSet<>(); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) idx -> { - set.add(idx); - }); - final OfInt iter = this.hasher.iterator(getShape()); - while (iter.hasNext()) { - final int idx = iter.nextInt(); - set.remove(idx); - if (set.isEmpty()) { - return true; - } - } - return false; - } - - @Override - public long[] getBits() { - if (hasher.isEmpty()) { - return EMPTY; - } - - // Note: This can be simplified if the StaticHasher exposed a getMaxIndex() - // method. Since it maintains an ordered list of unique indices the maximum - // is the last value in the iterator. Knowing this value would allow - // exact allocation of the long[]. - // For now we assume that the long[] will have a positive length and at least - // 1 bit set in the entire array. - - final int n = (int) Math.ceil(hasher.getShape().getNumberOfBits() * (1.0 / Long.SIZE)); - final long[] result = new long[n]; - final OfInt iter = hasher.iterator(hasher.getShape()); - iter.forEachRemaining((IntConsumer) idx -> { - BloomFilterIndexer.checkPositive(idx); - final int buffIdx = BloomFilterIndexer.getLongIndex(idx); - final long buffOffset = BloomFilterIndexer.getLongBit(idx); - result[buffIdx] |= buffOffset; - }); - - int limit = result.length; - - // Assume the array has a non-zero length and at least 1 bit set. - // This is tested using assertions. - assert limit > 0 : "Number of bits in Shape is 0"; - while (result[limit - 1] == 0) { - limit--; - // If the hasher was not empty it is not possible to return - // an array of length zero. 
- assert limit > 0 : "Hasher reported a non-zero size but has no indices"; - } - if (limit < result.length) { - return Arrays.copyOf(result, limit); - } - return result; - } - - @Override - public StaticHasher getHasher() { - return hasher; - } - - @Override - public boolean merge(final BloomFilter other) { - return merge(other.getHasher()); - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - final IteratorChain iter = new IteratorChain<>(this.hasher.iterator(getShape()), - hasher.iterator(getShape())); - this.hasher = new StaticHasher(iter, getShape()); - return true; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java deleted file mode 100644 index 48c43620ad..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -/** - * Implementations of set operations on Bloom filters. 
- * - */ -public final class SetOperations { - - /** - * Calculates the Cosine distance between two Bloom filters. - * - *

Cosine distance is defined as {@code 1 - Cosine similarity}

- * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return the jaccard distance. - */ - public static double cosineDistance(final BloomFilter first, final BloomFilter second) { - return 1.0 - cosineSimilarity(first, second); - } - - /** - * Calculates the Cosine similarity between two Bloom filters. - *

Also known as Orchini similarity and the Tucker coefficient of congruence or - * Ochiai similarity.

- * - *

If either filter is empty (no enabled bits) the result is 0 (zero)

- * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return the Cosine similarity. - */ - public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int numerator = first.andCardinality(second); - return numerator == 0 ? 0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); - } - - /** - * Estimates the number of items in the intersection of the sets represented by two - * Bloom filters. - * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return an estimate of the size of the intersection between the two filters. - */ - public static long estimateIntersectionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - // do subtraction early to avoid Long overflow. - return estimateSize(first) - estimateUnionSize(first, second) + estimateSize(second); - } - - /** - * Estimates the number of items in the Bloom filter based on the shape and the number - * of bits that are enabled. - * - * @param filter the Bloom filter to estimate size for. - * @return an estimate of the number of items that were placed in the Bloom filter. - */ - public static long estimateSize(final BloomFilter filter) { - final Shape shape = filter.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - filter.cardinality() * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); - } - - /** - * Estimates the number of items in the union of the sets represented by two - * Bloom filters. - * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return an estimate of the size of the union between the two filters. 
- */ - public static long estimateUnionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final Shape shape = first.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - first.orCardinality(second) * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); - } - - /** - * Calculates the Hamming distance between two Bloom filters. - * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return the Hamming distance. - */ - public static int hammingDistance(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - return first.xorCardinality(second); - } - - /** - * Calculates the Jaccard distance between two Bloom filters. - * - *

Jaccard distance is defined as {@code 1 - Jaccard similarity}

- * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return the Jaccard distance. - */ - public static double jaccardDistance(final BloomFilter first, final BloomFilter second) { - return 1.0 - jaccardSimilarity(first, second); - } - - /** - * Calculates the Jaccard similarity between two Bloom filters. - * - *

Also known as Jaccard index, Intersection over Union, and Jaccard similarity coefficient

- * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return the Jaccard similarity. - */ - public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int orCard = first.orCardinality(second); - // if the orCard is zero then the hamming distance will also be zero. - return orCard == 0 ? 0 : hammingDistance(first, second) / (double) orCard; - } - - /** - * Verifies the Bloom filters have the same shape. - * - * @param first the first filter to check. - * @param second the second filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - private static void verifyShape(final BloomFilter first, final BloomFilter second) { - if (!first.getShape().equals(second.getShape())) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", - first.getShape(), second.getShape())); - } - } - - /** - * Do not instantiate. - */ - private SetOperations() {} -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java new file mode 100644 index 0000000000..fc8b2eda3c --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.BitSet; +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; + +/** + * A bloom filter using a Java BitSet to track enabled bits. This is a standard + * implementation and should work well for most Bloom filters. + * @since 4.5 + */ +public class SimpleBloomFilter implements BloomFilter { + + /** + * The bitSet that defines this BloomFilter. + */ + private final BitSet bitSet; + private final Shape shape; + + /** + * Constructs an empty BitSetBloomFilter. 
+ * + */ + public SimpleBloomFilter(Shape shape) { + this.shape = shape; + this.bitSet = new BitSet(); + } + + public SimpleBloomFilter(final Shape shape, Hasher hasher) { + this( shape ); + hasher.iterator(shape).forEachRemaining( (IntConsumer) i -> bitSet.set(i)); + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + if (other.isSparse()) { + Arrays.stream(other.getIndices()).forEach( s -> bitSet.set( s )); + } else { + bitSet.or( BitSet.valueOf(other.getBits() )); + } + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean isSparse() { + return false; + } + + @Override + public int cardinality() { + return bitSet.cardinality(); + } + + @Override + public long[] getBits() { + return bitSet.toLongArray(); + } + + @Override + public int[] getIndices() { + int[] result = new int[ bitSet.cardinality() ]; + int idx = 0; + for (int i=0;i indices; + private final Shape shape; + + /** + * Constructs an empty BitSetBloomFilter. 
+ * + */ + public SparseBloomFilter(Shape shape) { + this.shape = shape; + this.indices = new TreeSet(); + } + + public SparseBloomFilter(final Shape shape, Hasher hasher) { + this( shape ); + hasher.iterator(shape).forEachRemaining( (IntConsumer) i -> indices.add( i )); + } + + public SparseBloomFilter(Shape shape, List indices) { + this(shape); + this.indices.addAll( indices ); + } + + @Override + public boolean mergeInPlace(Hasher hasher) { + PrimitiveIterator.OfInt iter = hasher.iterator(shape); + while (iter.hasNext()) { + indices.add( iter.next() ); + } + return true; + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + for (int i : other.getIndices()) { + indices.add(i); + } + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean isSparse() { + return true; + } + + @Override + public int cardinality() { + return indices.size(); + } + + @Override + public long[] getBits() { + if (cardinality() == 0) { + return new long[0]; + } + long[] result = new long[ BitMap.numberOfBuckets( indices.last() )]; + for (Integer idx : indices) + { + result[ BitMap.getLongIndex( idx.intValue()) ] |= BitMap.getLongBit(idx.intValue()); + } + return result; + } + + @Override + public int[] getIndices() { + int[] result = new int[ indices.size() ]; + int i=0; + for (int value : indices ) { + result[i++]=value; + } + return result; + } + + + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java deleted file mode 100644 index ab6b773d6c..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; - -/** - * The class that performs hashing on demand. - * @since 4.5 - */ -public class DynamicHasher implements Hasher { - - /** - * The builder for DynamicHashers. - * @since 4.5 - */ - public static class Builder implements Hasher.Builder { - - /** - * The list of items (each as a byte[]) that are to be hashed. - */ - private final List buffers; - - /** - * The function that the resulting DynamicHasher will use. - */ - private final HashFunction function; - - /** - * Constructs a DynamicHasher builder. - * - * @param function the function implementation. 
- */ - public Builder(final HashFunction function) { - this.function = function; - this.buffers = new ArrayList<>(); - } - - @Override - public DynamicHasher build() throws IllegalArgumentException { - // Assumes the hasher will create a copy of the buffers - final DynamicHasher hasher = new DynamicHasher(function, buffers); - // Reset for further use - buffers.clear(); - return hasher; - } - - @Override - public final DynamicHasher.Builder with(final byte[] property) { - buffers.add(property); - return this; - } - - @Override - public DynamicHasher.Builder with(final CharSequence item, final Charset charset) { - Hasher.Builder.super.with(item, charset); - return this; - } - - @Override - public DynamicHasher.Builder withUnencoded(final CharSequence item) { - Hasher.Builder.super.withUnencoded(item); - return this; - } - } - - /** - * The iterator of integers. - * - *

This assumes that the list of buffers is not empty. - */ - private class Iterator implements PrimitiveIterator.OfInt { - /** The number of hash functions per item. */ - private final int k; - /** The number of bits in the shape. */ - private final int m; - /** The current item. */ - private byte[] item; - /** The index of the next item. */ - private int nextItem; - /** The count of hash functions for the current item. */ - private int functionCount; - - /** - * Constructs iterator with the specified shape. - * - * @param shape - */ - private Iterator(final Shape shape) { - // Assumes that shape returns non-zero positive values for hash functions and bits - k = shape.getNumberOfHashFunctions(); - m = shape.getNumberOfBits(); - // Assume non-empty - item = buffers.get(0); - nextItem = 1; - } - - @Override - public boolean hasNext() { - if (functionCount != k) { - return true; - } - // Reached the number of hash functions for the current item. - // Try and advance to the next item. - if (nextItem != buffers.size()) { - item = buffers.get(nextItem++); - functionCount = 0; - return true; - } - // Finished. - // functionCount == shape.getNumberOfHashFunctions() - // nextItem == buffers.size() - return false; - } - - @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. - @Override - public int nextInt() { - if (hasNext()) { - return (int) Math.floorMod(function.apply(item, functionCount++), - // Cast to long to workaround a bug in animal-sniffer. - (long) m); - } - throw new NoSuchElementException(); - } - } - - /** - * An iterator of integers to use when there are no values. - */ - private static class NoValuesIterator implements PrimitiveIterator.OfInt { - /** The singleton instance. */ - private static final NoValuesIterator INSTANCE = new NoValuesIterator(); - - /** - * Empty constructor. 
- */ - private NoValuesIterator() {} - - @Override - public boolean hasNext() { - return false; - } - - @Override - public int nextInt() { - throw new NoSuchElementException(); - } - } - - /** - * The list of byte arrays that are to be hashed. - * Package private for access by the iterator. - */ - final List buffers; - - /** - * The function to hash the buffers. - * Package private for access by the iterator. - */ - final HashFunction function; - - /** - * Constructs a DynamicHasher. - * - * @param function the function to use. - * @param buffers the byte buffers that will be hashed. - */ - public DynamicHasher(final HashFunction function, final List buffers) { - this.buffers = new ArrayList<>(buffers); - this.function = function; - } - - @Override - public PrimitiveIterator.OfInt iterator(final Shape shape) { - HashFunctionValidator.checkAreEqual(getHashFunctionIdentity(), - shape.getHashFunctionIdentity()); - // Use optimised iterator for no values - return buffers.isEmpty() ? NoValuesIterator.INSTANCE : new Iterator(shape); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return function; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java deleted file mode 100644 index d14fd3d830..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunction.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -/** - * Defines a hash function used by a {@link Hasher} . - * @since 4.5 - */ -public interface HashFunction extends HashFunctionIdentity { - - /** - * Applies the hash function to the buffer. - * - * @param buffer the buffer to apply the hash function to. - * @param seed the seed for the hashing. - * @return the long value of the hash. - */ - long apply(byte[] buffer, int seed); - - /** - * Gets the signature of this function. - * - *

The signature of this function is calculated as: - *


-     * int seed = 0;
-     * apply(String.format("%s-%s-%s",
-     *                     getName().toUpperCase(Locale.ROOT), getSignedness(), getProcess())
-     *             .getBytes("UTF-8"), seed);
-     * 
- * - * @see HashFunctionIdentity#prepareSignatureBuffer(HashFunctionIdentity) - */ - @Override - long getSignature(); -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java deleted file mode 100644 index 0ff2edb8d4..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentity.java +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.nio.charset.StandardCharsets; -import java.util.Locale; - -/** - * Defines the hash function used by a {@link Hasher}. - * - * @since 4.5 - */ -public interface HashFunctionIdentity { - - /** - * Identifies the process type of this function. - * - *
- *
Iterative processes
- *
Call the underlying hash algorithm for each (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)}.
- *
Cyclic processes
- *
Call the underlying hash algorithm using a (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)} to initialize the state. Subsequent - * calls can generate hash values without calling the underlying algorithm.
- *
- */ - enum ProcessType { - /** - * Call the underlying hash algorithm for a (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)} when the state is uninitialized or - * the seed is zero. This initializes the state. Subsequent calls with a non-zero - * seed use the state to generate a new value. - */ - CYCLIC, - /** - * Call the underlying hash algorithm for each (buffer, seed) pair passed to - * {@link HashFunction#apply(byte[], int)}. - */ - ITERATIVE - } - - /** - * Identifies the signedness of the calculations for this function. - *

- * When the hash function executes it typically returns an array of bytes. - * That array is converted into one or more numerical values which will be provided - * as a {@code long} primitive type. - * The signedness identifies if those {@code long} values are signed or unsigned. - * For example a hash function that outputs only 32-bits can be unsigned if converted - * using {@link Integer#toUnsignedLong(int)}. A hash function that outputs more than - * 64-bits is typically signed. - *

- */ - enum Signedness { - /** - * The result of {@link HashFunction#apply(byte[], int)} is signed, - * thus the sign bit may be set. - * - *

- * The result can be used with {@code Math.floorMod(x, y)} to generate a positive - * value if y is positive. - *

- * - * @see Math#floorMod(int, int) - */ - SIGNED, - /** - * The result of {@link HashFunction#apply(byte[], int)} is unsigned, - * thus the sign bit is never set. - * - *

- * The result can be used with {@code x % y} to generate a positive - * value if y is positive. - *

- */ - UNSIGNED - } - - /** - * Gets a common formatted string for general display. - * - * @param identity the identity to format. - * @return the String representing the identity. - */ - static String asCommonString(final HashFunctionIdentity identity) { - return String.format("%s-%s-%s", identity.getName(), identity.getSignedness(), identity.getProcessType()); - } - - /** - * Gets a {@code byte[]} buffer for a HashFunctionIdentity to create a signature. The - * {@code byte[]} is composed using properties of the hash function as: - * - *

-     * String.format("%s-%s-%s",
-     *               getName().toUpperCase(Locale.ROOT), getSignedness(), getProcess())
-     *       .getBytes("UTF-8");
-     * 
- * - * @param identity The HashFunctionIdentity to create the buffer for. - * @return the signature buffer for the identity - * @see #getSignature() - */ - static byte[] prepareSignatureBuffer(final HashFunctionIdentity identity) { - return String.format("%s-%s-%s", - identity.getName().toUpperCase(Locale.ROOT), identity.getSignedness(), - identity.getProcessType()).getBytes(StandardCharsets.UTF_8); - } - - /** - * Gets the name of this hash function. - *

- * Hash function should be the common name - * for the hash. This may include indications as to hash length - *

- *

- * Names are not case specific. Thus, "MD5" and "md5" should be considered as the same. - *

- * @return the Hash name - */ - String getName(); - - /** - * Gets the process type of this function. - * - * @return process type of this function. - */ - ProcessType getProcessType(); - - /** - * Gets the name of the provider of this hash function implementation. - *

- * Provider names are not case specific. Thus, "Apache Commons Collection" and - * "apache commons collection" should be considered as the same. - *

- * @return the name of the provider of this hash implementation. - */ - String getProvider(); - - /** - * Gets the signature of this function. The signature is the output of the hash function - * when applied to a set of bytes composed using properties of the hash function. - * - *

- * Implementations should define the method used to generate the signature. - *

- * - * @return the signature of this function. - * @see #prepareSignatureBuffer(HashFunctionIdentity) - */ - long getSignature(); - - /** - * Gets the signedness of this function. - * - * @return signedness of this function. - */ - Signedness getSignedness(); -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java deleted file mode 100644 index c75973a376..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImpl.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -/** - * An instance of HashFunctionIdentity that is suitable for deserializing - * HashFunctionIdentity data from a stream or any other situation where the - * hash function is not available but the identify of the function is required. 
- * - * @since 4.5 - */ -public final class HashFunctionIdentityImpl implements HashFunctionIdentity { - private final String name; - private final String provider; - private final Signedness signedness; - private final ProcessType process; - private final long signature; - - /** - * Creates a copy of the HashFunctionIdentity. - * @param identity the identity to copy. - */ - public HashFunctionIdentityImpl(final HashFunctionIdentity identity) { - this.name = identity.getName(); - this.provider = identity.getProvider(); - this.signedness = identity.getSignedness(); - this.process = identity.getProcessType(); - this.signature = identity.getSignature(); - } - - /** - * Creates a HashFunctionIdentity from component values. - * @param provider the name of the provider. - * @param name the name of the hash function. - * @param signedness the signedness of the hash function. - * @param process the processes of the hash function. - * @param signature the signature for the hash function. - */ - public HashFunctionIdentityImpl(final String provider, final String name, final Signedness signedness, final ProcessType process, - final long signature) { - this.name = name; - this.provider = provider; - this.signedness = signedness; - this.process = process; - this.signature = signature; - } - - @Override - public String getName() { - return name; - } - - @Override - public ProcessType getProcessType() { - return process; - } - - @Override - public String getProvider() { - return provider; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return signedness; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java deleted file mode 100644 index 3ec0753e4a..0000000000 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidator.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Locale; -import java.util.Objects; - -/** - * Contains validation for hash functions. - */ -public final class HashFunctionValidator { - /** Do not instantiate. */ - private HashFunctionValidator() {} - - /** - * Generates a hash code for the identity of the hash function. The hash code is - * generated using the same properties as those tested in - * {@link #areEqual(HashFunctionIdentity, HashFunctionIdentity)}, that is the - * signedness, process type and name. The name is not case specific and is converted - * to lower-case using the {@link Locale#ROOT root locale}. - * - *

The generated value is suitable for use in generation of a hash code that satisfies - * the contract of {@link Object#hashCode()} if the {@link Object#equals(Object)} method - * is implemented using {@link #areEqual(HashFunctionIdentity, HashFunctionIdentity)}. That - * is two objects considered equal will have the same hash code. - * - *

If the hash function identity is a field within a larger object the generated hash code - * should be incorporated into the entire hash, for example using - * {@link Objects#hash(Object...)}. - * - * @param a hash function. - * @return hash code - * @see String#toLowerCase(Locale) - * @see Locale#ROOT - */ - static int hash(final HashFunctionIdentity a) { - return Objects.hash(a.getSignedness(), - a.getProcessType(), - a.getName().toLowerCase(Locale.ROOT)); - } - - /** - * Compares the identity of the two hash functions. The functions are considered - * equal if the signedness, process type and name are equal. The name is not - * case specific. - * - *

A pair of functions that are equal would be expected to produce the same - * hash output from the same input. - * - * @param a First hash function. - * @param b Second hash function. - * @return true, if successful - * @see String#equalsIgnoreCase(String) - */ - public static boolean areEqual(final HashFunctionIdentity a, final HashFunctionIdentity b) { - return (a.getSignedness() == b.getSignedness() && - a.getProcessType() == b.getProcessType() && - a.getName().equalsIgnoreCase(b.getName())); - } - - /** - * Compares the identity of the two hash functions and throws an exception if they - * are not equal. - * - * @param a First hash function. - * @param b Second hash function. - * @see #areEqual(HashFunctionIdentity, HashFunctionIdentity) - * @throws IllegalArgumentException if the hash functions are not equal - */ - public static void checkAreEqual(final HashFunctionIdentity a, final HashFunctionIdentity b) { - if (!areEqual(a, b)) { - throw new IllegalArgumentException(String.format("Hash functions are not equal: (%s) != (%s)", - HashFunctionIdentity.asCommonString(a), HashFunctionIdentity.asCommonString(b))); - } - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 3700567f1a..03d1488e7e 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -16,7 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; -import java.nio.charset.Charset; import java.util.PrimitiveIterator; /** @@ -45,66 +44,6 @@ */ public interface Hasher { - /** - * A builder to build a hasher. - * - *

A hasher represents one or more items of arbitrary byte size. The builder - * contains methods to collect byte representations of items. Each method to add - * to the builder will add an entire item to the final hasher created by the - * {@link #build()} method. - * - * @since 4.5 - */ - interface Builder { - - /** - * Builds the hasher from all the items. - * - *

This method will clear the builder for future use. - * - * @return the fully constructed hasher - */ - Hasher build(); - - /** - * Adds a byte array item to the hasher. - * - * @param item the item to add - * @return a reference to this object - */ - Builder with(byte[] item); - - /** - * Adds a character sequence item to the hasher using the specified {@code charset} - * encoding. - * - * @param item the item to add - * @param charset the character set - * @return a reference to this object - */ - default Builder with(final CharSequence item, final Charset charset) { - return with(item.toString().getBytes(charset)); - } - - /** - * Adds a character sequence item to the hasher. Each 16-bit character is - * converted to 2 bytes using little-endian order. - * - * @param item the item to add - * @return a reference to this object - */ - default Builder withUnencoded(final CharSequence item) { - final int length = item.length(); - final byte[] bytes = new byte[length * 2]; - for (int i = 0; i < length; i++) { - final char ch = item.charAt(i); - bytes[i * 2] = (byte) ch; - bytes[i * 2 + 1] = (byte) (ch >>> 8); - } - return with(bytes); - } - } - /** * Gets an iterator of integers that are the bits to enable in the Bloom * filter based on the shape. @@ -124,9 +63,10 @@ default Builder withUnencoded(final CharSequence item) { PrimitiveIterator.OfInt iterator(Shape shape); /** - * Gets the identify of the hash function used by the the hasher. - * - * @return the identity of the hash function + * Gets the number of items that will be hashed by the iterator. + * @return The number of items that will be hashed by the iterator. 
*/ - HashFunctionIdentity getHashFunctionIdentity(); + int size(); + + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java new file mode 100644 index 0000000000..42350114ef --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.stream.Collectors; + +/** + * The class that performs hashing on demand. + * @since 4.5 + */ +public class HasherCollection implements Hasher { + + + /** + * The list of hashers to be used to generate the iterator. + * Package private for access by the iterator. + */ + final List hashers; + + /** + * Constructs an empty HasherCollection. + */ + public HasherCollection() { + this.hashers = new ArrayList<>(); + } + + /** + * Constructs a DynamicHasher. 
+ * + * @param hashers A collection of Hashers to build the iterator with. + */ + public HasherCollection(final Collection hashers) { + this.hashers = new ArrayList<>(hashers); + } + + /** + * Constructs a HasherCollection from the given hashers. + * + * @param hashers the Hashers to build the iterator with; they are + * copied into a new internal list, so later changes to the array are not seen. + */ + public HasherCollection(Hasher... hashers) { + this( Arrays.asList(hashers)); + } + /** Adds a single hasher to the end of this collection. */ + public void add(Hasher hasher) { + hashers.add(hasher); + } + /** Adds every hasher in the given collection to the end of this collection. */ + public void add(Collection hashers) { + this.hashers.addAll(hashers); // qualify with this: the parameter shadows the field + } + + @Override + public PrimitiveIterator.OfInt iterator(final Shape shape) { + return new Iterator(shape); + } + + @Override + public int size() { + int i = 0; + for (Hasher h : hashers ) + { + i += h.size(); + } + return i; + } + + /** + * The iterator of integers. + * + *

Indices are produced by iterating each hasher of the collection in turn. + */ + private class Iterator implements PrimitiveIterator.OfInt { + + /** The iterator over the hashers */ + private final java.util.Iterator wrappedIterator; + + /** The shape of the filter we are creating */ + private final Shape shape; + + /** The iterator over the internal hasher */ + private PrimitiveIterator.OfInt current; + + + /** + * Constructs iterator with the specified shape. + * + * @param shape the shape passed to each hasher to obtain its indices. + */ + private Iterator(final Shape shape) { + this.shape = shape; + wrappedIterator = hashers.iterator(); + current = null; + } + + @Override + public boolean hasNext() { + // Advance past exhausted iterators. A hasher may produce no indices at all + // (for example an empty nested HasherCollection), so keep advancing in a + // loop; moving forward only once would report the end of iteration while + // later hashers still have indices to return. + while ((current == null || !current.hasNext()) && wrappedIterator.hasNext()) { + current = wrappedIterator.next().iterator(shape); + } + return current != null && current.hasNext(); + } + + /** Returns the next index, moving on to the next hasher when the current one is exhausted. */ + @Override + public int nextInt() { + if (hasNext()) { + return current.nextInt(); + } + throw new NoSuchElementException(); + } + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java index a82586fe4e..bcac204c4f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java @@ -18,6 +18,8 @@ import java.util.Objects; +import org.apache.commons.collections4.bloomfilter.BloomFilter; + /** * The definition of a Bloom filter shape. * @@ -46,132 +48,17 @@ public final class Shape { /** - * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. - */ - private static final double LN_2 = Math.log(2.0); - - /** - * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. - * - *

ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) + * Number of hash functions to create a filter ({@code k}). */ - private static final double DENOMINATOR = -LN_2 * LN_2; - - /** - * Number of items in the filter ({@code n}). - */ - private final int numberOfItems; + private final int numberOfHashFunctions; /** * Number of bits in the filter ({@code m}). */ private final int numberOfBits; - /** - * Number of hash functions ({@code k}). - */ - private final int numberOfHashFunctions; - - /** - * The hash code for this filter. - */ - private final int hashCode; - - /** - * The identity of the hasher function. - */ - private final HashFunctionIdentity hashFunctionIdentity; - - /** - * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the - * specified number of bits ({@code m}) and hash functions ({@code k}). - * - *

The number of items ({@code n}) to be stored in the filter is computed. - *

n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
- * - *

The actual probability will be approximately equal to the - * desired probability but will be dependent upon the calculated Bloom filter capacity - * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @param numberOfBits The number of bits in the filter - * @param numberOfHashFunctions The number of hash functions in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; - * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual - * probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final double probability, final int numberOfBits, - final int numberOfHashFunctions) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - checkProbability(probability); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); - // Number of items (n): - // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) - final double n = Math.ceil(numberOfBits / - (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); - // log of probability is always < 0 - // number of hash functions is >= 1 - // e^x where x < 0 = [0,1) - // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 - // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 - // ceil( >0 ) >= 1 - // so we can not produce a negative value thus we don't check for it. 
- // - // similarly we can not produce a number greater than numberOfBits so we - // do not have to check for Integer.MAX_VALUE either. - this.numberOfItems = (int) n; - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } - - /** - * Constructs a filter configuration with the specified number of items ({@code n}) and - * desired false-positive probability ({@code p}). - * - *

The number of bits ({@code m}) for the filter is computed. - *

m = ceil(n * ln(p) / ln(1 / 2^ln(2)))
- * - *

The optimal number of hash functions ({@code k}) is computed. - *

k = round((m / n) * ln(2))
- * - *

The actual probability will be approximately equal to the - * desired probability but will be dependent upon the calculated number of bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param numberOfItems Number of items to be placed in the filter - * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability - * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final double probability) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - checkProbability(probability); - - // Number of bits (m) - final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); - if (m > Integer.MAX_VALUE) { - throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); - } - this.numberOfBits = (int) m; - - this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } /** * Constructs a filter configuration with the specified number of items ({@code n}) and @@ -184,51 +71,13 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOf * functions. An exception is raised if this is greater than or equal to 1 (i.e. the * shape is invalid for use as a Bloom filter). 
* - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param numberOfItems Number of items to be placed in the filter + * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. * @param numberOfBits The number of bits in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if the calculated number of hash function is {@code < 1}; - * or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); - } - - /** - * Constructs a filter configuration with the specified number of items, bits - * and hash functions. - * - *

The false-positive probability is computed using the number of items, bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param hashFunctionIdentity The identity of the hash function this shape uses - * @param numberOfItems Number of items to be placed in the filter - * @param numberOfBits The number of bits in the filter. - * @param numberOfHashFunctions The number of hash functions in the filter - * @throws NullPointerException if the hash function identity is null - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() + * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} */ - public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits, - final int numberOfHashFunctions) { - this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); - this.numberOfItems = checkNumberOfItems(numberOfItems); + public Shape(final int numberOfHashFunctions, final int numberOfBits) { this.numberOfBits = checkNumberOfBits(numberOfBits); this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); - // check that probability is within range - checkCalculatedProbability(getProbability()); - this.hashCode = generateHashCode(); } /** @@ -238,11 +87,10 @@ public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOf * @return the number of items * @throws IllegalArgumentException if the number of items is {@code < 1} */ - private static int checkNumberOfItems(final int numberOfItems) { + private static void checkNumberOfItems(final int numberOfItems) { if (numberOfItems < 1) { throw new IllegalArgumentException("Number of items must be greater than 0: " + 
numberOfItems); } - return numberOfItems; } /** @@ -273,91 +121,19 @@ private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { return numberOfHashFunctions; } - /** - * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. - * - * @param probability the probability - * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} - */ - private static void checkProbability(final double probability) { - // Using the negation of within the desired range will catch NaN - if (!(probability > 0.0 && probability < 1.0)) { - throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); - } - } - - /** - * Check the calculated probability is {@code < 1.0}. - * - *

This function is used to verify that the dynamically calculated probability for the - * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon - * construction. - * - * @param probability the probability - * @throws IllegalArgumentException if the probability is {@code >= 1.0} - */ - private static void checkCalculatedProbability(final double probability) { - // We do not need to check for p <= 0.0 since we only allow positive values for - // parameters and the closest we can come to exp(-kn/m) == 1 is - // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will - // always be 00 - if (probability >= 1.0) { - throw new IllegalArgumentException( - String.format("Calculated probability is greater than or equal to 1: " + probability)); - } - } - - /** - * Calculates the number of hash functions given numberOfItems and numberofBits. - * This is a method so that the calculation is consistent across all constructors. - * - * @param numberOfItems the number of items in the filter. - * @param numberOfBits the number of bits in the filter. - * @return the optimal number of hash functions. - * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} - */ - private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { - // k = round((m / n) * ln(2)) We change order so that we use real math rather - // than integer math. - final long k = Math.round(LN_2 * numberOfBits / numberOfItems); - if (k < 1) { - throw new IllegalArgumentException( - String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); - } - // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but - // since numberOfBits is at most Integer.MAX_VALUE the numerator of - // numberofHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the - // value of k can not be above Integer.MAX_VALUE. 
- return (int) k; - } - @Override public boolean equals(final Object o) { if (o instanceof Shape) { final Shape other = (Shape) o; return numberOfBits == other.numberOfBits && - numberOfHashFunctions == other.numberOfHashFunctions && - HashFunctionValidator.areEqual(hashFunctionIdentity, - other.hashFunctionIdentity); + numberOfHashFunctions == other.numberOfHashFunctions; } return false; } @Override public int hashCode() { - return hashCode; - } - - private int generateHashCode() { - return Objects.hash(numberOfBits, numberOfHashFunctions, HashFunctionValidator.hash(hashFunctionIdentity)); - } - - /** - * Gets the HashFunctionIdentity of the hash function this shape uses. - * @return the HashFunctionIdentity of the hash function this shape uses. - */ - public HashFunctionIdentity getHashFunctionIdentity() { - return hashFunctionIdentity; + return Objects.hash(numberOfBits, numberOfHashFunctions); } /** @@ -370,6 +146,7 @@ public int getNumberOfBits() { return numberOfBits; } + /** * Gets the number of hash functions used to construct the filter. * This is also known as {@code k}. @@ -380,15 +157,6 @@ public int getNumberOfHashFunctions() { return numberOfHashFunctions; } - /** - * Gets the number of items that are expected in the filter. - * This is also known as {@code n}. - * - * @return the number of items ({@code n}). - */ - public int getNumberOfItems() { - return numberOfItems; - } /** * Calculates the probability of false positives ({@code p}) given @@ -403,18 +171,26 @@ public int getNumberOfItems() { * Thus this returns the worst-case false positive probability for a filter that has not * exceeded its expected number of items. * + * @param numberOfItems the number of items hashed into the Bloom filter. * @return the probability of false positives. 
* @see #getNumberOfItems() */ - public double getProbability() { + public double getProbability(int numberOfItems) { + checkNumberOfItems( numberOfItems ); return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), numberOfHashFunctions); } @Override public String toString() { - return String.format("Shape[ %s n=%s m=%s k=%s ]", - HashFunctionIdentity.asCommonString(hashFunctionIdentity), - numberOfItems, numberOfBits, numberOfHashFunctions); + return String.format("Shape[ m=%s k=%s ]", + numberOfBits, numberOfHashFunctions); + } + + public double estimate_n( int hammingValue ) { + double c = hammingValue; + double m = numberOfBits; + double k = numberOfHashFunctions; + return -(m / k) * Math.log(1.0 - (c / m)); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java new file mode 100644 index 0000000000..2a9ea06842 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.PrimitiveIterator.OfInt; + + +/** + * A Hasher implementation that generates indices by repeatedly adding an increment to an + * initial value and reducing the result modulo the number of bits in the Shape. + * @since 4.5 + */ +public final class SimpleHasher implements Hasher { + + private final long initial; + private final long increment; + + + /** + * Constructs the SimpleHasher from 2 longs. The long values will be interpreted as unsigned values. + * @param initial The initial value for the hasher. + * @param increment The value to increment the hash by on each iteration. + */ + public SimpleHasher(long initial, long increment) { + this.initial = initial; + this.increment = increment; + } + + + /** + * Gets an iterator of integers that are the bits to enable in the Bloom + * filter based on the shape. The iterator returns one index per hash function in the + * shape; indices may repeat and are not returned in any sorted order. + * + * @param shape {@inheritDoc} + * @return {@inheritDoc} + * @throws IllegalArgumentException {@inheritDoc} + */ + @Override + public OfInt iterator(final Shape shape) { + return new Iterator(shape); + } + + @Override + public int size() { + return 1; + } + + + /** + * The iterator of integers. + * + *

Each call to {@code nextInt} returns the current value modulo the number of bits, then adds the increment. + */ + private class Iterator implements PrimitiveIterator.OfInt { + /** The number of hash functions per item. */ + private final int k; + /** The number of bits in the shape. */ + private final long m; + + /** The value from which the next index is derived. */ + private long next; + /** The count of hash functions for the current item. */ + private int functionCount; + + /** + * Constructs iterator with the specified shape. + * + * @param shape the shape that supplies the number of hash functions and the number of bits. + */ + private Iterator(final Shape shape) { + // Assumes that shape returns non-zero positive values for hash functions and bits + k = shape.getNumberOfHashFunctions(); + m = shape.getNumberOfBits(); + next = SimpleHasher.this.initial; + functionCount = 0; + } + + @Override + public boolean hasNext() { + return functionCount < k; + } + + @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. + @Override + public int nextInt() { + if (hasNext()) { + int result = (int) Long.remainderUnsigned( next, m ); + functionCount++; + next += SimpleHasher.this.increment; + return result; + } + throw new NoSuchElementException(); + } + } + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java deleted file mode 100644 index 430f99b565..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasher.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.PrimitiveIterator.OfInt; -import java.util.Set; -import java.util.TreeSet; - -/** - * A Hasher implementation that contains the index for all enabled bits for a specific - * Shape. - * @since 4.5 - */ -public final class StaticHasher implements Hasher { - - /** - * The shape of this hasher - */ - private final Shape shape; - - /** - * The ordered set of values that this hasher will return. - */ - private final int[] values; - - /** - * Constructs the StaticHasher from a Hasher and a Shape. - * @param hasher the Hasher to read. - * @param shape the Shape for the resulting values. - * @throws IllegalArgumentException if the hasher function and the shape function are not the same. - */ - public StaticHasher(final Hasher hasher, final Shape shape) { - this(hasher.iterator(shape), shape); - HashFunctionValidator.checkAreEqual(hasher.getHashFunctionIdentity(), - shape.getHashFunctionIdentity()); - } - - /** - * Constructs a StaticHasher from an Iterator of Integers and a Shape. - * @param iter the Iterator of Integers. - * @param shape the Shape that the integers were generated for. 
- * @throws IllegalArgumentException if any Integer is outside the range [0,shape.getNumberOfBits()) - */ - public StaticHasher(final Iterator iter, final Shape shape) { - this.shape = shape; - final Set workingValues = new TreeSet<>(); - iter.forEachRemaining(idx -> { - if (idx >= this.shape.getNumberOfBits()) { - throw new IllegalArgumentException(String.format("Bit index (%s) is too big for %s", idx, shape)); - } - if (idx < 0) { - throw new IllegalArgumentException(String.format("Bit index (%s) may not be less than zero", idx)); - } - workingValues.add(idx); - }); - this.values = new int[workingValues.size()]; - int i = 0; - for (final Integer value : workingValues) { - values[i++] = value.intValue(); - } - } - - /** - * Constructs the StaticHasher from a StaticHasher and a Shape. - * @param hasher the StaticHasher to read. - * @param shape the Shape for the resulting values. - * @throws IllegalArgumentException if the shape of the hasher and the shape parameter are not the same. - */ - public StaticHasher(final StaticHasher hasher, final Shape shape) { - if (!hasher.shape.equals(shape)) { - throw new IllegalArgumentException(String.format("Hasher shape (%s) is not the same as shape (%s)", - hasher.getShape().toString(), shape.toString())); - } - this.shape = shape; - this.values = hasher.values; - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - - /** - * Gets the shape this static hasher was created with. - * - * @return the Shape of this hasher. - */ - public Shape getShape() { - return shape; - } - - /** - * Tests emptiness (size == 0). - * - * @return Whether or not this is empty. - */ - public boolean isEmpty() { - return size() == 0; - } - - /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. The iterator will not return the same value multiple - * times. Values will be returned in ascending order. 
- * - * @param shape {@inheritDoc} - * @return {@inheritDoc} - * @throws IllegalArgumentException {@inheritDoc} - */ - @Override - public OfInt iterator(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException( - String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); - } - return Arrays.stream(values).iterator(); - } - - /** - * Gets the the number of unique values in this hasher. - * @return the number of unique values. - */ - public int size() { - return values.length; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java deleted file mode 100644 index 8e07793b7f..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5Cyclic.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import java.nio.ByteBuffer; - -import java.nio.LongBuffer; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * performs MD5 hashing using a signed cyclic method. - * @since 4.5 - */ -public final class MD5Cyclic implements HashFunction { - - /** - * The name of this hash function. - */ - public static final String NAME = "MD5"; - - /** - * The MD5 digest implementation. - */ - private final MessageDigest messageDigest; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * The result from the digest 0 - */ - private final long[] result = new long[2]; - - /** - * Constructs the MD5 hashing function. - */ - public MD5Cyclic() { - try { - messageDigest = MessageDigest.getInstance(NAME); - } catch (final NoSuchAlgorithmException e) { - // This should not happen - throw new IllegalStateException("Missing the standard MD5 message digest algorithm", e); - } - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - - if (seed == 0) { - final byte[] hash; - synchronized (messageDigest) { - messageDigest.update(buffer); - hash = messageDigest.digest(); - messageDigest.reset(); - } - - final LongBuffer lb = ByteBuffer.wrap(hash).asLongBuffer(); - result[0] = lb.get(0); - result[1] = lb.get(1); - } else { - result[0] += result[1]; - } - return result[0]; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java deleted file mode 100644 index 99c27c8819..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64Cyclic.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.codec.digest.MurmurHash3; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * uses an underlying Murmur3 128-bit hash with a signed cyclic method. - * - *

Requires the optional Apache Commons Codec - * library which contains a Java port of the 128-bit hash function - * {@code MurmurHash3_x64_128} from Austin Applyby's original {@code c++} - * code in SMHasher.

- * - * @see SMHasher - * @since 4.5 - */ -public final class Murmur128x64Cyclic implements HashFunction { - - /** - * The name of this hash method. - */ - public static final String NAME = "Murmur3_x64_128"; - - /** - * The result of the hash 0 call. - */ - private long[] parts; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * Constructs a Murmur3 x64 128 hash. - */ - public Murmur128x64Cyclic() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - if (parts == null || seed == 0) { - parts = MurmurHash3.hash128x64(buffer, 0, buffer.length, 0); - } else { - parts[0] += parts[1]; - } - return parts[0]; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java deleted file mode 100644 index 982ef5c869..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86Iterative.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.codec.digest.MurmurHash3; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * uses an underlying Murmur3 32-bit hash with a signed iterative method. - * - *

Requires the optional Apache Commons Codec - * library which contains a Java port of the 32-bit hash function - * {@code MurmurHash3_x86_32} from Austin Applyby's original {@code c++} - * code in SMHasher.

- * - * @see Apache Commons Codec - * @see SMHasher - * @since 4.5 - */ -public final class Murmur32x86Iterative implements HashFunction { - - /** - * The name of this hash function. - */ - public static final String NAME = "Murmur3_x86_32"; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * Constructs a Murmur3 x86 32 hash - */ - public Murmur32x86Iterative() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - return MurmurHash3.hash32x86(buffer, 0, buffer.length, seed); - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.ITERATIVE; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java deleted file mode 100644 index da0fc2c2db..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterative.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; - -/** - * An implementation of HashFunction that - * performs {@code Objects.hash} hashing using a signed iterative method. - *

- * Except in the case of seed 0, the value of the previous hash is - * used as a seed for the next hash. Hashes are seeded by calling - * {@code Arrays.deepHashCode( new Object[]{seed, buffer} )}. - *

- * @since 4.5 - */ -public final class ObjectsHashIterative implements HashFunction { - - /** - * The name of the hash function. - */ - public static final String NAME = "Objects32"; - - /** - * The signature for this hash function. - * - *

TODO: Make static akin to a serialVersionUID? - */ - private final long signature; - - /** - * The value of the last hash. - */ - private long last; - - /** - * Constructs a hash that uses the Objects.hash method to has values. - */ - public ObjectsHashIterative() { - signature = Signatures.getSignature(this); - } - - @Override - public long apply(final byte[] buffer, final int seed) { - if (seed == 0) { - last = 0; - } - // Effectively: - // result = Arrays.deepHashCode(new Object[] { last, buffer }); - // The method loops over items starting with result=1 - // for i in items: - // result = 31 * result + hashCode(i) - // Here we unroll the computation to 2 iterations. - // The computation is done using 32-bit integers then cast to a long - final long result = 31 * (31 + Long.hashCode(last)) + Arrays.hashCode(buffer); - last += result; - return result; - } - - @Override - public String getName() { - return NAME; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.ITERATIVE; - } - - @Override - public String getProvider() { - return "Apache Commons Collections"; - } - - @Override - public long getSignature() { - return signature; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java deleted file mode 100644 index b7f35ac051..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/Signatures.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; - -/** - * Allow computation of HashFunction signatures. - * @since 4.5 - */ -final class Signatures { - - /** No instances. */ - private Signatures() {} - - /** - * Gets the standard signature for the hash function. The signature is prepared as: - *


-     * int seed = 0;
-     * return hashFunction.apply(HashFunctionIdentity.prepareSignatureBuffer(hashFunction), seed);
-     * 
- * - * @param hashFunction the hash function - * @return the signature - * @see HashFunctionIdentity#prepareSignatureBuffer(HashFunctionIdentity) - * @see HashFunction#apply(byte[], int) - */ - static long getSignature(final HashFunction hashFunction) { - return hashFunction.apply(HashFunctionIdentity.prepareSignatureBuffer(hashFunction), 0); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 2a1faa18ea..d470129c78 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -16,6 +16,8 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -29,10 +31,10 @@ import java.util.Arrays; import java.util.BitSet; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; /** @@ -40,122 +42,62 @@ */ public abstract class AbstractBloomFilterTest { - /** - * An implementation of BloomFilter that is used to test merge and cardinality - * operations with a filter type that does not match the type of the filter - * being tested. - */ - private static class TestBloomFilter extends AbstractBloomFilter { - /** The bits. 
*/ - final BitSet bits; - - protected TestBloomFilter(final Shape shape, final BitSet bits) { - super(shape); - this.bits = bits; - } - - @Override - public long[] getBits() { - return bits.toLongArray(); - } - - @Override - public StaticHasher getHasher() { - return new StaticHasher(bits.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean merge(final Hasher hasher) { - throw new UnsupportedOperationException(); - } - } + private final SimpleHasher from1 = new SimpleHasher( 1, 1 ); + private final SimpleHasher from11 = new SimpleHasher( 11, 1 ); + private final HasherCollection bigHasher = new HasherCollection( from1, from11 ); + private final HasherCollection fullHasher = new HasherCollection( + new SimpleHasher(0,1)/*0-16*/, + new SimpleHasher(17,1)/*17-33*/, + new SimpleHasher(33,1)/*33-49*/, + new SimpleHasher(50,1)/*50-66*/, + new SimpleHasher(67,1)/*67-83*/ + ); /** - * A HashFunctionIdentity for testing. + * The shape of the Bloom filters for testing */ - protected HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; + protected Shape shape = new Shape(17, 72); /** - * A second HashFunctionIdentity for testing. + * Create an empty version of the BloomFilter implementation we are testing. + * + * @param shape the shape of the filter. + * @return a BloomFilter implementation. 
*/ - protected HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test FunctionX"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 1; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; + protected abstract BloomFilter createEmptyFilter(Shape shape); /** - * The shape of the Bloom filters for testing + * Create the BloomFilter implementation we are testing. + * + * @param hasher the hasher to use to create the filter. + * @param shape the shape of the filter. + * @return a BloomFilter implementation. */ - protected Shape shape = new Shape(testFunction, 3, 72, 17); + protected abstract BloomFilter createFilter(Shape shape, Hasher hasher); + /** * Tests that the andCardinality calculations are correct. + * + * @param filterFactory the factory function to create the filter */ @Test - public final void andCardinalityTest() { - andCardinalityTest(this::createFilter); + public void containsTest() { + final BloomFilter bf = createFilter( shape, from1 ); + final BloomFilter bf2 = createFilter( shape, bigHasher ); + + assertTrue( "BF Should contain itself", bf.contains(bf)); + assertTrue( "BF2 Should contain itself", bf2.contains(bf2)); + assertFalse( "BF should not contain BF2",bf.contains(bf2)); + assertTrue( "BF2 should contain BF", bf2.contains(bf)); } - /** - * Tests that the andCardinality calculations are correct with a generic BloomFilter. 
- */ @Test - public final void andCardinalityTest_GenericBloomFilter() { - andCardinalityTest(this::createGenericFilter); + public void containsTest_Hasher() { + final BloomFilter bf = createFilter( shape, bigHasher ); + + assertTrue( "BF Should contain this hasher", bf.contains( new SimpleHasher( 1, 1 ))); + assertFalse( "BF Should not contain this hasher", bf.contains( new SimpleHasher( 1, 3 ))); } /** @@ -163,69 +105,51 @@ public final void andCardinalityTest_GenericBloomFilter() { * * @param filterFactory the factory function to create the filter */ - private void andCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + @Test + public void estimateIntersectionTest() { - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + final BloomFilter bf = createFilter( shape, from1 ); + final BloomFilter bf2 = createFilter( shape, bigHasher ); - assertEquals(7, bf.andCardinality(bf2)); + assertEquals(1.0, bf.estimateIntersection(bf2), 0.5); + assertEquals(1.0, bf2.estimateIntersection(bf), 0.5); } - /** - * Tests that the andCardinality calculations are correct when there are more than Long.LENGTH bits. 
- */ @Test - public final void andCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public void estimateIntersectionTest_empty() { + final BloomFilter bf = createFilter( shape, from1 ); + final BloomFilter bf2 = createEmptyFilter( shape); - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - - final BloomFilter bf2 = createFilter(hasher2, shape); - - assertEquals(7, bf.andCardinality(bf2)); - assertEquals(7, bf2.andCardinality(bf)); + assertEquals(0.0, bf.estimateIntersection(bf2), 0.00); + assertEquals(0.0, bf2.estimateIntersection(bf), 0.00); } /** - * Compare 2 static hashers to verify they have the same bits enabled. + * Tests that the andCardinality calculations are correct. * - * @param hasher1 the first static hasher. - * @param hasher2 the second static hasher. + * @param filterFactory the factory function to create the filter */ - private void assertSameBits(final StaticHasher hasher1, final StaticHasher hasher2) { - final OfInt iter1 = hasher1.iterator(shape); - final OfInt iter2 = hasher2.iterator(shape); - - while (iter1.hasNext()) { - assertTrue(iter2.hasNext(), "Not enough data in second hasher"); - assertEquals(iter1.nextInt(), iter2.nextInt()); - } - assertFalse(iter2.hasNext(), "Too much data in second hasher"); + @Test + public void estimateUnionTest() { + final BloomFilter bf = createFilter( shape, from1 ); + + final BloomFilter bf2 = createFilter( shape, from11 ); + + assertEquals(2.0, bf.estimateUnion(bf2), 0.5); + assertEquals(2.0, bf2.estimateUnion(bf), 0.5); } - /** - * Tests that cardinality is correct. 
- */ @Test - public final void cardinalityTest() { - + public void estimateUnionTest_empty() { final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final BloomFilter bf = createFilter( shape, from1 ); + final BloomFilter bf2 = createEmptyFilter( shape); - final BloomFilter bf = createFilter(hasher, shape); - assertEquals(17, bf.cardinality()); + assertEquals(1.0, bf.estimateUnion(bf2), 0.15); + assertEquals(1.0, bf2.estimateUnion(bf), 0.15); } + /** * Tests that creating an empty hasher works as expected. */ @@ -242,171 +166,29 @@ public final void constructorTest_Empty() { */ @Test public final void constructorTest_Hasher() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + Hasher hasher = new SimpleHasher(0,1); - final BloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf = createFilter(shape, hasher); final long[] lb = bf.getBits(); assertEquals(0x1FFFF, lb[0]); assertEquals(1, lb.length); } - /** - * Tests that creating a Bloom filter with a Static hasher that has one shape and a - * different specified shape fails. - */ - @Test - public final void constructorTest_WrongShape() { - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), anotherShape); - try { - createFilter(hasher, shape); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that contains() with a Bloom filter argument returns the proper results. 
- */ - @Test - public final void containsTest_BloomFilter() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - final BloomFilter bf2 = createFilter(hasher2, shape); - assertTrue(bf.contains(bf2)); - assertFalse(bf2.contains(bf)); - } - - /** - * Tests that contains() fails properly if the other Bloom filter is not of the proper shape. - */ - @Test - public final void containsTest_BloomFilter_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final Hasher hasher2 = new StaticHasher(lst.iterator(), anotherShape); - final BloomFilter bf2 = createFilter(hasher2, anotherShape); - try { - bf.contains(bf2); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that contains() with a Hasher argument returns the proper results. 
- */ - @Test - public final void containsTest_Hasher() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - assertTrue(bf.contains(hasher2)); - - lst2 = Arrays.asList(17, 18, 19, 20); - hasher2 = new StaticHasher(lst2.iterator(), shape); - assertFalse(bf.contains(hasher2)); - - lst2 = Arrays.asList(10, 11, 12, 17, 18, 19, 20); - hasher2 = new StaticHasher(lst2.iterator(), shape); - assertFalse(bf.contains(hasher2)); - } - - /** - * Tests that contains() fails properly if the hasher is not of the proper shape. - */ - @Test - public final void containsTest_Hasher_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - - final List lst2 = Arrays.asList(4, 5, 6, 7, 8, 9, 10); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - try { - bf.contains(hasher2); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Create an empty version of the BloomFilter implementation we are testing. - * - * @param shape the shape of the filter. - * @return a BloomFilter implementation. - */ - protected abstract AbstractBloomFilter createEmptyFilter(Shape shape); - - /** - * Create the BloomFilter implementation we are testing. - * - * @param hasher the hasher to use to create the filter. - * @param shape the shape of the filter. - * @return a BloomFilter implementation. 
- */ - protected abstract AbstractBloomFilter createFilter(Hasher hasher, Shape shape); - - /** - * Create a generic BloomFilter implementation. - * - * @param hasher the hasher to use to create the filter. - * @param shape the shape of the filter. - * @return a BloomFilter implementation. - */ - private AbstractBloomFilter createGenericFilter(final Hasher hasher, final Shape shape) { - final BitSet bits = new BitSet(); - hasher.iterator(shape).forEachRemaining((IntConsumer) bits::set); - return new TestBloomFilter(shape, bits); - } /** * Tests that getBits() works correctly when multiple long values are returned. */ @Test public final void getBitsTest_SpanLong() { - final List lst = Arrays.asList(63, 64); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); + + final SimpleHasher hasher = new SimpleHasher(63,1); + final BloomFilter bf = createFilter(new Shape(2, 72), hasher ); final long[] lb = bf.getBits(); assertEquals(2, lb.length); assertEquals(0x8000000000000000L, lb[0]); assertEquals(0x1, lb[1]); } - /** - * Tests that the the hasher returned from getHasher() works correctly. - */ - @Test - public final void getHasherTest() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); - - final StaticHasher hasher2 = bf.getHasher(); - - assertEquals(shape, hasher2.getShape()); - assertSameBits(hasher, hasher2); - } - /** * Tests that isFull() returns the proper values. 
*/ @@ -414,228 +196,80 @@ public final void getHasherTest() { public final void isFullTest() { // create empty filter - AbstractBloomFilter filter = createEmptyFilter(shape); - assertFalse(filter.isFull()); - - final List values = new ArrayList<>(shape.getNumberOfBits()); - for (int i = 0; i < shape.getNumberOfBits(); i++) { - values.add(i); - } - - StaticHasher hasher2 = new StaticHasher(values.iterator(), shape); - filter = createFilter(hasher2, shape); + BloomFilter filter = createEmptyFilter(shape); + assertFalse("Should not be full", filter.isFull(shape)); - assertTrue(filter.isFull()); - - final int mid = shape.getNumberOfBits() / 2; - values.remove(Integer.valueOf(mid)); - hasher2 = new StaticHasher(values.iterator(), shape); - filter = createFilter(hasher2, shape); - assertFalse(filter.isFull()); - } + filter = createFilter( shape, fullHasher ); + assertTrue("Should be full", filter.isFull(shape)); - /** - * Tests that merging bloom filters works as expected. - */ - @Test - public final void mergeTest_BloomFilter() { - mergeTest_BloomFilter(this::createFilter); + filter = createFilter( shape, new SimpleHasher( 1, 3 )); + assertFalse("Should not be full", filter.isFull(shape)); } /** * Tests that merging bloom filters works as expected with a generic BloomFilter. */ @Test - public final void mergeTest_GenericBloomFilter() { - mergeTest_BloomFilter(this::createGenericFilter); - } - - /** - * Tests that merging bloom filters works as expected. 
- * - * @param filterFactory the factory function to create the filter - */ - private void mergeTest_BloomFilter(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + public final void mergeTest_Bloomfilter() { - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf1 = createFilter( shape, from1); - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + final BloomFilter bf2 = createFilter( shape, from11); - assertTrue(bf.merge(bf2), "Merge should not fail"); - assertEquals(27, bf.cardinality()); - } - - /** - * Tests that merging bloom filters with different shapes fails properly - */ - @Test - public final void mergeTest_BloomFilter_WrongShape() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final BloomFilter bf3 = bf1.merge(bf2); + assertTrue( "Should contain", bf3.contains( bf1 )); + assertTrue( "Should contain", bf3.contains( bf2 )); - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - final BloomFilter bf2 = createFilter(hasher2, anotherShape); - - try { - bf.merge(bf2); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. 
- } + final BloomFilter bf4 = bf2.merge(bf1); + assertTrue( "Should contain", bf4.contains( bf1 )); + assertTrue( "Should contain", bf4.contains( bf2 )); + assertTrue( "Should contain", bf4.contains( bf3 )); + assertTrue( "Should contain", bf3.contains( bf4 )); } - /** - * Tests that merging a hasher into a Bloom filter works as expected - */ @Test public final void mergeTest_Hasher() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final BloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf1 = createFilter( shape, from1); + final BloomFilter bf2 = createFilter( shape, from11); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - - assertTrue(bf.merge(hasher2), "Merge should not fail"); - assertEquals(27, bf.cardinality()); + final BloomFilter bf3 = bf1.merge( from11 ); + assertTrue( "Should contain", bf3.contains( bf1 )); + assertTrue( "Should contain", bf3.contains( bf2 )); } /** - * Tests that merging a static hasher with the wrong shape into a Bloom filter fails as expected - */ - @Test - public final void mergeTest_Hasher_WrongShape() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final Shape anotherShape = new Shape(testFunctionX, 3, 72, 17); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), anotherShape); - - try { - bf.merge(hasher2); - fail("Should throw IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that the orCardinality calculations are correct. 
- */ - @Test - public final void orCardinalityTest() { - orCardinalityTest(this::createFilter); - } - - /** - * Tests that the orCardinality calculations are correct with a generic BloomFilter. + * Tests that merging bloom filters in place works as expected. */ @Test - public final void orCardinalityTest_GenericBloomFilter() { - orCardinalityTest(this::createGenericFilter); - } - - /** - * Tests that the andCardinality calculations are correct. - * - * @param filterFactory the factory function to create the filter - */ - private void orCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + public final void mergeInPlaceTest_Bloomfilter() { - final AbstractBloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf1 = createFilter( shape, from1); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf2 = createFilter( shape, from11); - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + final BloomFilter bf3 = bf1.merge(bf2); - assertEquals(27, bf.orCardinality(bf2)); - } + bf1.mergeInPlace( bf2 ); - /** - * Tests that the orCardinality calculations are correct when there are more than Long.LENGTH bits. 
- */ - @Test - public final void orCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final AbstractBloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + assertTrue( "Should contain", bf1.contains( bf2 )); + assertTrue( "Should contain", bf1.contains( bf3 )); - final AbstractBloomFilter bf2 = createFilter(hasher2, shape); - - assertEquals(27, bf.orCardinality(bf2)); - assertEquals(27, bf2.orCardinality(bf)); } - /** - * Tests that the xorCardinality calculations are correct. - */ @Test - public final void xorCardinalityTest() { - xorCardinalityTest(this::createFilter); - } + public final void mergeInPlaceTest_Hasher() { - /** - * Tests that the xorCardinality calculations are correct with a generic BloomFilter. - */ - @Test - public final void xorCardinalityTest_GenericBloomFilter() { - xorCardinalityTest(this::createGenericFilter); - } + final BloomFilter bf1 = createFilter( shape, from1); - /** - * Tests that the andCardinality calculations are correct. 
- * - * @param filterFactory the factory function to create the filter - */ - private void xorCardinalityTest(final BiFunction filterFactory) { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); + final BloomFilter bf2 = createFilter( shape, from11); - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf3 = bf1.merge(bf2); - final BloomFilter bf2 = filterFactory.apply(hasher2, shape); + bf1.mergeInPlace( from11 ); - assertEquals(20, bf.xorCardinality(bf2)); + assertTrue( "Should contain Bf2", bf1.contains( bf2 )); + assertTrue( "Should contain Bf3", bf1.contains( bf3 )); } - /** - * Tests that the xorCardinality calculations are correct when there are more than Long.LENGTH bits. - */ - @Test - public final void xorCardinalityTest_ExtraLongs() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); - - final BloomFilter bf = createFilter(hasher, shape); - - final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); - final BloomFilter bf2 = createFilter(hasher2, shape); - - assertEquals(20, bf.xorCardinality(bf2)); - assertEquals(20, bf2.xorCardinality(bf)); - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index a661f93fde..fb0ee55b6b 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -41,7 +41,7 @@ public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { * Function to convert int arrays to BloomFilters for testing. */ private final Function converter = counts -> { - final BloomFilter testingFilter = new BitSetBloomFilter(shape); + final BloomFilter testingFilter = new SimpleBloomFilter(shape); testingFilter.merge(new FixedIndexesTestHasher(shape, counts)); return testingFilter; }; @@ -122,10 +122,10 @@ public void contains_BloomFilter() { // Some indexes with duplicates final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 5); final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - BitSetBloomFilter testingFilter = new BitSetBloomFilter(shape); + SimpleBloomFilter testingFilter = new SimpleBloomFilter(shape); testingFilter.merge( new FixedIndexesTestHasher(shape, 3, 4)); assertFalse(bf.contains(testingFilter)); - testingFilter = new BitSetBloomFilter(shape); + testingFilter = new SimpleBloomFilter(shape); testingFilter.merge( new FixedIndexesTestHasher(shape, 2, 5)); assertTrue(bf.contains(testingFilter)); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java new file mode 100644 index 0000000000..7f9e509f5f --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java @@ -0,0 +1,90 @@ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.junit.Test; + +public class BitMaptTest { + + @Test + public void checkPositiveTest() { + BloomFilter.BitMap.checkPositive(0); + BloomFilter.BitMap.checkPositive(0); + try { + 
BloomFilter.BitMap.checkPositive(-1); + + } catch (IndexOutOfBoundsException expected) { + // do nothing + } + } + + @Test + public void containsTest() { + long[] ary = new long[1]; + + assertFalse( BloomFilter.BitMap.contains(ary, 0) ); + ary[0] = 0x01; + assertTrue( BloomFilter.BitMap.contains(ary, 0) ); + + assertFalse( BloomFilter.BitMap.contains(ary, 63) ); + ary[0] = (1L << 63); + assertTrue( BloomFilter.BitMap.contains(ary, 63) ); + + ary = new long[2]; + assertFalse( BloomFilter.BitMap.contains(ary, 64) ); + ary[1] = 1; + assertTrue( BloomFilter.BitMap.contains(ary, 64) ); + + } + + @Test + public void getLongBitTest() { + assertEquals( 1, BloomFilter.BitMap.getLongBit(0) ); + assertEquals( 0x8000000000000000L, BloomFilter.BitMap.getLongBit( 63 ) ); + assertEquals( 1, BloomFilter.BitMap.getLongBit( 64) ); + assertEquals( 0x8000000000000000L, BloomFilter.BitMap.getLongBit( 127 ) ); + assertEquals( 1, BloomFilter.BitMap.getLongBit( 128 ) ); + } + + @Test + public void getLongIndexTest() { + assertEquals( 0, BloomFilter.BitMap.getLongIndex(0) ); + assertEquals( 0, BloomFilter.BitMap.getLongIndex( 63 ) ); + assertEquals( 1, BloomFilter.BitMap.getLongIndex( 64) ); + assertEquals( 1, BloomFilter.BitMap.getLongIndex( 127 ) ); + assertEquals( 2, BloomFilter.BitMap.getLongIndex( 128 ) ); + } + + + @Test + public void isSparseTest() { + Shape shape = new Shape( 17, 64 ); + assertTrue( BloomFilter.BitMap.isSparse(0, shape) ); + assertTrue( BloomFilter.BitMap.isSparse(1, shape) ); + assertTrue( BloomFilter.BitMap.isSparse(2, shape) ); + assertFalse( BloomFilter.BitMap.isSparse(3, shape) ); + + shape = new Shape( 17, 64*3 ); + + for (int i=0;i<7; i++) { + assertTrue( BloomFilter.BitMap.isSparse(i, shape) ); + } + assertFalse( BloomFilter.BitMap.isSparse(7, shape) ); + } + + @Test + public void numberOfBucketsTest() { + for (int i = 0;i<64;i++) { + assertEquals( 1, BloomFilter.BitMap.numberOfBuckets(i)); + } + for (int i = 64;i<128;i++) { + assertEquals( 2, 
BloomFilter.BitMap.numberOfBuckets(i)); + } + assertEquals( 3, BloomFilter.BitMap.numberOfBuckets(128)); + + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java index 0d6443355c..b1d0525721 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java @@ -21,7 +21,7 @@ import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; /** * Test all the default implementations of the BloomFilter in {@link AbstractBloomFilter}. @@ -67,8 +67,8 @@ public long[] getBits() { } @Override - public StaticHasher getHasher() { - return new StaticHasher(bitSet.stream().iterator(), getShape()); + public SimpleHasher getHasher() { + return new SimpleHasher(bitSet.stream().iterator(), getShape()); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java deleted file mode 100644 index a10df81643..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.DynamicHasher; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; -import org.junit.jupiter.api.Test; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.PrimitiveIterator.OfInt; - -/** - * Tests the {@link HasherBloomFilter}. - */ -public class HasherBloomFilterTest extends AbstractBloomFilterTest { - - /** - * Tests that the constructor works correctly. 
- */ - @Test - public void constructorTest_NonStatic() { - final Shape shape = new Shape(new MD5Cyclic(), 3, 72, 17); - final DynamicHasher hasher = new DynamicHasher.Builder(new MD5Cyclic()).with("Hello", StandardCharsets.UTF_8).build(); - final HasherBloomFilter filter = createFilter(hasher, shape); - final long[] lb = filter.getBits(); - assertEquals(2, lb.length); - assertEquals(0x6203101001888c44L, lb[0]); - assertEquals(0x60L, lb[1]); - } - - @Override - protected AbstractBloomFilter createEmptyFilter(final Shape shape) { - return new HasherBloomFilter(shape); - } - - @Override - protected HasherBloomFilter createFilter(final Hasher hasher, final Shape shape) { - return new HasherBloomFilter(hasher, shape); - } - - /** - * Test the edge case where the filter is empty and the getBits() function returns a - * zero length array. - */ - @Test - public void getBitsTest_Empty() { - final BloomFilter filter = createEmptyFilter(shape); - assertArrayEquals(new long[0], filter.getBits()); - } - - /** - * Test the edge case where the filter has only 1 bit in the lowest index and the getBits() - * function returns an array of length 1. - */ - @Test - public void getBitsTest_LowestBitOnly() { - final BloomFilter filter = createEmptyFilter(shape); - // Set the lowest bit index only. 
- filter.merge(new Hasher() { - @Override - public OfInt iterator(final Shape shape) { - return Arrays.stream(new int[] {0}).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } - }); - assertArrayEquals(new long[] {1L}, filter.getBits()); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 541428989f..967de9fcb6 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -24,7 +24,7 @@ import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; /** @@ -65,12 +65,12 @@ public Signedness getSignedness() { @Test public void testDifferentShapesThrows() { final List lst = Arrays.asList(1, 2); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); final Shape shape2 = new Shape(testFunction, 3, 72, 18); final List lst2 = Arrays.asList(2, 3); - final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape2); + final Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape2); final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape2); try { @@ -87,29 +87,29 @@ public void testDifferentShapesThrows() { @Test public final void cosineDistanceTest() { List lst = Arrays.asList(1, 2); - Hasher hasher = new StaticHasher(lst.iterator(), shape); + Hasher hasher = new 
SimpleHasher(lst.iterator(), shape); BloomFilter filter1 = new HasherBloomFilter(hasher, shape); List lst2 = Arrays.asList(2, 3); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher = new StaticHasher(lst.iterator(), shape); + hasher = new SimpleHasher(lst.iterator(), shape); filter1 = new HasherBloomFilter(hasher, shape); lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher2 = new StaticHasher(lst2.iterator(), shape); + hasher2 = new SimpleHasher(lst2.iterator(), shape); filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); + hasher2 = new SimpleHasher(lst2.iterator(), shape); filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.514928749927334, SetOperations.cosineDistance(filter1, filter2), 0.000000000000001); @@ -126,7 +126,7 @@ public final void cosineDistanceTest_NoValues() { final BloomFilter filter2 = new HasherBloomFilter(shape); // build a filter final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); assertEquals(1.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); @@ -141,18 +141,18 @@ public final void 
cosineDistanceTest_NoValues() { @Test public final void cosineSimilarityTest() { final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(1.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); + hasher2 = new SimpleHasher(lst2.iterator(), shape); filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter1, filter2), 0.000000000000001); @@ -169,7 +169,7 @@ public final void cosineSimilarityTest_NoValues() { final BloomFilter filter2 = new HasherBloomFilter(shape); // build a filter final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); @@ -185,12 +185,12 @@ public final void cosineSimilarityTest_NoValues() { public final void estimateIntersectionSizeTest() { // build a filter List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), 
shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); lst = Arrays.asList(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); + final Hasher hasher2 = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); final long estimate = SetOperations.estimateIntersectionSize(filter1, filter2); @@ -204,20 +204,20 @@ public final void estimateIntersectionSizeTest() { public final void estimateSizeTest() { // build a filter List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher = new StaticHasher(lst.iterator(), shape); + Hasher hasher = new SimpleHasher(lst.iterator(), shape); BloomFilter filter1 = new HasherBloomFilter(hasher, shape); assertEquals(1, SetOperations.estimateSize(filter1)); // the data provided above do not generate an estimate that is equivalent to the // actual. 
lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); - hasher = new StaticHasher(lst.iterator(), shape); + hasher = new SimpleHasher(lst.iterator(), shape); filter1 = new HasherBloomFilter(hasher, shape); assertEquals(1, SetOperations.estimateSize(filter1)); lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); + final Hasher hasher2 = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(3, SetOperations.estimateSize(filter2)); @@ -230,12 +230,12 @@ public final void estimateSizeTest() { public final void estimateUnionSizeTest() { // build a filter List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); lst = Arrays.asList(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40); - final Hasher hasher2 = new StaticHasher(lst.iterator(), shape); + final Hasher hasher2 = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); final long estimate = SetOperations.estimateUnionSize(filter1, filter2); @@ -248,18 +248,18 @@ public final void estimateUnionSizeTest() { @Test public final void hammingDistanceTest() { final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new 
StaticHasher(lst2.iterator(), shape); + Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0, SetOperations.hammingDistance(filter1, filter2)); assertEquals(0, SetOperations.hammingDistance(filter2, filter1)); lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); + hasher2 = new SimpleHasher(lst2.iterator(), shape); filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(17, SetOperations.hammingDistance(filter1, filter2)); @@ -272,18 +272,18 @@ public final void hammingDistanceTest() { @Test public final void jaccardDistanceTest() { final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); + hasher2 = new SimpleHasher(lst2.iterator(), shape); filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.32, SetOperations.jaccardDistance(filter1, filter2), 0.001); @@ -300,7 +300,7 @@ public final void jaccardDistanceTest_NoValues() { final BloomFilter filter2 = new HasherBloomFilter(shape); // build a filter final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher 
hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); @@ -315,18 +315,18 @@ public final void jaccardDistanceTest_NoValues() { @Test public final void jaccardSimilarityTest() { final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new StaticHasher(lst2.iterator(), shape); + hasher2 = new SimpleHasher(lst2.iterator(), shape); filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.68, SetOperations.jaccardSimilarity(filter1, filter2), 0.001); @@ -343,7 +343,7 @@ public final void jaccardSimilarityTest_NoValues() { final BloomFilter filter2 = new HasherBloomFilter(shape); // build a filter final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final Hasher hasher = new SimpleHasher(lst.iterator(), shape); final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java similarity index 70% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java index 9a2078d80c..ea87e8c599 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitSetBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -20,18 +20,16 @@ import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** - * Tests for the {@link BitSetBloomFilter}. + * Tests for the {@link SimpleBloomFilter}. */ -public class BitSetBloomFilterTest extends AbstractBloomFilterTest { +public class SimpleBloomFilterTest extends AbstractBloomFilterTest { @Override - protected BitSetBloomFilter createEmptyFilter(final Shape shape) { - return new BitSetBloomFilter(shape); + protected SimpleBloomFilter createEmptyFilter(final Shape shape) { + return new SimpleBloomFilter(shape); } @Override - protected BitSetBloomFilter createFilter(final Hasher hasher, final Shape shape) { - final BitSetBloomFilter testFilter = new BitSetBloomFilter(shape); - testFilter.merge( hasher ); - return testFilter; + protected SimpleBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SimpleBloomFilter(shape, hasher); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java similarity index 57% rename from src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java index 95951ad7fe..dca19dda6a 100644 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/function/package-info.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -14,11 +14,22 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.commons.collections4.bloomfilter; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** - * Provides implementations of the Bloom filter - * {@link org.apache.commons.collections4.bloomfilter.hasher.HashFunction HashFunction} interface. - * - * @since 4.5 + * Tests for the {@link SimpleBloomFilter}. */ -package org.apache.commons.collections4.bloomfilter.hasher.function; +public class SparseBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected SparseBloomFilter createEmptyFilter(final Shape shape) { + return new SparseBloomFilter(shape); + } + + @Override + protected SparseBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new SparseBloomFilter(shape, hasher); + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java index afbd6d8b0f..58a0148a91 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java @@ -30,11 +30,11 @@ import org.junit.jupiter.api.Test; /** - * {@link DynamicHasher.Builder} tests. + * {@link HasherCollection.Builder} tests. 
*/ public class DynamicHasherBuilderTest { - private DynamicHasher.Builder builder; + private HasherCollection.Builder builder; private final HashFunction hf = new MD5Cyclic(); private final Shape shape = new Shape(hf, 1, 345, 1); private final String testString = HasherBuilderTest.getExtendedString(); @@ -45,7 +45,7 @@ public class DynamicHasherBuilderTest { @Test public void buildTest_byteArray() { final byte[] bytes = testString.getBytes(); - final DynamicHasher hasher = builder.with(bytes).build(); + final HasherCollection hasher = builder.with(bytes).build(); final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); final OfInt iter = hasher.iterator(shape); @@ -60,7 +60,7 @@ public void buildTest_byteArray() { */ @Test public void buildTest_Empty() { - final DynamicHasher hasher = builder.build(); + final HasherCollection hasher = builder.build(); final OfInt iter = hasher.iterator(shape); @@ -79,7 +79,7 @@ public void buildTest_Empty() { @Test public void buildTest_String() { final byte[] bytes = testString.getBytes(StandardCharsets.UTF_8); - final DynamicHasher hasher = builder.with(testString, StandardCharsets.UTF_8).build(); + final HasherCollection hasher = builder.with(testString, StandardCharsets.UTF_8).build(); final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); final OfInt iter = hasher.iterator(shape); @@ -95,7 +95,7 @@ public void buildTest_String() { @Test public void buildTest_UnencodedString() { final byte[] bytes = testString.getBytes(StandardCharsets.UTF_16LE); - final DynamicHasher hasher = builder.withUnencoded(testString).build(); + final HasherCollection hasher = builder.withUnencoded(testString).build(); final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); final OfInt iter = hasher.iterator(shape); @@ -127,6 +127,6 @@ public void buildResetTest() { */ @BeforeEach public void setup() { - builder = new 
DynamicHasher.Builder(hf); + builder = new HasherCollection.Builder(hf); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java index 7b2bbba3e8..b33c2414a5 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java @@ -31,10 +31,10 @@ import org.junit.jupiter.api.Test; /** - * Tests the {@link DynamicHasher}. + * Tests the {@link HasherCollection}. */ public class DynamicHasherTest { - private DynamicHasher.Builder builder; + private HasherCollection.Builder builder; private Shape shape; private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { @@ -70,7 +70,7 @@ public Signedness getSignedness() { */ @BeforeEach public void setup() { - builder = new DynamicHasher.Builder(new MD5Cyclic()); + builder = new HasherCollection.Builder(new MD5Cyclic()); shape = new Shape(new MD5Cyclic(), 3, 72, 17); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java index c3d7c5c51e..70eb633a78 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java @@ -29,7 +29,7 @@ import org.junit.jupiter.api.Test; /** - * Tests the {@link StaticHasher}. + * Tests the {@link SimpleHasher}. */ public class StaticHasherTest { @@ -97,7 +97,7 @@ public Signedness getSignedness() { * @param hasher1 the first static hasher. * @param hasher2 the second static hasher. 
*/ - private void assertSameBits(final StaticHasher hasher1, final StaticHasher hasher2) { + private void assertSameBits(final SimpleHasher hasher1, final SimpleHasher hasher2) { final OfInt iter1 = hasher1.iterator(shape); final OfInt iter2 = hasher2.iterator(shape); @@ -130,7 +130,7 @@ public HashFunctionIdentity getHashFunctionIdentity() { } }; - final StaticHasher hasher = new StaticHasher(testHasher, shape); + final SimpleHasher hasher = new SimpleHasher(testHasher, shape); final OfInt iter = hasher.iterator(shape); for (final int element : expected) { assertTrue(iter.hasNext()); @@ -160,7 +160,7 @@ public HashFunctionIdentity getHashFunctionIdentity() { }; try { - new StaticHasher(testHasher, shape); + new SimpleHasher(testHasher, shape); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // do nothing @@ -175,7 +175,7 @@ public void testConstructor_Iterator() { final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, shape); + final SimpleHasher hasher = new SimpleHasher(iter, shape); assertEquals(5, hasher.size()); assertEquals(shape, hasher.getShape()); @@ -204,7 +204,7 @@ public void testConstructor_Iterator_ValueTooBig() { final int[] values = {shape.getNumberOfBits(), 3, 5, 7, 9, 3, 5, 1}; final Iterator iter = Arrays.stream(values).iterator(); try { - new StaticHasher(iter, shape); + new SimpleHasher(iter, shape); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // do nothing @@ -221,7 +221,7 @@ public void testConstructor_Iterator_ValueTooSmall() { final int[] values = {-1, 3, 5, 7, 9, 3, 5, 1}; final Iterator iter = Arrays.stream(values).iterator(); try { - new StaticHasher(iter, shape); + new SimpleHasher(iter, shape); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // do nothing @@ -235,9 
+235,9 @@ public void testConstructor_Iterator_ValueTooSmall() { public void testConstructor_StaticHasher() { final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; final Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, shape); + final SimpleHasher hasher = new SimpleHasher(iter, shape); - final StaticHasher hasher2 = new StaticHasher(hasher, shape); + final SimpleHasher hasher2 = new SimpleHasher(hasher, shape); assertEquals(shape, hasher2.getShape()); assertSameBits(hasher, hasher2); } @@ -250,10 +250,10 @@ public void testConstructor_StaticHasher() { public void testConstructor_StaticHasher_WrongShape() { final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; final Iterator iter = Arrays.stream(values).iterator(); - final StaticHasher hasher = new StaticHasher(iter, new Shape(testFunctionX, 3, 72, 17)); + final SimpleHasher hasher = new SimpleHasher(iter, new Shape(testFunctionX, 3, 72, 17)); try { - new StaticHasher(hasher, shape); + new SimpleHasher(hasher, shape); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // do nothing @@ -267,7 +267,7 @@ public void testConstructor_StaticHasher_WrongShape() { public void testGetBits() { final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + final SimpleHasher hasher = new SimpleHasher(lst.iterator(), shape); assertEquals(17, hasher.size()); final OfInt iter = hasher.iterator(shape); for (int i = 0; i < 17; i++) { @@ -287,7 +287,7 @@ public void testGetBits_DuplicateValues() { final int[] expected = {1, 2, 3, 6, 7, 10, 11, 13, 15, 17, 19, 23, 24, 25, 35, 36, 39, 43, 44, 45, 48, 49, 53, 55, 57, 59, 61, 62, 63, 65, 69, 70}; - final StaticHasher hasher = new StaticHasher(Arrays.stream(input).iterator(), shape); + final SimpleHasher hasher = new SimpleHasher(Arrays.stream(input).iterator(), shape); final OfInt iter = 
hasher.iterator(shape); for (final int element : expected) { @@ -303,7 +303,7 @@ public void testGetBits_DuplicateValues() { @Test public void testGetBits_WrongShape() { final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final StaticHasher hasher = new StaticHasher(lst.iterator(), shape); + final SimpleHasher hasher = new SimpleHasher(lst.iterator(), shape); try { hasher.iterator(new Shape(testFunctionX, 3, 72, 17)); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java deleted file mode 100644 index 5498d699cb..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/AbstractHashFunctionTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.junit.jupiter.api.Test; - -/** - * Tests the signature of a hash function. - */ -public abstract class AbstractHashFunctionTest { - - /** - * Test that the signature is properly generated. - */ - @Test - public void signatureTest() { - final HashFunction hf = createHashFunction(); - final long expected = hf.apply(HashFunctionIdentity.prepareSignatureBuffer(hf), 0); - assertEquals(expected, hf.getSignature()); - // Should be repeatable - final long expected2 = hf.apply(HashFunctionIdentity.prepareSignatureBuffer(hf), 0); - assertEquals(expected, expected2); - assertEquals("Apache Commons Collections", hf.getProvider()); - } - - /** - * Creates the hash function. - * - * @return the hash function - */ - protected abstract HashFunction createHashFunction(); -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java deleted file mode 100644 index 9b0d9a83e1..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/MD5CyclicTest.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Tests the MD5 cyclic hash function. - */ -public class MD5CyclicTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final MD5Cyclic md5 = new MD5Cyclic(); - final long l1 = 0x8b1a9953c4611296L; - final long l2 = 0xa827abf8c47804d7L; - final byte[] buffer = "Hello".getBytes(); - - long l = md5.apply(buffer, 0); - assertEquals(l1, l); - l = md5.apply(buffer, 1); - assertEquals(l1 + l2, l); - l = md5.apply(buffer, 2); - assertEquals(l1 + l2 + l2, l); - } - - @Override - protected HashFunction createHashFunction() { - return new MD5Cyclic(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java deleted file mode 100644 index 9e17c2ec89..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur128x64CyclicTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Test that the Murmur3 128 x64 hash function works correctly. - */ -public class Murmur128x64CyclicTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. 
- */ - @Test - public void applyTest() { - final Murmur128x64Cyclic murmur = new Murmur128x64Cyclic(); - - final long l1 = 0xe7eb60dabb386407L; - final long l2 = 0xc3ca49f691f73056L; - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = murmur.apply(buffer, 0); - assertEquals(l1, l); - l = murmur.apply(buffer, 1); - assertEquals(l1 + l2, l); - l = murmur.apply(buffer, 2); - assertEquals(l1 + l2 + l2, l); - } - - @Override - protected HashFunction createHashFunction() { - return new Murmur128x64Cyclic(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java deleted file mode 100644 index bca60c1e4b..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/Murmur32x86IterativeTest.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Test that the Murmur3 32 x86 hash function works correctly. - */ -public class Murmur32x86IterativeTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final Murmur32x86Iterative murmur = new Murmur32x86Iterative(); - - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = murmur.apply(buffer, 0); - assertEquals(82674681, l); - l = murmur.apply(buffer, 1); - assertEquals(-1475490736, l); - l = murmur.apply(buffer, 2); - assertEquals(-1561435247, l); - } - - @Override - protected HashFunction createHashFunction() { - return new Murmur32x86Iterative(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java deleted file mode 100644 index 5595efdc77..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/function/ObjectsHashIterativeTest.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunction; -import org.junit.jupiter.api.Test; - -/** - * Tests that the Objects hash works correctly. - */ -public class ObjectsHashIterativeTest extends AbstractHashFunctionTest { - - /** - * Test that the apply function returns the proper values. - */ - @Test - public void applyTest() { - final ObjectsHashIterative obj = new ObjectsHashIterative(); - - final byte[] buffer = "Now is the time for all good men to come to the aid of their country" - .getBytes(StandardCharsets.UTF_8); - - long l = obj.apply(buffer, 0); - long prev = 0; - assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); - for (int i = 1; i <= 5; i++) { - prev += l; - l = obj.apply(buffer, i); - assertEquals(Arrays.deepHashCode(new Object[] {prev, buffer}), l); - } - } - - @Override - protected HashFunction createHashFunction() { - return new ObjectsHashIterative(); - } -} From 7da9f213cb66467f1753ec08bff782f9a381a15a Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sun, 3 Oct 2021 15:54:20 +0100 Subject: [PATCH 02/27] First set with complete test cases. 
--- .../bloomfilter/ArrayCountingBloomFilter.java | 183 ++++--- .../bloomfilter/BitCountProducer.java | 103 ++++ .../bloomfilter/BitMapProducer.java | 26 + .../collections4/bloomfilter/BloomFilter.java | 67 ++- .../bloomfilter/CountingBloomFilter.java | 66 ++- .../bloomfilter/IndexFilters.java | 84 --- .../bloomfilter/IndexProducer.java | 22 + .../bloomfilter/SetOperations.java | 121 +++++ .../collections4/bloomfilter/Shape.java | 460 ++++++++++++++++ .../bloomfilter/SimpleBloomFilter.java | 31 +- .../bloomfilter/SparseBloomFilter.java | 50 +- .../bloomfilter/hasher/Hasher.java | 2 + .../bloomfilter/hasher/HasherCollection.java | 2 + .../bloomfilter/hasher/Shape.java | 196 ------- .../bloomfilter/hasher/SimpleHasher.java | 2 + .../bloomfilter/hasher/package-info.java | 3 +- .../bloomfilter/AbstractBloomFilterTest.java | 60 ++- .../AbstractCountingBloomFilterTest.java | 279 ++++++++++ .../ArrayCountingBloomFilterTest.java | 498 +---------------- .../collections4/bloomfilter/BitMaptTest.java | 1 - .../bloomfilter/BloomFilterIndexerTest.java | 95 ---- .../DefaultBloomFilterMethodsTest.java | 98 ---- .../bloomfilter/FixedIndexesTestHasher.java | 62 --- .../bloomfilter/IndexFilterTest.java | 106 ---- .../bloomfilter/SetOperationsTest.java | 262 ++------- .../bloomfilter/ShapeFactoryTest.java | 229 ++++++++ .../collections4/bloomfilter/ShapeTest.java | 456 ++++++++++++++++ .../bloomfilter/SimpleBloomFilterTest.java | 1 - .../bloomfilter/SparseBloomFilterTest.java | 1 - .../hasher/DynamicHasherBuilderTest.java | 132 ----- .../bloomfilter/hasher/DynamicHasherTest.java | 136 ----- .../hasher/HashFunctionIdentityImplTest.java | 84 --- .../hasher/HashFunctionValidatorTest.java | 120 ----- .../bloomfilter/hasher/HasherBuilderTest.java | 113 ---- .../hasher/HasherCollectionTest.java | 58 ++ .../bloomfilter/hasher/ShapeTest.java | 500 ------------------ .../bloomfilter/hasher/SimpleHasherTest.java | 52 ++ .../bloomfilter/hasher/StaticHasherTest.java | 315 ----------- 38 files 
changed, 2177 insertions(+), 2899 deletions(-) create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java 
create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index c7afc71cae..4d9d9d5040 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -18,13 +18,15 @@ import java.util.BitSet; import java.util.NoSuchElementException; +import java.util.Objects; import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; import java.util.stream.IntStream; +import org.apache.commons.collections4.bloomfilter.BloomFilter.BitMap; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** * A counting Bloom filter using an array to track counts for each enabled bit @@ -136,8 +138,10 @@ public int nextInt() { * Constructs an empty counting Bloom filter with the specified shape. * * @param shape the shape of the filter + * */ public ArrayCountingBloomFilter(final Shape shape) { + Objects.requireNonNull( shape, "shape"); this.shape = shape; counts = new int[shape.getNumberOfBits()]; } @@ -154,18 +158,18 @@ public int cardinality() { @Override public boolean contains(final BloomFilter other) { - // The AbstractBloomFilter implementation converts both filters to long[] bits. 
- // This would involve checking all indexes in this filter against zero. - // Ideally we use an iterator of bit indexes to allow fail-fast on the - // first bit index that is zero. - if (other instanceof ArrayCountingBloomFilter) { - return contains(((ArrayCountingBloomFilter) other).iterator()); + Objects.requireNonNull( other, "other"); + try { + other.forEachIndex( idx -> {if ( this.counts[idx] == 0 ) { throw new ArrayCountingBloomFilter.NoMatchException(); }} ); + } catch (NoMatchException e) { + return false; } - return CountingBloomFilter.super.contains(other); + return true; } @Override public boolean contains(final Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); return contains(hasher.iterator(getShape())); } @@ -206,39 +210,64 @@ private PrimitiveIterator.OfInt iterator() { return new IndexIterator(); } + protected ArrayCountingBloomFilter makeClone() { + ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); + filter.add( this ); + filter.state = this.state; + return filter; + } + + @Override + public CountingBloomFilter merge(BloomFilter other) { + Objects.requireNonNull( other, "other"); + CountingBloomFilter filter = makeClone(); + filter.add( BitCountProducer.Factory.simple( other )); + return filter; + } + + @Override + public CountingBloomFilter merge(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); + ArrayCountingBloomFilter filter = makeClone(); + filter.mergeInPlace( hasher ); + return filter; + } + @Override public boolean mergeInPlace(final BloomFilter other) { - applyAsBloomFilter(other, this::increment); - return isValid(); + Objects.requireNonNull( other, "other"); + return add( BitCountProducer.Factory.simple(other) ); } @Override public boolean mergeInPlace(final Hasher hasher) { - applyAsHasher(hasher, this::increment); - return isValid(); + Objects.requireNonNull( hasher, "hasher"); + return add( BitCountProducer.Factory.from( shape, hasher )); } @Override public boolean remove(final 
BloomFilter other) { - applyAsBloomFilter(other, this::decrement); - return isValid(); + Objects.requireNonNull( other, "other"); + return subtract( BitCountProducer.Factory.simple(other)); } @Override public boolean remove(final Hasher hasher) { - applyAsHasher(hasher, this::decrement); - return isValid(); + Objects.requireNonNull( hasher, "hasher"); + return subtract( BitCountProducer.Factory.from( shape, hasher )); } @Override - public boolean add(final CountingBloomFilter other) { - applyAsCountingBloomFilter(other, this::add); + public boolean add(final BitCountProducer other) { + Objects.requireNonNull( other, "other"); + other.forEachCount(this::add); return isValid(); } @Override - public boolean subtract(final CountingBloomFilter other) { - applyAsCountingBloomFilter(other, this::subtract); + public boolean subtract(final BitCountProducer other) { + Objects.requireNonNull( other, "other"); + other.forEachCount(this::subtract); return isValid(); } @@ -262,66 +291,34 @@ public boolean isValid() { } @Override - public void forEachCount(final BitCountConsumer action) { + public void forEachCount(final BitCountProducer.BitCountConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); for (int i = 0; i < counts.length; i++) { if (counts[i] != 0) { - action.accept(i, counts[i]); + consumer.accept(i, counts[i]); } } } - /** - * Apply the action for each index in the Bloom filter. 
- */ - private void applyAsBloomFilter(final BloomFilter other, final IntConsumer action) { - if (other instanceof ArrayCountingBloomFilter) { - // Only use the presence of non-zero and not the counts - final int[] counts2 = ((ArrayCountingBloomFilter) other).counts; - for (int i = 0; i < counts2.length; i++) { - if (counts2[i] != 0) { - action.accept(i); - } + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + for (int i = 0; i < counts.length; i++) { + if (counts[i] != 0) { + consumer.accept(i); } - } else { - BitSet.valueOf(other.getBits()).stream().forEach(action); } } - /** - * Apply the action for each index in the hasher. - */ - private void applyAsHasher(final Hasher hasher, final IntConsumer action) { - // We do not naturally handle duplicates so filter them. - IndexFilters.distinctIndexes(hasher, getShape(), action); - } - - /** - * Apply the action for each index in the Bloom filter. - */ - private void applyAsCountingBloomFilter(final CountingBloomFilter other, final BitCountConsumer action) { - other.forEachCount(action); - } - - /** - * Increment to the count for the bit index. - * - * @param idx the index - */ - private void increment(final int idx) { - final int updated = counts[idx] + 1; - state |= updated; - counts[idx] = updated; - } - - /** - * Decrement from the count for the bit index. 
- * - * @param idx the index - */ - private void decrement(final int idx) { - final int updated = counts[idx] - 1; - state |= updated; - counts[idx] = updated; + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + if (cardinality() == 0) { + return; + } + BitMapBuilder builder = new BitMapBuilder( consumer ); + forEachIndex( builder ); + builder.finish(); } /** @@ -330,7 +327,7 @@ private void decrement(final int idx) { * @param idx the index * @param addend the amount to add */ - private void add(final int idx, final int addend) { + protected void add(final int idx, final int addend) { final int updated = counts[idx] + addend; state |= updated; counts[idx] = updated; @@ -342,7 +339,7 @@ private void add(final int idx, final int addend) { * @param idx the index * @param subtrahend the amount to subtract */ - private void subtract(final int idx, final int subtrahend) { + protected void subtract(final int idx, final int subtrahend) { final int updated = counts[idx] - subtrahend; state |= updated; counts[idx] = updated; @@ -357,4 +354,46 @@ public int[] getIndices() { public Shape getShape() { return shape; } + + private static class BitMapBuilder implements IntConsumer { + + LongConsumer consumer; + long bucket = 0; + long bucektIdx=0; + + BitMapBuilder( LongConsumer consumer ) { + this.consumer = consumer; + } + + @Override + public void accept( int i ) { + int nextIndex = BitMap.getLongIndex( i ); + while (nextIndex > bucektIdx) + { + consumer.accept(bucket); + bucket =0; + bucektIdx++; + } + bucket |= BitMap.getLongBit( i ); + } + + public void finish() { + if (bucket != 0) { + consumer.accept( bucket ); + } + } + } + + /** + * An exception thrown when no match was found in the byte buffer. 
+ * + */ + private class NoMatchException extends RuntimeException { + + public NoMatchException() { + super(); + } + + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java new file mode 100644 index 0000000000..bd2ccf95e3 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -0,0 +1,103 @@ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Consumer; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * Produces bit counts for counting type Bloom filters. + * + */ +public interface BitCountProducer { + + /** + * Performs the given action for each {@code } pair where the count is non-zero. + * Any exceptions thrown by the action are relayed to the caller. + * + * Must only process each index once, and must process indexes in order. + * + * @param consumer the action to be performed for each non-zero bit count + * @throws NullPointerException if the specified action is null + */ + void forEachCount(BitCountConsumer consumer); + + /** + * Factory to construct BitCountProducers from common Bloom filter and Hashers. + * + */ + public static class Factory { + /** + * Creates a BitCountProducer from a bloom filter. + * + * If the filter implements the BitCountProducer it is returned unchanged. + * If the filter does not implement the BitCountProducer each enabled bit is + * returned with a count of one (1). + * + * @param filter the Bloom filter to count. + * @return The BitCountProducer for the Bloom filter. + */ + public static BitCountProducer from( BloomFilter filter ) { + return (filter instanceof BitCountProducer) ? (BitCountProducer) filter : simple( filter ); + } + + /** + * Create a BitCountProducer from a bloom filter without regard to previous BitCountProducer + * implementation. 
+ * + * for each enabled bit a count of 1 is returned. + * + * @param filter The Bloom filter to create the BitCountProducer from. + * @return the BitCountProducer for the Bloom filter. + */ + public static BitCountProducer simple( BloomFilter filter ) { + return new BitCountProducer() { + + @Override + public void forEachCount(BitCountConsumer consumer) { + for (int i : filter.getIndices() ) + { + consumer.accept(i, 1); + } + } + }; + } + + /** + * Creates a Bit count producer from a shape and hasher. + * @param shape The shape to use + * @param hasher the hasher to use. + * @return A BitCountProducer for the hasher produced values. + */ + public static BitCountProducer from( Shape shape, Hasher hasher ) { + return new BitCountProducer() { + + @Override + public void forEachCount(BitCountConsumer consumer) { + final Set distinct = new TreeSet<>(); + hasher.iterator(shape).forEachRemaining((Consumer) distinct::add ); + distinct.forEach( i -> consumer.accept(i, 1)); + } + }; + } + } + + /** + * Represents an operation that accepts an {@code } pair representing + * the count for a bit index in a Bit Count Producer Bloom filter and returns no result. + * + *

Note: This is a functional interface as a primitive type specialization of + * {@link java.util.function.BiConsumer} for {@code int}. + */ + @FunctionalInterface + interface BitCountConsumer { + /** + * Performs this operation on the given {@code <index, count>} pair. + * + * @param index the bit index + * @param count the count at the specified bit index + */ + void accept(int index, int count); + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java new file mode 100644 index 0000000000..f7f18ce994 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -0,0 +1,26 @@ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Consumer; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * Interface that produces bit map long values in a Bloom filter. + * + */ +public interface BitMapProducer { + + /** + * Performs the given action for each bit map {@code long} that comprises the Bloom filter. + * Any exceptions thrown by the action are relayed to the caller. 
+ * + * @param consumer the action to be performed for each bit map long + * @throws NullPointerException if the specified action is null + */ + void forEachBitMap(LongConsumer consumer); + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index defb01f424..a17d89a6f2 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -16,18 +16,23 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.NoSuchElementException; +import java.util.Objects; import java.util.PrimitiveIterator; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; +import org.apache.commons.collections4.bloomfilter.BloomFilter.BitMap; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** * The interface that describes a Bloom filter. * @since 4.5 */ -public interface BloomFilter { +public interface BloomFilter extends IndexProducer, BitMapProducer { // Query Operations @@ -49,14 +54,34 @@ public interface BloomFilter { * * @return the {@code long[]} representation of this filter */ - long[] getBits(); + default long[] getBits() { + + if (cardinality() == 0) { + return new long[0]; + } + + BitBuilder consumer = new BitBuilder(getShape()); + forEachBitMap( consumer ); + return consumer.trim(); + } /** * Gets an array of indices of bits that are enabled. * Array must be in sorted order. * @return an array of indices for bits that are enabled in the filter. 
*/ - int[] getIndices(); + default int[] getIndices() { + int[] result = new int[ cardinality() ]; + IntConsumer consumer = new IntConsumer() { + int idx = 0; + @Override + public void accept(int i) { + result[idx++] = i; + } + }; + forEachIndex( consumer ); + return result; + } /** * Gets the shape that was used when the filter was built. @@ -74,6 +99,7 @@ public interface BloomFilter { * @return true if all enabled bits in the other filter are enabled in this filter. */ default boolean contains(BloomFilter other) { + Objects.requireNonNull( other, "other"); if (isSparse()) { int[] myIndicies = getIndices(); if (other.isSparse()) { @@ -126,6 +152,7 @@ default boolean contains(BloomFilter other) { * this filter */ default boolean contains(Hasher hasher) { + Objects.requireNonNull( hasher, "Hasher"); Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions()), shape ) ? new SparseBloomFilter(getShape(), hasher) : @@ -149,6 +176,7 @@ default boolean contains(Hasher hasher) { * the shape of this filter */ default BloomFilter merge(BloomFilter other) { + Objects.requireNonNull( other, "other"); Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (cardinality() + other.cardinality()), getShape() ) ? new SparseBloomFilter(shape) : @@ -173,6 +201,7 @@ default BloomFilter merge(BloomFilter other) { * this filter */ default BloomFilter merge(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions())+ cardinality(), shape ) ? new SparseBloomFilter(shape, hasher) : @@ -185,6 +214,7 @@ default BloomFilter merge(Hasher hasher) { boolean mergeInPlace(BloomFilter other); default boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions())+cardinality(),shape ) ? 
new SparseBloomFilter(getShape(), hasher) : @@ -199,6 +229,7 @@ default boolean mergeInPlace(Hasher hasher) { * @return true if the filter is full. */ default boolean isFull(Shape shape) { + Objects.requireNonNull( shape, "shape"); return cardinality() == shape.getNumberOfBits(); } @@ -217,8 +248,8 @@ default boolean isFull(Shape shape) { * Estimates the number of items in the Bloom filter. * @return an estimate of the number of items in the bloom filter. */ - default double estimateN() { - return getShape().estimate_n( cardinality() ); + default int estimateN() { + return (int) Math.round( getShape().estimateN( cardinality() )); } /** @@ -226,7 +257,8 @@ default double estimateN() { * @param other The other Bloom filter * @return an estimate of the number of items in the union. */ - default double estimateUnion( BloomFilter other) { + default int estimateUnion( BloomFilter other) { + Objects.requireNonNull( other, "other"); return this.merge( other ).estimateN(); } @@ -235,7 +267,8 @@ default double estimateUnion( BloomFilter other) { * @param other The other Bloom filter * @return an estimate of the number of items in the intersection. */ - default double estimateIntersection( BloomFilter other) { + default int estimateIntersection( BloomFilter other) { + Objects.requireNonNull( other, "other"); return estimateN() + other.estimateN() - estimateUnion( other ); } @@ -250,7 +283,7 @@ class BitIterator implements PrimitiveIterator.OfInt { private int next; /** - * Constructs a bit iterator from an array fo bit maps + * Constructs a bit iterator from an array of bit maps * @param bits the array of bit maps. 
*/ BitIterator( long[] bits ) { @@ -393,4 +426,20 @@ public static boolean isSparse( int cardinality, Shape shape ) { } + public class BitBuilder implements LongConsumer { + private long[] result; + private int idx=0; + public BitBuilder( Shape shape ) { + result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits() )]; + } + @Override + public void accept(long bitmap) { + result[idx++] = bitmap; + } + + public long[] trim() { + return Arrays.copyOf( result, idx ); + } + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 40b71e471c..c0d89e3a5e 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -16,6 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; +import org.apache.commons.collections4.bloomfilter.BloomFilter.BitMap; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** @@ -52,25 +53,9 @@ * * @since 4.5 */ -public interface CountingBloomFilter extends BloomFilter { +public interface CountingBloomFilter extends BloomFilter, BitCountProducer { - /** - * Represents an operation that accepts an {@code } pair representing - * the count for a bit index in a counting Bloom filter and returns no result. - * - *

Note: This is a functional interface as a primitive type specialization of - * {@link java.util.function.BiConsumer} for {@code int}. - */ - @FunctionalInterface - interface BitCountConsumer { - /** - * Performs this operation on the given {@code } pair. - * - * @param index the bit index - * @param count the count at the specified bit index - */ - void accept(int index, int count); - } + // Query Operations @@ -93,15 +78,6 @@ interface BitCountConsumer { */ boolean isValid(); - /** - * Performs the given action for each {@code } pair where the count is non-zero. - * Any exceptions thrown by the action are relayed to the caller. - * - * @param action the action to be performed for each non-zero bit count - * @throws NullPointerException if the specified action is null - */ - void forEachCount(BitCountConsumer action); - // Modification Operations @@ -152,7 +128,7 @@ interface BitCountConsumer { * the shape of this filter * @see #isValid() */ - boolean add(CountingBloomFilter other); + boolean add(BitCountProducer other); /** * Adds the specified counting Bloom filter to this Bloom filter. Specifically @@ -167,5 +143,37 @@ interface BitCountConsumer { * the shape of this filter * @see #isValid() */ - boolean subtract(CountingBloomFilter other); + boolean subtract(BitCountProducer other); + + /** + * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes + * that are enabled in the {@code other} filter will be enabled in this filter. + * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter is not ensured to contain + * the {@code other} Bloom filter. + * + * @param other the other Bloom filter + * @return true if the merge was successful + * @throws IllegalArgumentException if the shape of the other filter does not match + * the shape of this filter + */ + @Override + CountingBloomFilter merge(BloomFilter other); + + /** + * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all + * bit indexes that are identified by the {@code hasher} will be enabled in this filter. + * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter is not ensured to contain + * the specified decomposed Bloom filter. + * + * @param hasher the hasher to provide the indexes + * @return true if the merge was successful + * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of + * this filter + */ + @Override + CountingBloomFilter merge(Hasher hasher); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java deleted file mode 100644 index e4adb4fc66..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilters.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -import java.util.Objects; -import java.util.Set; -import java.util.TreeSet; -import java.util.function.Consumer; -import java.util.function.IntConsumer; - -/** - * Contains functions to filter indexes. - */ -final class IndexFilters { - /** Do not instantiate. */ - private IndexFilters() { - } - - /** - * Transfer all distinct indexes in the specified {@code hasher} generated for the - * specified {@code shape} to the specified {@code consumer}. For example this - * can be used to merge a {@link Hasher} representation of a Bloom filter into a - * {@link BloomFilter} instance that does not naturally handle duplicate indexes. - * - *

This method is functionally equivalent to: - * - *

-     *     final Set<Integer> distinct = new TreeSet<>();
-     *     hasher.iterator(shape).forEachRemaining((Consumer<Integer>) i -> {
-     *         if (distinct.add(i)) {
-     *             consumer.accept(i);
-     *         }
-     *     });
-     * 
- * - * @param hasher the hasher - * @param shape the shape - * @param consumer the consumer to receive distinct indexes - * @throws NullPointerException if the hasher, shape or action are null - * @see Hasher#iterator(Shape) - */ - static void distinctIndexes(final Hasher hasher, final Shape shape, final IntConsumer consumer) { - Objects.requireNonNull(hasher, "hasher"); - Objects.requireNonNull(shape, "shape"); - Objects.requireNonNull(consumer, "consumer"); - - // TODO - // This function can be optimised based on the expected size - // (number of indexes) of the hasher and the number of bits in the shape. - // - // A large size would benefit from a pre-allocated BitSet-type filter. - // A very small size may be more efficient as a simple array of values - // that have already been seen that is scanned for each new index. - // - // A default is to use a Set to filter distinct values. The choice of set - // should be evaluated. A HashSet would be optimal if size is known. - // A TreeSet has lower memory consumption and performance is not as - // sensitive to knowing the size in advance. 
- - final Set distinct = new TreeSet<>(); - hasher.iterator(shape).forEachRemaining((Consumer) i -> { - if (distinct.add(i)) { - consumer.accept(i); - } - }); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java new file mode 100644 index 0000000000..dbf41dbe94 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -0,0 +1,22 @@ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Consumer; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +public interface IndexProducer { + + /** + * Performs the given action for each {@code index} that represents an enabled bit. + * Any exceptions thrown by the action are relayed to the caller. + * + * @param consumer the action to be performed for each non-zero bit index. + * @throws NullPointerException if the specified action is null + */ + void forEachIndex(IntConsumer consumer); + +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java new file mode 100644 index 0000000000..47bb8dd37c --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.BitSet; + +/** + * Implementations of set operations on Bloom filters. + * + */ +public final class SetOperations { + + /** + * Calculates the Cosine distance between two Bloom filters. + * + *

Cosine distance is defined as {@code 1 - Cosine similarity}

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Cosine distance. + */ + public static double cosineDistance(final BloomFilter first, final BloomFilter second) { + return 1.0 - cosineSimilarity(first, second); + } + + private static BitSet and(final BloomFilter first, final BloomFilter second) { + BitSet result = BitSet.valueOf(first.getBits()); + result.and(BitSet.valueOf(second.getBits())); + return result; + } + + private static BitSet or(final BloomFilter first, final BloomFilter second) { + BitSet result = BitSet.valueOf(first.getBits()); + result.or(BitSet.valueOf(second.getBits())); + return result; + } + + private static BitSet xor(final BloomFilter first, final BloomFilter second) { + BitSet result = BitSet.valueOf(first.getBits()); + result.xor(BitSet.valueOf(second.getBits())); + return result; + } + + /** + * Calculates the Cosine similarity between two Bloom filters. + *

Also known as Orchini similarity and the Tucker coefficient of congruence or + * Ochiai similarity.

+ * + *

If either filter is empty (no enabled bits) the result is 0 (zero)

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Cosine similarity. + */ + public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { + final int numerator = and( first, second).cardinality(); + return numerator == 0 ? 0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); + } + + + + /** + * Calculates the Hamming distance between two Bloom filters. + * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Hamming distance. + */ + public static int hammingDistance(final BloomFilter first, final BloomFilter second) { + return xor(first,second).cardinality(); + } + + /** + * Calculates the Jaccard distance between two Bloom filters. + * + *

Jaccard distance is defined as {@code 1 - Jaccard similarity}

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Jaccard distance. + */ + public static double jaccardDistance(final BloomFilter first, final BloomFilter second) { + return 1.0 - jaccardSimilarity(first, second); + } + + /** + * Calculates the Jaccard similarity between two Bloom filters. + * + *

Also known as Jaccard index, Intersection over Union, and Jaccard similarity coefficient

+ * + * @param first the first Bloom filter. + * @param second the second Bloom filter. + * @return the Jaccard similarity. + */ + public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { + final int orCard = or(first,second).cardinality(); + // similarity is |intersection| / |union|; an empty union implies an empty intersection, so report 0. + return orCard == 0 ? 0 : and(first, second).cardinality() / (double) orCard; + } + + + /** + * Do not instantiate. + */ + private SetOperations() {} +} + diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java new file mode 100644 index 0000000000..ff028fc506 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -0,0 +1,460 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; + +/** + * The definition of a Bloom filter shape. + * + *

This class contains the values for the filter configuration and is used to + * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are + * compatible. (i.e. can be compared or merged)

+ * + *

Interrelatedness of values

+ * + *
Number of Items ({@code n})
+ *
{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}
Probability of + * False Positives ({@code p})
{@code p = pow(1 - exp(-k / (m / n)), k)}
Number + * of Bits ({@code m})
+ *
{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
Number of + * Functions ({@code k})
{@code k = round((m / n) * ln(2))}
+ * + *

Comparisons

For purposes of equality checking and hashCode + * calculations a {@code Shape} is defined by the number of + * bits ({@code m}) and the number of hash functions ({@code k}).

+ * + * @see Bloom Filter calculator + * @see Bloom filter + * [Wikipedia] + * @since 4.5 + */ +public final class Shape { + + /** + * Number of hash functions to create a filter ({@code k}). + */ + private final int numberOfHashFunctions; + + /** + * Number of bits in the filter ({@code m}). + */ + private final int numberOfBits; + + + + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * bits ({@code m}). + * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. + * @param numberOfBits The number of bits in the filter + * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} + */ + public Shape(final int numberOfHashFunctions, final int numberOfBits) { + this.numberOfBits = checkNumberOfBits(numberOfBits); + this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); + } + + /** + * Check number of bits is strictly positive. + * + * @param numberOfBits the number of bits + * @return the number of bits + * @throws IllegalArgumentException if the number of bits is {@code < 1} + */ + private static int checkNumberOfBits(final int numberOfBits) { + if (numberOfBits < 1) { + throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); + } + return numberOfBits; + } + + /** + * Check number of hash functions is strictly positive + * + * @param numberOfHashFunctions the number of hash functions + * @return the number of hash functions + * @throws IllegalArgumentException if the number of hash functions is {@code < 1} + */ + private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); + } + return numberOfHashFunctions; + } + + @Override + public boolean equals(final Object o) { + if (o instanceof Shape) { + final Shape other = (Shape) o; + return numberOfBits == other.numberOfBits && + numberOfHashFunctions == other.numberOfHashFunctions; + } + return false; + } + + @Override + public int hashCode() { + return Objects.hash(numberOfBits, 
numberOfHashFunctions); + } + + /** + * Gets the number of bits in the Bloom filter. + * This is also known as {@code m}. + * + * @return the number of bits in the Bloom filter ({@code m}). + */ + public int getNumberOfBits() { + return numberOfBits; + } + + + /** + * Gets the number of hash functions used to construct the filter. + * This is also known as {@code k}. + * + * @return the number of hash functions used to construct the filter ({@code k}). + */ + public int getNumberOfHashFunctions() { + return numberOfHashFunctions; + } + + + /** + * Calculates the probability of false positives ({@code p}) given + * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). + *

p = pow(1 - exp(-k / (m / n)), k)
+ * + *

This is the probability that a Bloom filter will return true for the presence of an item + * when it does not contain the item. + * + *

The probability assumes that the Bloom filter is filled with the expected number of + * items. If the filter contains fewer items then the actual probability will be lower. + * Thus this returns the worst-case false positive probability for a filter that has not + * exceeded its expected number of items. + * + * @param numberOfItems the number of items hashed into the Bloom filter. + * @return the probability of false positives. + * @see #getNumberOfItems() + */ + public double getProbability(int numberOfItems) { + if (numberOfItems < 0) { + throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); + } + if (numberOfItems == 0) { + return 0; + } + return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), + numberOfHashFunctions); + } + + @Override + public String toString() { + return String.format("Shape[ m=%s k=%s ]", + numberOfBits, numberOfHashFunctions); + } + + public double estimateN( int hammingValue ) { + double c = hammingValue; + double m = numberOfBits; + double k = numberOfHashFunctions; + return -(m / k) * Math.log(1.0 - (c / m)); + } + + /** + * The factory to assist in the creation of proper Shapes. + * + * In the methods of this factory the `from` names are appended with the standard variable + * names in the order expected: + * + *

+ *
{@code N})
The number of items to be placed in the Bloom filter
+ *
{@code M})
The number of bits in the Bloom filter
+ *
{@code K})
The number of hash functions for each item placed in the Bloom filter
+ *
{@code P})
The probability of a collision once N items have been placed in the Bloom filter
+ *
+ */ + public static class Factory { + + + /** + * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. + */ + private static final double LN_2 = Math.log(2.0); + + /** + * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. + * + *

ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) + */ + private static final double DENOMINATOR = -LN_2 * LN_2; + + /** + * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the + * specified number of bits ({@code m}) and hash functions ({@code k}). + * + *

The number of items ({@code n}) to be stored in the filter is computed. + *

n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
+ * + *

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated Bloom filter capacity + * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @param numberOfBits The number of bits in the filter + * @param numberOfHashFunctions The number of hash functions in the filter + * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; + * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual + * probability is {@code >= 1.0} + */ + public static Shape fromPMK(final double probability, final int numberOfBits, + final int numberOfHashFunctions) { + checkProbability(probability); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); + + // Number of items (n): + // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) + final double n = Math.ceil(numberOfBits / + (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); + + // log of probability is always < 0 + // number of hash functions is >= 1 + // e^x where x < 0 = [0,1) + // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 + // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 + // ceil( >0 ) >= 1 + // so we can not produce a negative value thus we don't check for it. + // + // similarly we can not produce a number greater than numberOfBits so we + // do not have to check for Integer.MAX_VALUE either. 
+ + + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( (int) n )); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * desired false-positive probability ({@code p}). + * + *

The number of bits ({@code m}) for the filter is computed. + *

m = ceil(n * ln(p) / ln(1 / 2^ln(2)))
+ * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated number of bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability + * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} + * @see #getProbability() + */ + public static Shape fromNP (final int numberOfItems, final double probability) { + checkNumberOfItems(numberOfItems); + checkProbability(probability); + + // Number of bits (m) + final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); + if (m > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); + } + int numberOfBits = (int) m; + + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( numberOfItems )); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * bits ({@code m}). + * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param numberOfBits The number of bits in the filter + * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; + * if the calculated number of hash function is {@code < 1}; + * or if the actual probability is {@code >= 1.0} + * @see #getProbability() + */ + public static Shape fromNM(final int numberOfItems, final int numberOfBits) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( numberOfItems )); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items, bits + * and hash functions. + * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param numberOfBits The number of bits in the filter. + * @param numberOfHashFunctions The number of hash functions in the filter + * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; + * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} + * @see #getProbability() + */ + public static Shape fromNMK (final int numberOfItems, final int numberOfBits, + final int numberOfHashFunctions) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); + // check that probability is within range + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( numberOfItems )); + return shape; + } + + /** + * Check number of items is strictly positive. + * + * @param numberOfItems the number of items + * @return the number of items + * @throws IllegalArgumentException if the number of items is {@code < 1} + */ + private static int checkNumberOfItems(final int numberOfItems) { + if (numberOfItems < 1) { + throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); + } + return numberOfItems; + } + + /** + * Check number of bits is strictly positive. 
+ * + * @param numberOfBits the number of bits + * @return the number of bits + * @throws IllegalArgumentException if the number of bits is {@code < 1} + */ + private static int checkNumberOfBits(final int numberOfBits) { + if (numberOfBits < 1) { + throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); + } + return numberOfBits; + } + + /** + * Check number of hash functions is strictly positive + * + * @param numberOfHashFunctions the number of hash functions + * @return the number of hash functions + * @throws IllegalArgumentException if the number of hash functions is {@code < 1} + */ + private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); + } + return numberOfHashFunctions; + } + + /** + * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. + * + * @param probability the probability + * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} + */ + private static void checkProbability(final double probability) { + // Using the negation of within the desired range will catch NaN + if (!(probability > 0.0 && probability < 1.0)) { + throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); + } + } + + /** + * Check the calculated probability is {@code < 1.0}. + * + *

This function is used to verify that the dynamically calculated probability for the + * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon + * construction. + * + * @param probability the probability + * @throws IllegalArgumentException if the probability is {@code >= 1.0} + */ + private static void checkCalculatedProbability(final double probability) { + // We do not need to check for p <= 0.0 since we only allow positive values for + // parameters and the closest we can come to exp(-kn/m) == 1 is + // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will + // always be 0<x<1 and y>0 + if (probability >= 1.0) { + throw new IllegalArgumentException( + String.format("Calculated probability is greater than or equal to 1: " + probability)); + } + } + + /** + * Calculates the number of hash functions given numberOfItems and numberofBits. + * This is a method so that the calculation is consistent across all constructors. + * + * @param numberOfItems the number of items in the filter. + * @param numberOfBits the number of bits in the filter. + * @return the optimal number of hash functions. + * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} + */ + private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { + // k = round((m / n) * ln(2)) We change order so that we use real math rather + // than integer math. + final long k = Math.round(LN_2 * numberOfBits / numberOfItems); + if (k < 1) { + throw new IllegalArgumentException( + String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); + } + // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but + // since numberOfBits is at most Integer.MAX_VALUE the numerator of + // numberofHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the + // value of k can not be above Integer.MAX_VALUE.
+ return (int) k; + } + + + + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index fc8b2eda3c..e8ab3bea80 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -18,10 +18,11 @@ import java.util.Arrays; import java.util.BitSet; +import java.util.Objects; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** * A bloom filter using a Java BitSet to track enabled bits. This is a standard @@ -41,19 +42,22 @@ public class SimpleBloomFilter implements BloomFilter { * */ public SimpleBloomFilter(Shape shape) { + Objects.requireNonNull( shape, "shape"); this.shape = shape; this.bitSet = new BitSet(); } public SimpleBloomFilter(final Shape shape, Hasher hasher) { this( shape ); + Objects.requireNonNull( hasher, "hasher"); hasher.iterator(shape).forEachRemaining( (IntConsumer) i -> bitSet.set(i)); } @Override public boolean mergeInPlace(BloomFilter other) { + Objects.requireNonNull( other, "other"); if (other.isSparse()) { - Arrays.stream(other.getIndices()).forEach( s -> bitSet.set( s )); + other.forEachIndex( bitSet::set ); } else { bitSet.or( BitSet.valueOf(other.getBits() )); } @@ -81,17 +85,22 @@ public long[] getBits() { } @Override - public int[] getIndices() { - int[] result = new int[ bitSet.cardinality() ]; - int idx = 0; - for (int i=0;i= 0; i = bitSet.nextSetBit(i+1)) { + consumer.accept(i); + if (i == Integer.MAX_VALUE) { + break; // or (i+1) would overflow + } } - return result; } + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + for ( long l : getBits() ) { + 
consumer.accept(l); + } + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 76f0d5137f..34d2047fe4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -21,16 +21,17 @@ import java.util.Collections; import java.util.HashSet; import java.util.List; +import java.util.Objects; import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; import java.util.Set; import java.util.TreeSet; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; import javax.swing.event.ListSelectionEvent; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; /** @@ -51,22 +52,26 @@ public class SparseBloomFilter implements BloomFilter { * */ public SparseBloomFilter(Shape shape) { + Objects.requireNonNull( shape, "shape"); this.shape = shape; this.indices = new TreeSet(); } public SparseBloomFilter(final Shape shape, Hasher hasher) { this( shape ); + Objects.requireNonNull( hasher, "hasher"); hasher.iterator(shape).forEachRemaining( (IntConsumer) i -> indices.add( i )); } public SparseBloomFilter(Shape shape, List indices) { this(shape); + Objects.requireNonNull( indices, "indices"); this.indices.addAll( indices ); } @Override public boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); PrimitiveIterator.OfInt iter = hasher.iterator(shape); while (iter.hasNext()) { indices.add( iter.next() ); @@ -76,9 +81,8 @@ public boolean mergeInPlace(Hasher hasher) { @Override public boolean mergeInPlace(BloomFilter other) { - for (int i : other.getIndices()) { - indices.add(i); - } + Objects.requireNonNull( other, "other"); 
+ other.forEachIndex( indices::add ); return true; } @@ -98,26 +102,34 @@ public int cardinality() { } @Override - public long[] getBits() { - if (cardinality() == 0) { - return new long[0]; - } - long[] result = new long[ BitMap.numberOfBuckets( indices.last() )]; - for (Integer idx : indices) - { - result[ BitMap.getLongIndex( idx.intValue()) ] |= BitMap.getLongBit(idx.intValue()); + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + for (int value : indices ) { + consumer.accept( value ); } - return result; } @Override - public int[] getIndices() { - int[] result = new int[ indices.size() ]; - int i=0; - for (int value : indices ) { - result[i++]=value; + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + if (cardinality() == 0) { + return; + } + long bucket = 0; + long bucektIdx=0; + for (int i : indices ) { + int nextIndex = BitMap.getLongIndex( i ); + while (nextIndex > bucektIdx) + { + consumer.accept(bucket); + bucket =0; + bucektIdx++; + } + bucket |= BitMap.getLongBit( i ); + } + if (bucket != 0) { + consumer.accept( bucket ); } - return result; } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 03d1488e7e..dc2375d7fb 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -18,6 +18,8 @@ import java.util.PrimitiveIterator; +import org.apache.commons.collections4.bloomfilter.Shape; + /** * A Hasher represents items of arbitrary byte size as a byte representation of * fixed size (a hash). 
The hash representations can be used to create indexes diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index 42350114ef..18af931dc7 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -24,6 +24,8 @@ import java.util.PrimitiveIterator; import java.util.stream.Collectors; +import org.apache.commons.collections4.bloomfilter.Shape; + /** * The class that performs hashing on demand. * @since 4.5 diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java deleted file mode 100644 index bcac204c4f..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Shape.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import java.util.Objects; - -import org.apache.commons.collections4.bloomfilter.BloomFilter; - -/** - * The definition of a Bloom filter shape. - * - *

This class contains the values for the filter configuration and is used to - * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are - * compatible. (i.e. can be compared or merged)

- * - *

Interrelatedness of values

- * - *
Number of Items ({@code n})
- *
{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}
Probability of - * False Positives ({@code p})
{@code p = pow(1 - exp(-k / (m / n)), k)}
Number - * of Bits ({@code m})
- *
{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
Number of - * Functions ({@code k})
{@code k = round((m / n) * ln(2))}
- * - *

Comparisons

For purposes of equality checking and hashCode - * calculations a {@code Shape} is defined by the hashing function identity, the number of - * bits ({@code m}), and the number of functions ({@code k}).

- * - * @see Bloom Filter calculator - * @see Bloom filter - * [Wikipedia] - * @since 4.5 - */ -public final class Shape { - - /** - * Number of hash functions to create a filter ({@code k}). - */ - private final int numberOfHashFunctions; - - /** - * Number of bits in the filter ({@code m}). - */ - private final int numberOfBits; - - - - - /** - * Constructs a filter configuration with the specified number of items ({@code n}) and - * bits ({@code m}). - * - *

The optimal number of hash functions ({@code k}) is computed. - *

k = round((m / n) * ln(2))
- * - *

The false-positive probability is computed using the number of items, bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. - * @param numberOfBits The number of bits in the filter - * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} - */ - public Shape(final int numberOfHashFunctions, final int numberOfBits) { - this.numberOfBits = checkNumberOfBits(numberOfBits); - this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); - } - - /** - * Check number of items is strictly positive. - * - * @param numberOfItems the number of items - * @return the number of items - * @throws IllegalArgumentException if the number of items is {@code < 1} - */ - private static void checkNumberOfItems(final int numberOfItems) { - if (numberOfItems < 1) { - throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); - } - } - - /** - * Check number of bits is strictly positive. 
- * - * @param numberOfBits the number of bits - * @return the number of bits - * @throws IllegalArgumentException if the number of bits is {@code < 1} - */ - private static int checkNumberOfBits(final int numberOfBits) { - if (numberOfBits < 1) { - throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); - } - return numberOfBits; - } - - /** - * Check number of hash functions is strictly positive - * - * @param numberOfHashFunctions the number of hash functions - * @return the number of hash functions - * @throws IllegalArgumentException if the number of hash functions is {@code < 1} - */ - private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { - if (numberOfHashFunctions < 1) { - throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); - } - return numberOfHashFunctions; - } - - @Override - public boolean equals(final Object o) { - if (o instanceof Shape) { - final Shape other = (Shape) o; - return numberOfBits == other.numberOfBits && - numberOfHashFunctions == other.numberOfHashFunctions; - } - return false; - } - - @Override - public int hashCode() { - return Objects.hash(numberOfBits, numberOfHashFunctions); - } - - /** - * Gets the number of bits in the Bloom filter. - * This is also known as {@code m}. - * - * @return the number of bits in the Bloom filter ({@code m}). - */ - public int getNumberOfBits() { - return numberOfBits; - } - - - /** - * Gets the number of hash functions used to construct the filter. - * This is also known as {@code k}. - * - * @return the number of hash functions used to construct the filter ({@code k}). - */ - public int getNumberOfHashFunctions() { - return numberOfHashFunctions; - } - - - /** - * Calculates the probability of false positives ({@code p}) given - * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). - *

p = pow(1 - exp(-k / (m / n)), k)
- * - *

This is the probability that a Bloom filter will return true for the presence of an item - * when it does not contain the item. - * - *

The probability assumes that the Bloom filter is filled with the expected number of - * items. If the filter contains fewer items then the actual probability will be lower. - * Thus this returns the worst-case false positive probability for a filter that has not - * exceeded its expected number of items. - * - * @param numberOfItems the number of items hashed into the Bloom filter. - * @return the probability of false positives. - * @see #getNumberOfItems() - */ - public double getProbability(int numberOfItems) { - checkNumberOfItems( numberOfItems ); - return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), - numberOfHashFunctions); - } - - @Override - public String toString() { - return String.format("Shape[ m=%s k=%s ]", - numberOfBits, numberOfHashFunctions); - } - - public double estimate_n( int hammingValue ) { - double c = hammingValue; - double m = numberOfBits; - double k = numberOfHashFunctions; - return -(m / k) * Math.log(1.0 - (c / m)); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java index 2a9ea06842..4a6c99934b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -20,6 +20,8 @@ import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; +import org.apache.commons.collections4.bloomfilter.Shape; + /** * A Hasher implementation that contains the index for all enabled bits for a specific diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index b73675ed28..e5f96bd271 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -16,8 +16,7 @@ */ /** - * Provides classes and interfaces to define the shape of a Bloom filter and the conversion - * of generic bytes to a hash of bit indexes to be used with a Bloom filter. + * Hasher definition and examples for the Bloom filter implementation. * * @since 4.5 */ diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index d470129c78..7901e8f6e1 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -33,25 +33,28 @@ import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; /** * Test standard methods in the {@link BloomFilter} interface. 
*/ -public abstract class AbstractBloomFilterTest { - - private final SimpleHasher from1 = new SimpleHasher( 1, 1 ); - private final SimpleHasher from11 = new SimpleHasher( 11, 1 ); - private final HasherCollection bigHasher = new HasherCollection( from1, from11 ); - private final HasherCollection fullHasher = new HasherCollection( +public abstract class AbstractBloomFilterTest { + + protected final SimpleHasher from1 = new SimpleHasher( 1, 1 ); + protected final long from1Value = 0x3FFFEL; + protected final SimpleHasher from11 = new SimpleHasher( 11, 1 ); + protected final long from11Value = 0xFFFF800L; + protected final HasherCollection bigHasher = new HasherCollection( from1, from11 ); + protected final long bigHashValue = 0xFFFFFFEL; + protected final HasherCollection fullHasher = new HasherCollection( new SimpleHasher(0,1)/*0-16*/, new SimpleHasher(17,1)/*17-33*/, new SimpleHasher(33,1)/*33-49*/, new SimpleHasher(50,1)/*50-66*/, new SimpleHasher(67,1)/*67-83*/ ); + protected final long[] fullHashValue = { 0xFFFFFFFFFFFFFFFFL, 0xFFFFFL }; /** * The shape of the Bloom filters for testing @@ -64,7 +67,7 @@ public abstract class AbstractBloomFilterTest { * @param shape the shape of the filter. * @return a BloomFilter implementation. */ - protected abstract BloomFilter createEmptyFilter(Shape shape); + protected abstract T createEmptyFilter(Shape shape); /** * Create the BloomFilter implementation we are testing. @@ -73,7 +76,7 @@ public abstract class AbstractBloomFilterTest { * @param shape the shape of the filter. * @return a BloomFilter implementation. 
*/ - protected abstract BloomFilter createFilter(Shape shape, Hasher hasher); + protected abstract T createFilter(Shape shape, Hasher hasher); /** @@ -111,8 +114,8 @@ public void estimateIntersectionTest() { final BloomFilter bf = createFilter( shape, from1 ); final BloomFilter bf2 = createFilter( shape, bigHasher ); - assertEquals(1.0, bf.estimateIntersection(bf2), 0.5); - assertEquals(1.0, bf2.estimateIntersection(bf), 0.5); + assertEquals(1.0, bf.estimateIntersection(bf2)); + assertEquals(1.0, bf2.estimateIntersection(bf)); } @Test @@ -120,8 +123,8 @@ public void estimateIntersectionTest_empty() { final BloomFilter bf = createFilter( shape, from1 ); final BloomFilter bf2 = createEmptyFilter( shape); - assertEquals(0.0, bf.estimateIntersection(bf2), 0.00); - assertEquals(0.0, bf2.estimateIntersection(bf), 0.00); + assertEquals(0.0, bf.estimateIntersection(bf2)); + assertEquals(0.0, bf2.estimateIntersection(bf)); } /** @@ -135,18 +138,39 @@ public void estimateUnionTest() { final BloomFilter bf2 = createFilter( shape, from11 ); - assertEquals(2.0, bf.estimateUnion(bf2), 0.5); - assertEquals(2.0, bf2.estimateUnion(bf), 0.5); + assertEquals(2.0, bf.estimateUnion(bf2)); + assertEquals(2.0, bf2.estimateUnion(bf)); } @Test public void estimateUnionTest_empty() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); final BloomFilter bf = createFilter( shape, from1 ); final BloomFilter bf2 = createEmptyFilter( shape); - assertEquals(1.0, bf.estimateUnion(bf2), 0.15); - assertEquals(1.0, bf2.estimateUnion(bf), 0.15); + assertEquals(1.0, bf.estimateUnion(bf2)); + assertEquals(1.0, bf2.estimateUnion(bf)); + } + + + + /** + * Tests that the size estimate is correctly calculated. 
+ */ + @Test + public void estimateNTest() { + // build a filter + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + assertEquals(1, filter1.estimateN()); + + // the data provided above do not generate an estimate that is equivalent to the + // actual. + filter1.mergeInPlace( new SimpleHasher( 4, 1 )); + + assertEquals(1, filter1.estimateN() ); + + filter1.mergeInPlace( new SimpleHasher( 17, 1 )); + + assertEquals(3, filter1.estimateN() ); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java new file mode 100644 index 0000000000..cf64ec0833 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiPredicate; +import java.util.function.Function; +import java.util.function.IntConsumer; +import java.util.function.ToIntBiFunction; + +import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link ArrayCountingBloomFilter}. + */ +public abstract class AbstractCountingBloomFilterTest extends AbstractBloomFilterTest { + protected int[] from1Counts = { 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0 }; + protected int[] from11Counts = { 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0 }; + protected int[] bigHashCounts = { 0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,0 }; + + protected final BitCountProducer maximumValueProducer = new BitCountProducer() { + + + @Override + public void forEachCount(BitCountProducer.BitCountConsumer consumer) { + for (int i=1;i<18;i++) + { + consumer.accept( i, Integer.MAX_VALUE ); + } + } + }; + + +// /** +// * Function to convert int arrays to BloomFilters for testing. 
+// */ +// private final Function converter = counts -> { +// final BloomFilter testingFilter = new SimpleBloomFilter(shape); +// testingFilter.merge(new FixedIndexesTestHasher(shape, counts)); +// return testingFilter; +// }; + +// @Override +// protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { +// return new ArrayCountingBloomFilter(shape); +// } +// +// @Override +// protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final Shape shape) { +// final ArrayCountingBloomFilter result = new ArrayCountingBloomFilter(shape); +// result.merge( hasher ); +// return result; +// } + +// private ArrayCountingBloomFilter createFromCounts(final int[] counts) { +// // Use a dummy filter to add the counts to an empty filter +// final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) { +// @Override +// public void forEachCount(final BitCountConsumer action) { +// for (int i = 0; i < counts.length; i++) { +// action.accept(i, counts[i]); +// } +// } +// }; +// final ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(shape); +// bf.add(dummy); +// return bf; +// } + + /** + * Assert the counts match the expected values. Values are for indices starting + * at 0. Assert the cardinality equals the number of non-zero counts. + * + * @param bf the bloom filter + * @param expected the expected counts + */ + private static void assertCounts(final CountingBloomFilter bf, final int[] expected) { + final Map m = new HashMap<>(); + bf.forEachCount(m::put); + int zeros = 0; + for (int i = 0; i < expected.length; i++) { + if (m.get(i) == null) { + assertEquals(expected[i], 0, "Wrong value for " + i); + zeros++; + } else { + assertEquals(expected[i], m.get(i).intValue(), "Wrong value for " + i); + } + } + assertEquals(expected.length - zeros, bf.cardinality()); + } + + /** + * Tests that counts are correct when a hasher with duplicates is used in the + * constructor. 
+ */ + @Test + public void constructorTest_Hasher_Duplicates() { + // adding from11 on top of from1 produces duplicates for 11, 12, 13, 14, 15, 16, and 17 + final CountingBloomFilter bf = createFilter( shape, from1); + bf.add( BitCountProducer.Factory.from( shape , from11) ); + + final long[] lb = bf.getBits(); + assertEquals(1, lb.length); + assertEquals(bigHashValue, lb[0]); + + assertCounts(bf, bigHashCounts ); + } + + + + @Test + public void containsTest_Mixed() { + final BloomFilter bf = new SimpleBloomFilter( shape, from1 ); + final CountingBloomFilter bf2 = createFilter( shape, bigHasher ); + + assertTrue( "BF Should contain itself", bf.contains(bf)); + assertTrue( "BF2 Should contain itself", bf2.contains(bf2)); + assertFalse( "BF should not contain BF2",bf.contains(bf2)); + assertTrue( "BF2 should contain BF", bf2.contains(bf)); + } + + /** + * Tests that merging bloom filters works as expected with a generic BloomFilter. + */ + @Test + public final void mergeTest_Mixed() { + final BloomFilter bf1 = createFilter( shape, from1); + + final BloomFilter bf2 = new SimpleBloomFilter( shape, from11); + + final BloomFilter bf3 = bf1.merge(bf2); + assertTrue( "Should contain", bf3.contains( bf1 )); + assertTrue( "Should contain", bf3.contains( bf2 )); + + final BloomFilter bf4 = bf2.merge(bf1); + assertTrue( "Should contain", bf4.contains( bf1 )); + assertTrue( "Should contain", bf4.contains( bf2 )); + assertTrue( "Should contain", bf4.contains( bf3 )); + assertTrue( "Should contain", bf3.contains( bf4 )); + } + + /** + * Tests that add correctly updates the counts when a CountingBloomFilter is + * passed.
+ */ + @Test + public void addTest() { + final CountingBloomFilter bf1 = createFilter( shape, from1); + assertTrue( "Add should work", bf1.add(createFilter( shape, from11)) ); + assertTrue( "Should contain", bf1.contains( from1 )); + assertTrue( "Should contain", bf1.contains( from11 )); + assertCounts(bf1, bigHashCounts ); + + } + + @Test + public void addTest_overflow() { + + final CountingBloomFilter bf1 = createEmptyFilter( shape); + assertTrue( "Should add to empty", bf1.add( maximumValueProducer )); + assertTrue( "Should be valid", bf1.isValid() ); + + assertFalse( "Should not add", bf1.add( createFilter( shape, from1) )); + assertFalse( "Should not be valid", bf1.isValid() ); + } + + /** + * Tests that subtract correctly updates the counts when a CountingBloomFilter is + * passed. + */ + @Test + public void subtractTest() { + final CountingBloomFilter bf1 = createFilter( shape, from1); + bf1.add( BitCountProducer.Factory.from( shape , from11) ); + + final CountingBloomFilter bf2 = createFilter( shape, from11); + + assertTrue( "Subtract should work", bf1.subtract(bf2) ); + assertFalse( "Should not contain bitHasher", bf1.contains( bigHasher )); + assertTrue( "Should contain from1", bf1.contains( from1 )); + + assertCounts(bf1, from1Counts); + + } + + /** + * Tests that subtract fails and invalidates the filter when the CountingBloomFilter + * passed was never added to it. + */ + @Test + public void subtractTest_underflow() { + final CountingBloomFilter bf1 = createFilter( shape, from1); + + final CountingBloomFilter bf2 = createFilter( shape, from11); + + assertFalse( "Subtract should not work", bf1.subtract(bf2) ); + assertFalse( "isValid should return false", bf1.isValid()); + assertFalse( "Should not contain", bf1.contains( from1 )); + assertFalse( "Should not contain", bf1.contains( bf2 )); + + assertCounts(bf1, new int[] { 0,1,1,1,1,1,1,1,1,1,1,0}); + + } + + + /** + * Tests that remove correctly updates the counts when a BloomFilter is + * passed.
+ */ + @Test + public void removeTest() { + final CountingBloomFilter bf1 = createFilter( shape, from1); + bf1.add( BitCountProducer.Factory.from( shape , from11) ); + + assertTrue( "Remove should work", bf1.remove(new SimpleBloomFilter( shape, from11)) ); + assertFalse( "Should not contain", bf1.contains( from11 )); + assertTrue( "Should contain", bf1.contains( from1 )); + + assertCounts(bf1, from1Counts ); + + } + + /** + * Tests that remove fails and invalidates the filter when the BloomFilter + * passed was never merged into it. + */ + @Test + public void removeTest_underflow() { + final CountingBloomFilter bf1 = createFilter( shape, from1); + + final BloomFilter bf2 = new SimpleBloomFilter( shape, from11); + + assertFalse( "Subtract should not work", bf1.remove(bf2) ); + assertFalse( "isValid should return false", bf1.isValid()); + assertFalse( "Should not contain", bf1.contains( from1 )); + assertFalse( "Should not contain", bf1.contains( bf2 )); + + assertCounts(bf1, new int[] { 0,1,1,1,1,1,1,1,1,1,1}); + + } + + @Test + public void mergeTest_overflow() { + + final CountingBloomFilter bf1 = createEmptyFilter( shape); + assertTrue( "Should add to empty", bf1.add( maximumValueProducer )); + assertTrue( "Should be valid", bf1.isValid() ); + + CountingBloomFilter bf2 = bf1.merge(new SimpleBloomFilter( shape, from1)); + assertFalse( "Should not be valid", bf2.isValid() ); + } + + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index fb0ee55b6b..5b1a83e8b6 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -29,507 +29,23 @@ import java.util.function.ToIntBiFunction; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import
org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.junit.jupiter.api.Test; /** * Tests for the {@link ArrayCountingBloomFilter}. */ -public class ArrayCountingBloomFilterTest extends AbstractBloomFilterTest { - - /** - * Function to convert int arrays to BloomFilters for testing. - */ - private final Function converter = counts -> { - final BloomFilter testingFilter = new SimpleBloomFilter(shape); - testingFilter.merge(new FixedIndexesTestHasher(shape, counts)); - return testingFilter; - }; +public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTest { @Override - protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { - return new ArrayCountingBloomFilter(shape); + protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { + return new ArrayCountingBloomFilter( shape ); } @Override - protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final Shape shape) { - final ArrayCountingBloomFilter result = new ArrayCountingBloomFilter(shape); - result.merge( hasher ); - return result; - } - - private ArrayCountingBloomFilter createFromCounts(final int[] counts) { - // Use a dummy filter to add the counts to an empty filter - final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) { - @Override - public void forEachCount(final BitCountConsumer action) { - for (int i = 0; i < counts.length; i++) { - action.accept(i, counts[i]); - } - } - }; - final ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(shape); - bf.add(dummy); - return bf; - } - - /** - * Assert the counts match the expected values. Values are for indices starting - * at 0. Assert the cardinality equals the number of non-zero counts. 
- * - * @param bf the bloom filter - * @param expected the expected counts - */ - private static void assertCounts(final CountingBloomFilter bf, final int[] expected) { - final Map m = new HashMap<>(); - bf.forEachCount(m::put); - int zeros = 0; - for (int i = 0; i < expected.length; i++) { - if (m.get(i) == null) { - assertEquals(expected[i], 0, "Wrong value for " + i); - zeros++; - } else { - assertEquals(expected[i], m.get(i).intValue(), "Wrong value for " + i); - } - } - assertEquals(expected.length - zeros, bf.cardinality()); - } - - /** - * Tests that counts are correct when a hasher with duplicates is used in the - * constructor. - */ - @Test - public void constructorTest_Hasher_Duplicates() { - final int[] expected = {0, 1, 1, 0, 0, 1}; - // Some indexes with duplicates - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 2, 5); - - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final long[] lb = bf.getBits(); - assertEquals(1, lb.length); - assertEquals(0b100110L, lb[0]); - - assertCounts(bf, expected); - } - - /** - * Test the contains function with a standard Bloom filter. - * The contains function is tested using a counting Bloom filter in the parent test class. - */ - @Test - public void contains_BloomFilter() { - // Some indexes with duplicates - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 5); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - SimpleBloomFilter testingFilter = new SimpleBloomFilter(shape); - testingFilter.merge( new FixedIndexesTestHasher(shape, 3, 4)); - assertFalse(bf.contains(testingFilter)); - testingFilter = new SimpleBloomFilter(shape); - testingFilter.merge( new FixedIndexesTestHasher(shape, 2, 5)); - assertTrue(bf.contains(testingFilter)); - } - - /** - * Tests that merge correctly updates the counts when a CountingBloomFilter is - * passed. 
- */ - @Test - public void mergeTest_Counts_CountingBloomFilter() { - assertMerge(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), - BloomFilter::merge); - } - - /** - * Tests that merge correctly updates the counts when a BloomFilter is passed. - */ - @Test - public void mergeTest_Counts_BloomFilter() { - assertMerge(converter, BloomFilter::merge); - } - - /** - * Test that merge correctly updates the counts when a Hasher is passed. - */ - @Test - public void mergeTest_Counts_Hasher() { - assertMerge(counts -> new FixedIndexesTestHasher(shape, counts), - BloomFilter::merge); - } - - /** - * Test that merge correctly updates the counts when a Hasher is passed with duplicates. - */ - @Test - public void mergeTest_Counts_Hasher_Duplicates() { - assertMerge(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), - BloomFilter::merge); - } - - /** - * Tests that remove correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void removeTest_Counts_CountingBloomFilter() { - assertRemove(counts -> createFilter(new FixedIndexesTestHasher(shape, counts), shape), - CountingBloomFilter::remove); - } - - /** - * Tests that remove correctly updates the counts when a BloomFilter is passed. - */ - @Test - public void removeTest_Counts_BloomFilter() { - assertRemove(converter, CountingBloomFilter::remove); - } - - /** - * Test that remove correctly updates the counts when a Hasher is passed. - */ - @Test - public void removeTest_Counts_Hasher() { - assertRemove(counts -> new FixedIndexesTestHasher(shape, counts), - CountingBloomFilter::remove); - } - - /** - * Test that remove correctly updates the counts when a Hasher is passed with duplicates. - */ - @Test - public void removeTest_Counts_Hasher_Duplicates() { - assertRemove(counts -> new FixedIndexesTestHasher(shape, createDuplicates(counts)), - CountingBloomFilter::remove); - } - - /** - * Creates duplicates in the counts. 
- * - * @param counts the counts - * @return the new counts - */ - private static int[] createDuplicates(final int[] counts) { - // Duplicate some values randomly - final int length = counts.length; - final int[] countsWithDuplicates = Arrays.copyOf(counts, 2 * length); - for (int i = length; i < countsWithDuplicates.length; i++) { - // Copy a random value from the counts into the end position - countsWithDuplicates[i] = countsWithDuplicates[ThreadLocalRandom.current().nextInt(i)]; - } - return countsWithDuplicates; - } - - /** - * Assert a merge operation. The converter should construct a suitable object - * to remove the indices from the provided Bloom filter with the remove operation. - * - * @param the type of the filter - * @param converter the converter - * @param merge the merge operation - */ - private void assertMerge(final Function converter, - final BiPredicate merge) { - final int[] indexes1 = { 1, 2, 4, 5, 6}; - final int[] indexes2 = { 3, 4, 6}; - final int[] expected = {0, 1, 1, 1, 2, 1, 2}; - assertOperation(indexes1, indexes2, converter, merge, true, expected); - } - - /** - * Assert a remove operation. The converter should construct a suitable object - * to remove the indices from the provided Bloom filter with the remove operation. - * - * @param the type of the filter - * @param converter the converter - * @param remove the remove operation - */ - private void assertRemove(final Function converter, - final BiPredicate remove) { - final int[] indexes1 = { 1, 2, 4, 5, 6}; - final int[] indexes2 = { 2, 5, 6}; - final int[] expected = {0, 1, 0, 0, 1, 0, 0}; - assertOperation(indexes1, indexes2, converter, remove, true, expected); - } - - /** - * Assert a counting operation. The first set of indexes is used to create the - * CountingBloomFilter. The second set of indices is passed to the converter to - * construct a suitable object to combine with the counting Bloom filter. 
The counts - * of the first Bloom filter are checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param the type of the filter - * @param indexes1 the first set of indexes - * @param indexes2 the second set of indexes - * @param converter the converter - * @param operation the operation - * @param isValid the expected value for the operation result - * @param expected the expected counts after the operation - */ - private void assertOperation(final int[] indexes1, final int[] indexes2, - final Function converter, - final BiPredicate operation, - final boolean isValid, final int[] expected) { - final Hasher hasher = new FixedIndexesTestHasher(shape, indexes1); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - final F filter = converter.apply(indexes2); - final boolean result = operation.test(bf, filter); - assertEquals(isValid, result); - assertEquals(isValid, bf.isValid()); - assertCounts(bf, expected); + protected ArrayCountingBloomFilter createFilter(Shape shape, Hasher hasher) { + ArrayCountingBloomFilter filter = createEmptyFilter( shape ); + filter.add( BitCountProducer.Factory.from(shape, hasher)); + return filter; } - /** - * Tests that merge errors when the counts overflow the maximum integer value. - */ - @Test - public void mergeTest_Overflow() { - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - - final ArrayCountingBloomFilter bf2 = createFromCounts(new int[] {0, 0, Integer.MAX_VALUE}); - - // Small + 1 = OK - // should not fail as the counts are ignored - assertTrue(bf.merge(bf2)); - assertTrue(bf.isValid()); - assertCounts(bf, new int[] {0, 1, 2, 1}); - - // Big + 1 = Overflow - assertTrue(bf2.isValid()); - assertFalse(bf2.merge(bf)); - assertFalse(bf2.isValid(), "Merge should overflow and the filter is invalid"); - - // The counts are not clipped to max. They have simply overflowed. 
- // Note that this is a merge and the count is only incremented by 1 - // and not the actual count at each index. So it is not 2 + Integer.MAX_VALUE. - assertCounts(bf2, new int[] {0, 1, 1 + Integer.MAX_VALUE, 1}); - } - - /** - * Tests that removal errors when the counts become negative. - */ - @Test - public void removeTest_Negative() { - final Hasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final ArrayCountingBloomFilter bf = createFilter(hasher, shape); - - final Hasher hasher2 = new FixedIndexesTestHasher(shape, 2); - final ArrayCountingBloomFilter bf2 = createFilter(hasher2, shape); - - // More - Less = OK - bf.remove(bf2); - assertTrue(bf.isValid()); - assertCounts(bf, new int[] {0, 1, 0, 1}); - - // Less - More = Negative - assertTrue(bf2.isValid()); - bf2.remove(bf); - assertFalse(bf2.isValid(), "Remove should create negative counts and the filter is invalid"); - - // The counts are not clipped to zero. They have been left as negative. - assertCounts(bf2, new int[] {0, -1, 1, -1}); - } - - /** - * Tests that counts can be added to a new instance. - * - *

Note: This test ensures the CountingBloomFilter - * can be created with whatever counts are required for other tests. - */ - @Test - public void addTest_NewInstance() { - for (final int[] counts : new int[][] { - { /* empty */}, - {0, 0, 1}, - {0, 1, 2}, - {2, 3, 4}, - {66, 77, 0, 99}, - {Integer.MAX_VALUE, 42}, - }) { - assertCounts(createFromCounts(counts), counts); - } - } - - /** - * Test that add correctly ignores an empty CountingBloomFilter. - */ - @Test - public void addTest_Empty() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[0], - CountingBloomFilter::add, - true, - new int[] {5, 2, 1}); - } - - /** - * Test that add correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void addTest_Counts() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, 4, 1}, - CountingBloomFilter::add, - true, - new int[] {5, 8, 5, 1}); - } - - /** - * Test that add correctly updates the isValid state when a CountingBloomFilter is - * passed and an integer overflow occurs. - */ - @Test - public void addTest_Overflow() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, Integer.MAX_VALUE}, - CountingBloomFilter::add, - false, - new int[] {5, 8, 1 + Integer.MAX_VALUE}); - } - - /** - * Test that subtract correctly ignores an empty CountingBloomFilter. - */ - @Test - public void subtractTest_Empty() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[0], - CountingBloomFilter::subtract, - true, - new int[] {5, 2, 1}); - } - - /** - * Test that subtract correctly updates the counts when a CountingBloomFilter is - * passed. - */ - @Test - public void subtractTest_Counts() { - assertCountingOperation(new int[] {5, 9, 1, 1}, - new int[] {0, 2, 1}, - CountingBloomFilter::subtract, - true, - new int[] {5, 7, 0, 1}); - } - - /** - * Test that subtract correctly updates the isValid state when a CountingBloomFilter is - * passed and the counts become negative. 
- */ - @Test - public void subtractTest_Negative() { - assertCountingOperation(new int[] {5, 2, 1}, - new int[] {0, 6, 1}, - CountingBloomFilter::subtract, - false, - new int[] {5, -4, 0}); - } - - /** - * Assert a counting operation. Two CountingBloomFilters are created from the - * two sets of counts. The operation is applied and the counts of the first - * Bloom filter is checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param counts1 the first set counts - * @param counts2 the first set counts - * @param operation the operation - * @param isValid the expected value for the operation result - * @param expected the expected counts after the operation - */ - private void assertCountingOperation(final int[] counts1, final int[] counts2, - final BiPredicate operation, - final boolean isValid, final int[] expected) { - final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); - final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); - final boolean result = operation.test(bf1, bf2); - assertEquals(isValid, result); - assertEquals(isValid, bf1.isValid()); - assertCounts(bf1, expected); - } - - /** - * Tests that the andCardinality calculation executes correctly when using a - * CountingBloomFilter argument. - */ - @Test - public void andCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::andCardinality, - 2); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::andCardinality, - 2); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::andCardinality, - 0); - } - - /** - * Tests that the orCardinality calculation executes correctly when using a - * CountingBloomFilter argument. - */ - @Test - public void orCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::orCardinality, - 2); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::orCardinality, - 6); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::orCardinality, - 5); - } - - /** - * Tests that the xorCardinality calculation executes correctly when using a - * CountingBloomFilter argument. 
- */ - @Test - public void xorCardinalityTest_CountingBloomFilter() { - assertCardinalityOperation(new int[] {1, 1}, - new int[] {1, 1}, - BloomFilter::xorCardinality, - 0); - assertCardinalityOperation(new int[] {0, 1, 0, 1, 1, 1, 0, 1, 0}, - new int[] {1, 1, 0, 0, 0, 1}, - BloomFilter::xorCardinality, - 4); - assertCardinalityOperation(new int[] {1, 1}, - new int[] {0, 0, 1, 1, 1}, - BloomFilter::xorCardinality, - 5); - } - - /** - * Assert a cardinality operation. Two CountingBloomFilters are created from the - * two sets of counts. The operation is applied and the counts of the first - * Bloom filter is checked using the expected counts. - * - *

Counts are assumed to map to indexes starting from 0. - * - * @param counts1 the first set counts - * @param counts2 the first set counts - * @param operation the operation - * @param expected the expected cardinality - */ - private void assertCardinalityOperation(final int[] counts1, final int[] counts2, - final ToIntBiFunction operation, - final int expected) { - final ArrayCountingBloomFilter bf1 = createFromCounts(counts1); - final ArrayCountingBloomFilter bf2 = createFromCounts(counts2); - assertEquals(expected, operation.applyAsInt(bf1, bf2)); - assertEquals(expected, operation.applyAsInt(bf2, bf1)); - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java index 7f9e509f5f..5c70e5d214 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java @@ -5,7 +5,6 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.junit.Test; public class BitMaptTest { diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java deleted file mode 100644 index ffd2d0d8c5..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BloomFilterIndexerTest.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.ArrayList; -import java.util.Random; -import java.util.concurrent.ThreadLocalRandom; - -import static org.junit.jupiter.api.Assertions.assertThrows; - -/** - * Tests for the {@link BloomFilterIndexer}. - */ -public class BloomFilterIndexerTest { - - @Test - public void testCheckPositiveThrows() { - assertThrows(IndexOutOfBoundsException.class, () -> BloomFilterIndexer.checkPositive(-1)); - } - - @Test - public void testGetLongIndex() { - assertEquals(0, BloomFilterIndexer.getLongIndex(0)); - - for (final int index : getIndexes()) { - // getLongIndex is expected to identify a block of 64-bits (starting from zero) - assertEquals(index / Long.SIZE, BloomFilterIndexer.getLongIndex(index)); - - // Verify the behavior for negatives. It should produce a negative (invalid) - // as a simple trip for incorrect usage. 
- assertTrue(BloomFilterIndexer.getLongIndex(-index) < 0); - - // If index is not zero then when negated this is what a signed shift - // of 6-bits actually does - assertEquals(((1 - index) / Long.SIZE) - 1, - BloomFilterIndexer.getLongIndex(-index)); - } - } - - @Test - public void testGetLongBit() { - assertEquals(1L, BloomFilterIndexer.getLongBit(0)); - - for (final int index : getIndexes()) { - // getLongBit is expected to identify a single bit in a 64-bit block - assertEquals(1L << (index % Long.SIZE), BloomFilterIndexer.getLongBit(index)); - - // Verify the behavior for negatives - assertEquals(1L << (64 - (index & 0x3f)), BloomFilterIndexer.getLongBit(-index)); - } - } - - /** - * Gets non-zero positive indexes for testing. - * - * @return the indices - */ - private static int[] getIndexes() { - final Random rng = ThreadLocalRandom.current(); - final ArrayList indexes = new ArrayList<>(40); - for (int i = 0; i < 10; i++) { - // random positive numbers - indexes.add(rng.nextInt() >>> 1); - indexes.add(rng.nextInt(23647826)); - indexes.add(rng.nextInt(245)); - } - // Quickly remove zeros (as these cannot be negated) - indexes.removeIf(i -> i == 0); - // Add edge cases here - indexes.add(1); - indexes.add(2); - indexes.add(63); - indexes.add(64); - indexes.add(Integer.MAX_VALUE); - return indexes.stream().mapToInt(Integer::intValue).toArray(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java deleted file mode 100644 index b1d0525721..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterMethodsTest.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.BitSet; -import java.util.function.IntConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; - -/** - * Test all the default implementations of the BloomFilter in {@link AbstractBloomFilter}. - */ -public class DefaultBloomFilterMethodsTest extends AbstractBloomFilterTest { - - /** - * A testing class that implements only the abstract methods from BloomFilter. - * - */ - private static class BF extends AbstractBloomFilter { - - /** - * The bits for this BloomFilter. - */ - private final BitSet bitSet; - - /** - * Constructs a BitSetBloomFilter from a hasher and a shape. - * - * @param hasher the Hasher to use. - * @param shape the desired shape of the filter. - */ - BF(final Hasher hasher, final Shape shape) { - this(shape); - verifyHasher(hasher); - hasher.iterator(shape).forEachRemaining((IntConsumer) bitSet::set); - } - - /** - * Constructs an empty BitSetBloomFilter. - * - * @param shape the desired shape of the filter. 
- */ - BF(final Shape shape) { - super(shape); - this.bitSet = new BitSet(); - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); - } - - @Override - public SimpleHasher getHasher() { - return new SimpleHasher(bitSet.stream().iterator(), getShape()); - } - - @Override - public boolean merge(final BloomFilter other) { - verifyShape(other); - bitSet.or(BitSet.valueOf(other.getBits())); - return true; - } - - @Override - public boolean merge(final Hasher hasher) { - verifyHasher(hasher); - hasher.iterator(getShape()).forEachRemaining((IntConsumer) bitSet::set); - return true; - } - } - - @Override - protected AbstractBloomFilter createEmptyFilter(final Shape shape) { - return new BF(shape); - } - - @Override - protected AbstractBloomFilter createFilter(final Hasher hasher, final Shape shape) { - return new BF(hasher, shape); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java deleted file mode 100644 index ec4886294c..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/FixedIndexesTestHasher.java +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; - -import java.util.Arrays; -import java.util.PrimitiveIterator.OfInt; - -/** - * A Hasher implementation to return fixed indexes. Duplicates are allowed. - * The shape is ignored when generating the indexes. - * - *

This is not a real hasher and is used for testing only. - */ -class FixedIndexesTestHasher implements Hasher { - /** The shape. */ - private final Shape shape; - /** The indexes. */ - private final int[] indexes; - - /** - * Create an instance. - * - * @param shape the shape - * @param indexes the indexes - */ - FixedIndexesTestHasher(final Shape shape, final int... indexes) { - this.shape = shape; - this.indexes = indexes; - } - - @Override - public OfInt iterator(final Shape shape) { - if (!this.shape.equals(shape)) { - throw new IllegalArgumentException( - String.format("shape (%s) does not match internal shape (%s)", shape, this.shape)); - } - return Arrays.stream(indexes).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return shape.getHashFunctionIdentity(); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java deleted file mode 100644 index c6c6a03b2e..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexFilterTest.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentityImpl; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Set; -import java.util.function.IntConsumer; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -/** - * Tests for the {@link IndexFilters}. - */ -public class IndexFilterTest { - - /** - * The shape of the dummy Bloom filter. - * This is used as an argument to a Hasher that just returns fixed indexes - * so the parameters do not matter. 
- */ - private final Shape shape = new Shape(new HashFunctionIdentityImpl( - "Apache Commons Collections", "Dummy", Signedness.SIGNED, ProcessType.CYCLIC, 0L), - 50, 3000, 4); - - @Test - public void testApplyThrowsWithNullArguments() { - final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, 1, 2, 3); - final Shape shape = this.shape; - final ArrayList actual = new ArrayList<>(); - final IntConsumer consumer = actual::add; - - try { - IndexFilters.distinctIndexes(null, shape, consumer); - fail("null hasher"); - } catch (final NullPointerException expected) { - // Ignore - } - - try { - IndexFilters.distinctIndexes(hasher, null, consumer); - fail("null shape"); - } catch (final NullPointerException expected) { - // Ignore - } - - try { - IndexFilters.distinctIndexes(hasher, shape, null); - fail("null consumer"); - } catch (final NullPointerException expected) { - // Ignore - } - - // All OK together - IndexFilters.distinctIndexes(hasher, shape, consumer); - } - - @Test - public void testApply() { - assertFilter(1, 4, 6, 7, 9); - } - - @Test - public void testApplyWithDuplicates() { - assertFilter(1, 4, 4, 6, 7, 7, 7, 7, 7, 9); - } - - private void assertFilter(final int... indexes) { - final FixedIndexesTestHasher hasher = new FixedIndexesTestHasher(shape, indexes); - final Set expected = Arrays.stream(indexes).boxed().collect(Collectors.toSet()); - final ArrayList actual = new ArrayList<>(); - - IndexFilters.distinctIndexes(hasher, shape, actual::add); - - assertEquals(expected.size(), actual.size()); - // Check the array has all the values. - // We do not currently check the order of indexes from the - // hasher.iterator() function. 
- for (final Integer index : actual) { - assertTrue(expected.contains(index)); - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 967de9fcb6..8b5388832a 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -21,9 +21,8 @@ import java.util.List; import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; @@ -32,88 +31,41 @@ */ public class SetOperationsTest { - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final Shape shape = new Shape(testFunction, 3, 72, 17); - - @Test - public void testDifferentShapesThrows() { - final List lst = Arrays.asList(1, 2); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - final Shape shape2 = new Shape(testFunction, 3, 72, 18); - final List lst2 = Arrays.asList(2, 3); - final Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape2); - final BloomFilter filter2 = new 
HasherBloomFilter(hasher2, shape2); - - try { - SetOperations.cosineDistance(filter1, filter2); - fail("Expected an IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // Ignore - } - } + protected final SimpleHasher from1 = new SimpleHasher( 1, 1 ); + protected final long from1Value = 0x3FFFEL; + protected final SimpleHasher from11 = new SimpleHasher( 11, 1 ); + protected final long from11Value = 0xFFFF800L; + protected final HasherCollection bigHasher = new HasherCollection( from1, from11 ); + protected final long bigHashValue = 0xFFFFFFEL; + private final Shape shape = new Shape(17, 72); /** * Tests that the Cosine similarity is correctly calculated. */ @Test public final void cosineDistanceTest() { - List lst = Arrays.asList(1, 2); - Hasher hasher = new SimpleHasher(lst.iterator(), shape); - BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(2, 3); - Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); - assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher = new SimpleHasher(lst.iterator(), shape); - filter1 = new HasherBloomFilter(hasher, shape); + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); - lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - hasher2 = new SimpleHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(0.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new SimpleHasher(lst2.iterator(), 
shape); - filter2 = new HasherBloomFilter(hasher2, shape); + Shape shape2 = new Shape( 2, 72 ); + filter1 = new SimpleBloomFilter(shape2, from1); + filter2 = new SimpleBloomFilter(shape2, new SimpleHasher( 2, 1 )); - assertEquals(0.514928749927334, SetOperations.cosineDistance(filter1, filter2), 0.000000000000001); - assertEquals(0.514928749927334, SetOperations.cosineDistance(filter2, filter1), 0.000000000000001); + assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); + assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); + + + filter1 = new SimpleBloomFilter(shape, from1); + filter2 = new SimpleBloomFilter(shape, from11); + + assertEquals(0.58823529, SetOperations.cosineDistance(filter1, filter2), 0.00000001); + assertEquals(0.58823529, SetOperations.cosineDistance(filter2, filter1), 0.00000001); } /** @@ -122,17 +74,15 @@ public final void cosineDistanceTest() { */ @Test public final void cosineDistanceTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape); + BloomFilter filter3 = new SimpleBloomFilter(shape); + assertEquals(1.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter1, filter3), 0.0001); - assertEquals(1.0, SetOperations.cosineDistance(filter3, filter1), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter2, filter3), 0.0001); + assertEquals(1.0, SetOperations.cosineDistance(filter3, filter2), 0.0001); } /** @@ -140,23 +90,17 
@@ public final void cosineDistanceTest_NoValues() { */ @Test public final void cosineSimilarityTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); + BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); assertEquals(1.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new SimpleHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter1, filter2), 0.000000000000001); - assertEquals(0.485071250072666, SetOperations.cosineSimilarity(filter2, filter1), 0.000000000000001); + assertEquals(0.41176470, SetOperations.cosineSimilarity(filter1, filter2), 0.000000001); + assertEquals(0.41176470, SetOperations.cosineSimilarity(filter2, filter1), 0.000000001); } /** @@ -165,12 +109,10 @@ public final void cosineSimilarityTest() { */ @Test public final void cosineSimilarityTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape); + final BloomFilter filter2 = new SimpleBloomFilter(shape); // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); 
- final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + final BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); @@ -178,92 +120,23 @@ public final void cosineSimilarityTest_NoValues() { assertEquals(0.0, SetOperations.cosineSimilarity(filter3, filter1), 0.0001); } - /** - * Tests that the intersection size estimate is correctly calculated. - */ - @Test - public final void estimateIntersectionSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - lst = Arrays.asList(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, 39, 40); - final Hasher hasher2 = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - final long estimate = SetOperations.estimateIntersectionSize(filter1, filter2); - assertEquals(1, estimate); - } - - /** - * Tests that the size estimate is correctly calculated. - */ - @Test - public final void estimateSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher = new SimpleHasher(lst.iterator(), shape); - BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - assertEquals(1, SetOperations.estimateSize(filter1)); - - // the data provided above do not generate an estimate that is equivalent to the - // actual. 
- lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20); - hasher = new SimpleHasher(lst.iterator(), shape); - filter1 = new HasherBloomFilter(hasher, shape); - assertEquals(1, SetOperations.estimateSize(filter1)); - - lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 33); - final Hasher hasher2 = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - assertEquals(3, SetOperations.estimateSize(filter2)); - } - - /** - * Tests that the union size estimate is correctly calculated. - */ - @Test - public final void estimateUnionSizeTest() { - // build a filter - List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - lst = Arrays.asList(17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, - 40); - final Hasher hasher2 = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); - - final long estimate = SetOperations.estimateUnionSize(filter1, filter2); - assertEquals(3, estimate); - } /** * Tests that the Hamming distance is correctly calculated. 
*/ @Test public final void hammingDistanceTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(0, SetOperations.hammingDistance(filter1, filter2)); assertEquals(0, SetOperations.hammingDistance(filter2, filter1)); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new SimpleHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter( shape, from11); - assertEquals(17, SetOperations.hammingDistance(filter1, filter2)); - assertEquals(17, SetOperations.hammingDistance(filter2, filter1)); + assertEquals(20, SetOperations.hammingDistance(filter1, filter2)); + assertEquals(20, SetOperations.hammingDistance(filter2, filter1)); } /** @@ -271,23 +144,17 @@ public final void hammingDistanceTest() { */ @Test public final void jaccardDistanceTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new SimpleHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(1.0, 
SetOperations.jaccardDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new SimpleHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); - assertEquals(0.32, SetOperations.jaccardDistance(filter1, filter2), 0.001); - assertEquals(0.32, SetOperations.jaccardDistance(filter2, filter1), 0.001); + filter2 = new SimpleBloomFilter(shape, from11); + + assertEquals(0.26, SetOperations.jaccardDistance(filter1, filter2), 0.001); + assertEquals(0.26, SetOperations.jaccardDistance(filter2, filter1), 0.001); } /** @@ -296,12 +163,9 @@ public final void jaccardDistanceTest() { */ @Test public final void jaccardDistanceTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape); + final BloomFilter filter2 = new SimpleBloomFilter(shape); + final BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); @@ -314,23 +178,16 @@ public final void jaccardDistanceTest_NoValues() { */ @Test public final void jaccardSimilarityTest() { - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter1 = new HasherBloomFilter(hasher, shape); - - List lst2 = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - Hasher hasher2 = new 
SimpleHasher(lst2.iterator(), shape); - BloomFilter filter2 = new HasherBloomFilter(hasher2, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); + BloomFilter filter2 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); - lst2 = Arrays.asList(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25); - hasher2 = new SimpleHasher(lst2.iterator(), shape); - filter2 = new HasherBloomFilter(hasher2, shape); + filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(0.68, SetOperations.jaccardSimilarity(filter1, filter2), 0.001); - assertEquals(0.68, SetOperations.jaccardSimilarity(filter2, filter1), 0.001); + assertEquals(0.74, SetOperations.jaccardSimilarity(filter1, filter2), 0.001); + assertEquals(0.74, SetOperations.jaccardSimilarity(filter2, filter1), 0.001); } /** @@ -339,12 +196,9 @@ public final void jaccardSimilarityTest() { */ @Test public final void jaccardSimilarityTest_NoValues() { - final BloomFilter filter1 = new HasherBloomFilter(shape); - final BloomFilter filter2 = new HasherBloomFilter(shape); - // build a filter - final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); - final Hasher hasher = new SimpleHasher(lst.iterator(), shape); - final BloomFilter filter3 = new HasherBloomFilter(hasher, shape); + final BloomFilter filter1 = new SimpleBloomFilter(shape); + final BloomFilter filter2 = new SimpleBloomFilter(shape); + final BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertEquals(0.0, SetOperations.jaccardSimilarity(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.jaccardSimilarity(filter2, filter1), 0.0001); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java new file mode 100644 index 
0000000000..ecc18e5db5 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.fail; + + +import java.util.ArrayList; + +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. + */ +public class ShapeFactoryTest { + + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + private final Shape shape = new Shape(3, 24 ); + + /** + * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. 
+ */ + @Test + public void badNumberOfItemsTest() { + try { + Shape.Factory.fromNM(0, 24); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK(0, 24, 5); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(0, 0.02 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of bits is less than 1 an exception is thrown + */ + @Test + public void badNumberOfBitsTest() { + try { + Shape.Factory.fromNM( 5, 0 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK( 5, 0, 7 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK( 0.035, 0, 7 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. 
+ */ + @Test + public void badNumberOfHashFunctionsTest() { + try { + Shape.Factory.fromNMK(5, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK(0.35, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown + */ + @Test + public void badProbabilityTest() { + try { + Shape.Factory.fromNMK( 4000, 8, 1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(10, 0.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP( 10, 1.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP( 10, Double.NaN); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + } + + + /** + * Tests that when the number of items, number of bits and number of hash functions is passed the values are + * calculated correctly. + */ + @Test + public void fromNMK_test() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + */ + final Shape filterConfig = Shape.Factory.fromNMK( 5, 24, 4); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(4, filterConfig.getNumberOfHashFunctions()); + assertEquals(0.102194782, filterConfig.getProbability(5 ), 0.000001); + } + + /** + * Tests that when the number of items and number of bits are passed the other values are calculated correctly.
+ */ + @Test + public void fromNM_Test() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24 + */ + final Shape filterConfig = Shape.Factory.fromNM(5, 24); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(3, filterConfig.getNumberOfHashFunctions()); + assertEquals(0.100375138, filterConfig.getProbability(5), 0.000001); + } + + + + + /** + * Tests that if the calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. + */ + @Test + public void numberOfBitsOverflowTest() { + try { + Shape.Factory.fromNP(Integer.MAX_VALUE, 0.1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + } + + /** + * Tests that the probability is calculated correctly. + */ + @Test + public void probabilityTest() { + Shape shape = Shape.Factory.fromNMK(5, 24, 3 ); + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + } + + + + + /** + * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash + * functions. + */ + @Test + public void fromPMK_test() { + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 + */ + final Shape shape = Shape.Factory.fromPMK( 0.1, 24, 3); + + assertEquals(24, shape.getNumberOfBits()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(0.100375138, shape.getProbability(5), 0.000001); + } + + + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java new file mode 100644 index 0000000000..d9d4b82334 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -0,0 +1,456 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.fail; + + +import java.util.ArrayList; + +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. + */ +public class ShapeTest { + + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + private final Shape shape = new Shape(3, 24 ); + + /** + * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void constructor_items_bits_BadNumberOfBitsTest() { + try { + new Shape( 5, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + +// /** +// * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. 
+// */ +// @Test +// public void constructor_items_bits_BadNumberOfHashFunctionsTest() { +// try { +// new Shape( 16, 8); +// +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } + +// /** +// * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. +// */ +// @Test +// public void constructor_items_bits_BadNumberOfItemsTest() { +// try { +// new Shape(testFunction, 0, 24); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } + +// /** +// * Tests that if the number of bits is less than 1 an exception is thrown +// */ +// @Test +// public void constructor_items_bits_hash_BadNumberOfBitsTest() { +// try { +// new Shape(testFunction, 5, 0, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. + */ + @Test + public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { + try { + new Shape(0, 5); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + +// /** +// * Tests that if the number of items is less than 1 an exception is thrown. 
+// */ +// @Test +// public void constructor_items_bits_hash_BadNumberOfItemsTest() { +// try { +// new Shape(testFunction, 0, 24, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } + +// /** +// * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown +// */ +// @Test +// public void constructor_items_bits_hash_BadProbabilityTest() { +// try { +// new Shape(testFunction, 4000, 8, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } + +// /** +// * Tests that when the number of items, number of bits and number of hash functions is passed the values are +// * calculated correctly. +// */ +// @Test +// public void constructor_items_bits_hashTest() { +// /* +// * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 +// */ +// final Shape filterConfig = new Shape(testFunction, 5, 24, 4); +// +// assertEquals(24, filterConfig.getNumberOfBits()); +// assertEquals(4, filterConfig.getNumberOfHashFunctions()); +// assertEquals(5, filterConfig.getNumberOfItems()); +// assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); +// } + +// /** +// * Tests that the number of items and number of bits is passed the other values are calculated correctly. +// */ +// @Test +// public void constructor_items_bitsTest() { +// /* +// * values from https://hur.st/bloomfilter/?n=5&m=24 +// */ +// final Shape filterConfig = new Shape(testFunction, 5, 24); +// +// assertEquals(24, filterConfig.getNumberOfBits()); +// assertEquals(3, filterConfig.getNumberOfHashFunctions()); +// assertEquals(5, filterConfig.getNumberOfItems()); +// assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); +// } +// +// /** +// * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. 
+// */ +// @Test +// public void constructor_items_probability_BadNumberOfItemsTest() { +// try { +// new Shape(testFunction, 0, 1.0 / 10); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // do nothing. +// } +// } +// +// /** +// * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. +// */ +// @Test +// public void constructor_items_probability_BadProbabilityTest() { +// try { +// new Shape(testFunction, 10, 0.0); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // do nothing. +// } +// try { +// new Shape(testFunction, 10, 1.0); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // do nothing. +// } +// try { +// new Shape(testFunction, 10, Double.NaN); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // do nothing. +// } +// } +// +// /** +// * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. +// */ +// @Test +// public void constructor_items_probability_NumberOfBitsOverflowTest() { +// try { +// new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // do nothing. +// } +// } +// +// /** +// * Tests the the probability is calculated correctly. +// */ +// @Test +// public void constructor_items_probability_Test() { +// +// assertEquals(24, shape.getNumberOfBits()); +// assertEquals(3, shape.getNumberOfHashFunctions()); +// assertEquals(5, shape.getNumberOfItems()); +// assertEquals(0.100375138, shape.getProbability(), 0.000001); +// } +// +// /** +// * Tests that the constructor with a null name, number of items and size of filter fails. 
+// */ +// @Test +// public void constructor_nm_noName() { +// try { +// new Shape(null, 5, 72); +// fail("Should throw NullPointerException"); +// } catch (final NullPointerException expected) { +// // do nothing +// } +// } +// +// /** +// * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. +// */ +// @Test +// public void constructor_nmk_noName() { +// try { +// new Shape(null, 5, 72, 17); +// fail("Should throw NullPointerException"); +// } catch (final NullPointerException expected) { +// // do nothing +// } +// } +// +// /** +// * Tests that the constructor with a null name, number of items, and probability fails. +// */ +// @Test +// public void constructor_np_noName() { +// try { +// new Shape(null, 5, 0.1); +// fail("Should throw NullPointerException"); +// } catch (final NullPointerException expected) { +// // do nothing +// } +// } +// +// /** +// * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. 
+// */ +// @Test +// public void constructor_pmk_noName() { +// try { +// new Shape(null, 0.1, 72, 17); +// fail("Should throw NullPointerException"); +// } catch (final NullPointerException expected) { +// // do nothing +// } +// } +// +// /** +// * Tests that if the number of bits is less than 1 an exception is thrown +// */ +// @Test +// public void constructor_probability_bits_hash_BadNumberOfBitsTest() { +// try { +// new Shape(testFunction, 0.5, 0, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } +// +// /** +// * Tests that if the number of functions is less than 1 an exception is thrown +// */ +// @Test +// public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { +// try { +// new Shape(testFunction, 0.5, 24, 0); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } +// +// /** +// * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
+// */ +// @Test +// public void constructor_probability_bits_hash_BadProbabilityTest() { +// // probability should not be 0 +// try { +// new Shape(testFunction, 0.0, 24, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// +// // probability should not be = -1 +// try { +// new Shape(testFunction, -1.0, 24, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// +// // probability should not be < -1 +// try { +// new Shape(testFunction, -1.5, 24, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// +// // probability should not be = 1 +// try { +// new Shape(testFunction, 1.0, 24, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// +// // probability should not be > 1 +// try { +// new Shape(testFunction, 2.0, 24, 1); +// fail("Should have thrown IllegalArgumentException"); +// } catch (final IllegalArgumentException expected) { +// // expected +// } +// } +// +// /** +// * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash +// * functions. +// */ +// @Test +// public void constructor_probability_bits_hashTest() { +// /* +// * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 +// */ +// final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); +// +// assertEquals(24, filterConfig.getNumberOfBits()); +// assertEquals(3, filterConfig.getNumberOfHashFunctions()); +// assertEquals(5, filterConfig.getNumberOfItems()); +// assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); +// } +// + /** + * Test equality of shape. 
+ */ + @Test + public void equalsTest() { + + assertEquals(shape, shape); + assertEquals( 3, shape.getNumberOfHashFunctions()); + assertEquals( 24, shape.getNumberOfBits() ); + assertEquals(shape.hashCode(), new Shape(3,24 ).hashCode()); + assertNotEquals(shape, null); + assertNotEquals(shape, new Shape(3,25)); + assertNotEquals(shape, new Shape(4,24)); + } + + @Test + public void estimateNTest() { + double[] expected = {0.0, 0.3404769153503671, + 0.6960910159170385, + 1.068251140996181, + 1.4585724543516367, + 1.8689188094520417, + 2.301456579614247, + 2.758723890333837, + 3.243720864865314, + 3.7600290339658846, + 4.311972005861497, + 4.90483578309127, + 5.545177444479562, + 6.2412684603966, + 7.003749898831201, + 7.8466340240938095, + 8.788898309344876, + 9.85714945034106, + 11.090354888959125, + 12.54892734331076, + 14.334075753824441, + 16.635532333438686, + 19.879253198304, + 25.424430642783573}; + for (int i=0;i<24;i++) + { + assertEquals( expected[i], shape.estimateN(i), 0.00000000000000001); + } + } + + @Test + public void getProbabilityTest() { + double[] expected = { 0.0, + 0.0016223626694561954, + 0.010823077182670957, + 0.030579354491777785, + 0.06091618422799686, + 0.1003751381786711, + 0.14689159766038104, + 0.19829601428155866, + 0.25258045782764715, + 0.3080221532988778, + 0.3632228594351169, + 0.4171013016177174, + 0.4688617281200601, + 0.5179525036637239, + 0.5640228015164387, + 0.6068817738972262, + 0.6464623147796981, + 0.6827901771310362, + 0.7159584363083427, + 0.7461068849672469, + 0.7734057607554121, + 0.7980431551369204, + 0.8202154721379679, + 0.8401203636727712}; + for (int i=0;i<24;i++) { + assertEquals( expected[i], shape.getProbability(i), 0.00000000000000001); + } + } + + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java index ea87e8c599..b06bf60183 100644 --- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -17,7 +17,6 @@ package org.apache.commons.collections4.bloomfilter; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** * Tests for the {@link SimpleBloomFilter}. diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java index dca19dda6a..a65f23834d 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -17,7 +17,6 @@ package org.apache.commons.collections4.bloomfilter; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; /** * Tests for the {@link SimpleBloomFilter}. diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java deleted file mode 100644 index 58a0148a91..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.nio.charset.StandardCharsets; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator.OfInt; - -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** - * {@link HasherCollection.Builder} tests. - */ -public class DynamicHasherBuilderTest { - - private HasherCollection.Builder builder; - private final HashFunction hf = new MD5Cyclic(); - private final Shape shape = new Shape(hf, 1, 345, 1); - private final String testString = HasherBuilderTest.getExtendedString(); - - /** - * Tests that hashing a byte array works as expected. - */ - @Test - public void buildTest_byteArray() { - final byte[] bytes = testString.getBytes(); - final HasherCollection hasher = builder.with(bytes).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that an empty hasher works as expected. 
- */ - @Test - public void buildTest_Empty() { - final HasherCollection hasher = builder.build(); - - final OfInt iter = hasher.iterator(shape); - - assertFalse(iter.hasNext()); - try { - iter.nextInt(); - fail("Should have thrown NoSuchElementException"); - } catch (final NoSuchElementException ignore) { - // do nothing - } - } - - /** - * Tests that hashing a string works as expected. - */ - @Test - public void buildTest_String() { - final byte[] bytes = testString.getBytes(StandardCharsets.UTF_8); - final HasherCollection hasher = builder.with(testString, StandardCharsets.UTF_8).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that hashing a string works as expected. - */ - @Test - public void buildTest_UnencodedString() { - final byte[] bytes = testString.getBytes(StandardCharsets.UTF_16LE); - final HasherCollection hasher = builder.withUnencoded(testString).build(); - final int expected = (int) Math.floorMod((long) hf.apply(bytes, 0), (long) shape.getNumberOfBits()); - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } - - /** - * Tests that build resets the builder. - */ - @Test - public void buildResetTest() { - builder.with(new byte[] {123}); - final OfInt iter = builder.build().iterator(shape); - - assertTrue(iter.hasNext()); - iter.next(); - assertFalse(iter.hasNext()); - - // Nothing added since last build so it should be an empty hasher - final OfInt iter2 = builder.build().iterator(shape); - assertFalse(iter2.hasNext()); - } - - /** - * Sets up the builder for testing. 
- */ - @BeforeEach - public void setup() { - builder = new HasherCollection.Builder(hf); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java deleted file mode 100644 index b33c2414a5..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.nio.charset.StandardCharsets; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator.OfInt; - -import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link HasherCollection}. 
- */ -public class DynamicHasherTest { - private HasherCollection.Builder builder; - private Shape shape; - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - /** - * Sets up the DynamicHasher. - */ - @BeforeEach - public void setup() { - builder = new HasherCollection.Builder(new MD5Cyclic()); - shape = new Shape(new MD5Cyclic(), 3, 72, 17); - } - - /** - * Tests that the expected bits are returned from hashing. - */ - @Test - public void testGetBits() { - - final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62}; - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); - - final OfInt iter = hasher.iterator(shape); - - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that bits from multiple hashes are returned correctly. 
- */ - @Test - public void testGetBits_MultipleHashes() { - final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, - 59, 49, 39, 13, 3, 65, 55, 45, 35, 25}; - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).with("World", StandardCharsets.UTF_8).build(); - - final OfInt iter = hasher.iterator(shape); - - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - try { - iter.next(); - fail("Should have thrown NoSuchElementException"); - } catch (final NoSuchElementException ignore) { - // do nothing - } - } - - /** - * Tests that retrieving bits for the wrong shape throws an exception. - */ - @Test - public void testGetBits_WrongShape() { - - final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); - - try { - hasher.iterator(new Shape(testFunction, 3, 72, 17)); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java deleted file mode 100644 index 479cfa5188..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionIdentityImplTest.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.junit.jupiter.api.Test; - -/** - * Tests the HashFunctionIdentity implementation ({@link HashFunctionIdentityImpl}).. - */ -public class HashFunctionIdentityImplTest { - - /** - * Tests a copy constructor of the HashFunctionIdentity. - */ - @Test - public void copyConstructorTest() { - final HashFunctionIdentity identity = new HashFunctionIdentity() { - - @Override - public String getName() { - return "NAME"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Provider"; - } - - @Override - public long getSignature() { - return -1L; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - - }; - final HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl(identity); - assertEquals("NAME", impl.getName()); - assertEquals("Provider", impl.getProvider()); - assertEquals(Signedness.SIGNED, impl.getSignedness()); - assertEquals(ProcessType.CYCLIC, impl.getProcessType()); - assertEquals(-1L, impl.getSignature()); - } - - /** - * Test the constructor from component values. 
- */ - @Test - public void valuesConstructorTest() { - final HashFunctionIdentityImpl impl = new HashFunctionIdentityImpl("Provider", "NAME", Signedness.UNSIGNED, - ProcessType.ITERATIVE, -2L); - assertEquals("NAME", impl.getName()); - assertEquals("Provider", impl.getProvider()); - assertEquals(Signedness.UNSIGNED, impl.getSignedness()); - assertEquals(ProcessType.ITERATIVE, impl.getProcessType()); - assertEquals(-2L, impl.getSignature()); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java deleted file mode 100644 index e68df55b26..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HashFunctionValidatorTest.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; -import org.junit.jupiter.api.Test; - -/** - * Tests of the {@link HashFunctionValidator}. - */ -public class HashFunctionValidatorTest { - - /** - * Tests that name is used in the equality check. - */ - @Test - public void testName() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl2", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that name is not affected by case. - */ - @Test - public void testNameIsCaseInsensitive() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "IMPL1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl2)); - } - - /** - * Tests that process type is used in the equality check. 
- */ - @Test - public void testProcessType() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.ITERATIVE, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that provider is not used in the equality check. - */ - @Test - public void testProviderIsNotUsedInEqualityCheck() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite2", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertTrue(HashFunctionValidator.areEqual(impl1, impl2)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Tests that signedness is used in the equality check. - */ - @Test - public void testSignedness() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, - ProcessType.CYCLIC, 300L); - - assertTrue(HashFunctionValidator.areEqual(impl1, impl1)); - assertTrue(HashFunctionValidator.areEqual(impl2, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl1, impl2)); - assertFalse(HashFunctionValidator.areEqual(impl2, impl1)); - } - - /** - * Test the check method throws when the two hash functions are not equal. 
- */ - @Test - public void testCheckThrows() { - final HashFunctionIdentityImpl impl1 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.SIGNED, - ProcessType.CYCLIC, 300L); - final HashFunctionIdentityImpl impl2 = new HashFunctionIdentityImpl("Testing Suite", "impl1", Signedness.UNSIGNED, - ProcessType.CYCLIC, 300L); - assertThrows(IllegalArgumentException.class, () -> HashFunctionValidator.checkAreEqual(impl1, impl2)); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java deleted file mode 100644 index 303034053a..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder; -import org.apache.commons.lang3.NotImplementedException; -import org.junit.jupiter.api.Test; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.CharBuffer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; - -/** - * Tests the - * {@link org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder Hasher.Builder}. - */ -public class HasherBuilderTest { - - /** - * Simple class to collect byte[] items added to the builder. - */ - private static class TestBuilder implements Hasher.Builder { - ArrayList items = new ArrayList<>(); - - @Override - public Hasher build() { - throw new NotImplementedException("Not required"); - } - - @Override - public Builder with(final byte[] item) { - items.add(item); - return this; - } - } - - /** - * Tests that adding CharSequence items works correctly. - */ - @Test - public void withCharSequenceTest() { - final String ascii = "plain"; - final String extended = getExtendedString(); - for (final String s : new String[] {ascii, extended}) { - for (final Charset cs : new Charset[] { - StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8, StandardCharsets.UTF_16 - }) { - final TestBuilder builder = new TestBuilder(); - builder.with(s, cs); - assertArrayEquals(s.getBytes(cs), builder.items.get(0)); - } - } - } - - /** - * Tests that adding unencoded CharSequence items works correctly. 
- */ - @Test - public void withUnencodedCharSequenceTest() { - final String ascii = "plain"; - final String extended = getExtendedString(); - for (final String s : new String[] {ascii, extended}) { - final TestBuilder builder = new TestBuilder(); - builder.withUnencoded(s); - final byte[] encoded = builder.items.get(0); - final char[] original = s.toCharArray(); - // Should be twice the length - assertEquals(original.length * 2, encoded.length); - // Should be little endian (lower bits first) - final CharBuffer buffer = ByteBuffer.wrap(encoded) - .order(ByteOrder.LITTLE_ENDIAN).asCharBuffer(); - for (int i = 0; i < original.length; i++) { - assertEquals(original[i], buffer.get(i)); - } - } - } - - /** - * Gets a string with non-standard characters. - * - * @return the extended string - */ - static String getExtendedString() { - final char[] data = {'e', 'x', 't', 'e', 'n', 'd', 'e', 'd', ' ', - // Add some characters that are non standard - // non-ascii - 0xCA98, - // UTF-16 surrogate pair - 0xD803, 0xDE6D - // Add other cases here ... - }; - return String.valueOf(data); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java new file mode 100644 index 0000000000..acd225d9d4 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.PrimitiveIterator.OfInt; + +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link SimpleHasher}. + */ +public class HasherCollectionTest { + + private SimpleHasher hasher1 = new SimpleHasher( 1,1 ); + private SimpleHasher hasher2 = new SimpleHasher( 2, 2 ); + private HasherCollection hasher = new HasherCollection( hasher1, hasher2 ); + + @Test + public void sizeTest() { + assertEquals( 2, hasher.size() ); + HasherCollection hasher3 = new HasherCollection( hasher, new SimpleHasher( 3, 3 )); + assertEquals( 3, hasher3.size() ); + + } + + @Test + public void testIterator() { + Shape shape = new Shape( 5, 10 ); + int[] expected = { 1,2,3,4,5,2,4,6,8,0 }; + OfInt iter = hasher.iterator(shape); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], iter.next() ); + } + assertFalse( iter.hasNext()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java deleted file mode 100644 index 90f3808d8e..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/ShapeTest.java +++ /dev/null @@ -1,500 +0,0 @@ 
-/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.fail; - -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.ProcessType; -import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity.Signedness; - -import java.util.ArrayList; - -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link Shape} class. 
- */ -public class ShapeTest { - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - /* - * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= - * - * n = 5 - * - * p = 0.100375138 (1 in 10) - * - * m = 24 (3B) - * - * k = 3 - */ - - private final Shape shape = new Shape(testFunction, 5, 0.1); - - /** - * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_bits_BadNumberOfBitsTest() { - try { - new Shape(testFunction, 5, 0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_bits_BadNumberOfHashFunctionsTest() { - try { - new Shape(testFunction, 16, 8); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. 
- */ - @Test - public void constructor_items_bits_BadNumberOfItemsTest() { - try { - new Shape(testFunction, 0, 24); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of bits is less than 1 an exception is thrown - */ - @Test - public void constructor_items_bits_hash_BadNumberOfBitsTest() { - try { - new Shape(testFunction, 5, 0, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of hash functions is less than 1 an exception is thrown. - */ - @Test - public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { - try { - new Shape(testFunction, 5, 24, 0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of items is less than 1 an exception is thrown. - */ - @Test - public void constructor_items_bits_hash_BadNumberOfItemsTest() { - try { - new Shape(testFunction, 0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown - */ - @Test - public void constructor_items_bits_hash_BadProbabilityTest() { - try { - new Shape(testFunction, 4000, 8, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that when the number of items, number of bits and number of hash functions is passed the values are - * calculated correctly. 
- */ - @Test - public void constructor_items_bits_hashTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 - */ - final Shape filterConfig = new Shape(testFunction, 5, 24, 4); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(4, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); - } - - /** - * Tests that the number of items and number of bits is passed the other values are calculated correctly. - */ - @Test - public void constructor_items_bitsTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&m=24 - */ - final Shape filterConfig = new Shape(testFunction, 5, 24); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(3, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - } - - /** - * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_BadNumberOfItemsTest() { - try { - new Shape(testFunction, 0, 1.0 / 10); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_BadProbabilityTest() { - try { - new Shape(testFunction, 10, 0.0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - try { - new Shape(testFunction, 10, 1.0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. 
- } - try { - new Shape(testFunction, 10, Double.NaN); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. - */ - @Test - public void constructor_items_probability_NumberOfBitsOverflowTest() { - try { - new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing. - } - } - - /** - * Tests the the probability is calculated correctly. - */ - @Test - public void constructor_items_probability_Test() { - - assertEquals(24, shape.getNumberOfBits()); - assertEquals(3, shape.getNumberOfHashFunctions()); - assertEquals(5, shape.getNumberOfItems()); - assertEquals(0.100375138, shape.getProbability(), 0.000001); - } - - /** - * Tests that the constructor with a null name, number of items and size of filter fails. - */ - @Test - public void constructor_nm_noName() { - try { - new Shape(null, 5, 72); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. - */ - @Test - public void constructor_nmk_noName() { - try { - new Shape(null, 5, 72, 17); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that the constructor with a null name, number of items, and probability fails. - */ - @Test - public void constructor_np_noName() { - try { - new Shape(null, 5, 0.1); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. 
- */ - @Test - public void constructor_pmk_noName() { - try { - new Shape(null, 0.1, 72, 17); - fail("Should throw NullPointerException"); - } catch (final NullPointerException expected) { - // do nothing - } - } - - /** - * Tests that if the number of bits is less than 1 an exception is thrown - */ - @Test - public void constructor_probability_bits_hash_BadNumberOfBitsTest() { - try { - new Shape(testFunction, 0.5, 0, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that if the number of functions is less than 1 an exception is thrown - */ - @Test - public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { - try { - new Shape(testFunction, 0.5, 24, 0); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
- */ - @Test - public void constructor_probability_bits_hash_BadProbabilityTest() { - // probability should not be 0 - try { - new Shape(testFunction, 0.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be = -1 - try { - new Shape(testFunction, -1.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be < -1 - try { - new Shape(testFunction, -1.5, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be = 1 - try { - new Shape(testFunction, 1.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - - // probability should not be > 1 - try { - new Shape(testFunction, 2.0, 24, 1); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // expected - } - } - - /** - * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash - * functions. - */ - @Test - public void constructor_probability_bits_hashTest() { - /* - * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 - */ - final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); - - assertEquals(24, filterConfig.getNumberOfBits()); - assertEquals(3, filterConfig.getNumberOfHashFunctions()); - assertEquals(5, filterConfig.getNumberOfItems()); - assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - } - - /** - * Test equality of shape. 
- */ - @Test - public void equalsTest() { - - assertEquals(shape, shape); - assertEquals(shape, new Shape(testFunction, 5, 1.0 / 10)); - assertNotEquals(shape, null); - assertNotEquals(shape, new Shape(testFunction, 5, 1.0 / 11)); - assertNotEquals(shape, new Shape(testFunction, 4, 1.0 / 10)); - // Number of bits does not change equality, - // only the number of bits and the number of hash functions - final int numberOfBits = 10000; - final int numberOfItems = 15; - final int numberOfHashFunctions = 4; - assertEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems + 1, numberOfBits, numberOfHashFunctions)); - assertNotEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems, numberOfBits + 1, numberOfHashFunctions)); - assertNotEquals(new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions), - new Shape(testFunction, numberOfItems, numberOfBits, numberOfHashFunctions + 1)); - - final HashFunctionIdentity testFunction2 = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function2"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - assertNotEquals(shape, new Shape(testFunction2, 4, 1.0 / 10)); - } - - /** - * Test that hashCode satisfies the contract between {@link Object#hashCode()} and - * {@link Object#equals(Object)}. Equal shapes must have the same hash code. 
- */ - @Test - public void hashCodeTest() { - // Hash function equality is based on process type, signedness and name (case insensitive) - final ArrayList list = new ArrayList<>(); - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Provider changes - list.add(new HashFunctionIdentityImpl("PROVIDER", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider2", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Name changes - list.add(new HashFunctionIdentityImpl("Provider", "name", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider", "NAME", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - list.add(new HashFunctionIdentityImpl("Provider", "Other", Signedness.SIGNED, ProcessType.ITERATIVE, 0L)); - // Signedness changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.UNSIGNED, ProcessType.ITERATIVE, 0L)); - // ProcessType changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.CYCLIC, 0L)); - // Signature changes - list.add(new HashFunctionIdentityImpl("Provider", "Name", Signedness.SIGNED, ProcessType.ITERATIVE, 1L)); - - // Create shapes that only differ in the hash function. 
- final int numberOfItems = 30; - final int numberOfBits = 3000; - final int numberOfHashFunctions = 10; - final Shape shape1 = new Shape(list.get(0), numberOfItems, numberOfBits, numberOfHashFunctions); - assertEquals(shape1, shape1); - - // Try variations - for (int i = 1; i < list.size(); i++) { - final Shape shape2 = new Shape(list.get(i), numberOfItems, numberOfBits, numberOfHashFunctions); - assertEquals(shape2, shape2); - - // Equal shapes must have the same hash code - if (shape1.equals(shape2)) { - assertEquals(shape1.hashCode(), shape2.hashCode()); - } - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java new file mode 100644 index 0000000000..09ae011123 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.PrimitiveIterator.OfInt; + +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link SimpleHasher}. + */ +public class SimpleHasherTest { + + private SimpleHasher hasher = new SimpleHasher( 1,1 ); + + @Test + public void sizeTest() { + assertEquals( 1, hasher.size() ); + } + + @Test + public void testIterator() { + Shape shape = new Shape( 5, 10 ); + OfInt iter = hasher.iterator(shape); + for (int i=1;i<6;i++) { + assertEquals( i, iter.next() ); + } + assertFalse( iter.hasNext()); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java deleted file mode 100644 index 70eb633a78..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/StaticHasherTest.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter.hasher; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.PrimitiveIterator.OfInt; - -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link SimpleHasher}. - */ -public class StaticHasherTest { - - private final HashFunctionIdentity testFunction = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test Function"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final HashFunctionIdentity testFunctionX = new HashFunctionIdentity() { - - @Override - public String getName() { - return "Test FunctionX"; - } - - @Override - public ProcessType getProcessType() { - return ProcessType.CYCLIC; - } - - @Override - public String getProvider() { - return "Apache Commons Collection Tests"; - } - - @Override - public long getSignature() { - return 0; - } - - @Override - public Signedness getSignedness() { - return Signedness.SIGNED; - } - }; - - private final Shape shape = new Shape(testFunction, 3, 72, 17); - - /** - * Compare 2 static hashers to verify they have the same bits enabled. - * - * @param hasher1 the first static hasher. - * @param hasher2 the second static hasher. 
- */ - private void assertSameBits(final SimpleHasher hasher1, final SimpleHasher hasher2) { - final OfInt iter1 = hasher1.iterator(shape); - final OfInt iter2 = hasher2.iterator(shape); - - while (iter1.hasNext()) { - assertTrue(iter2.hasNext(), "Not enough data in second hasher"); - assertEquals(iter1.nextInt(), iter2.nextInt()); - } - assertFalse(iter2.hasNext(), "Too much data in second hasher"); - } - - /** - * Tests that passing a hasher other than a StaticHasher to the constructor works as - * expected. - */ - @Test - public void testConstructor_Hasher() { - final int[] expected = {1, 3, 5, 7, 9}; - - final Hasher testHasher = new Hasher() { - - @Override - public OfInt iterator(final Shape shape) { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - return Arrays.stream(values).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return testFunction; - } - }; - - final SimpleHasher hasher = new SimpleHasher(testHasher, shape); - final OfInt iter = hasher.iterator(shape); - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that passing a hasher other than a StaticHasher and the wrong Shape to the - * constructor throws an IllegalArgumentException. - */ - @Test - public void testConstructor_Hasher_WrongShape() { - final Hasher testHasher = new Hasher() { - - @Override - public OfInt iterator(final Shape shape) { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - return Arrays.stream(values).iterator(); - } - - @Override - public HashFunctionIdentity getHashFunctionIdentity() { - return testFunctionX; - } - }; - - try { - new SimpleHasher(testHasher, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Test that the iterator based constructor works correctly and removes duplicates. 
- */ - @Test - public void testConstructor_Iterator() { - - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - Iterator iter = Arrays.stream(values).iterator(); - final SimpleHasher hasher = new SimpleHasher(iter, shape); - - assertEquals(5, hasher.size()); - assertEquals(shape, hasher.getShape()); - // All function properties are equal - assertEquals(testFunction.getName(), hasher.getHashFunctionIdentity().getName()); - assertEquals(testFunction.getProcessType(), hasher.getHashFunctionIdentity().getProcessType()); - assertEquals(testFunction.getProvider(), hasher.getHashFunctionIdentity().getProvider()); - assertEquals(testFunction.getSignedness(), hasher.getHashFunctionIdentity().getSignedness()); - - iter = hasher.iterator(shape); - int idx = 0; - while (iter.hasNext()) { - assertEquals(Integer.valueOf(values[idx]), iter.next(), "Error at idx " + idx); - idx++; - } - assertEquals(5, idx); - } - - /** - * Tests that if the iterator passed to the constructor contains a value greater than - * or equal to Shape.numberOfBits() an exception is thrown. - */ - @Test - public void testConstructor_Iterator_ValueTooBig() { - - final int[] values = {shape.getNumberOfBits(), 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - try { - new SimpleHasher(iter, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Tests that if the iterator passed to the constructor contains a value less than 0 - * (zero) an exception is thrown. 
- */ - @Test - public void testConstructor_Iterator_ValueTooSmall() { - - final int[] values = {-1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - try { - new SimpleHasher(iter, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Tests that the constructor that accepts a static hasher properly builds the hasher. - */ - @Test - public void testConstructor_StaticHasher() { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - final SimpleHasher hasher = new SimpleHasher(iter, shape); - - final SimpleHasher hasher2 = new SimpleHasher(hasher, shape); - assertEquals(shape, hasher2.getShape()); - assertSameBits(hasher, hasher2); - } - - /** - * Tests that calling the constructor with a hasher and the wrong shape throws an - * IllegalArgumentException. - */ - @Test - public void testConstructor_StaticHasher_WrongShape() { - final int[] values = {1, 3, 5, 7, 9, 3, 5, 1}; - final Iterator iter = Arrays.stream(values).iterator(); - final SimpleHasher hasher = new SimpleHasher(iter, new Shape(testFunctionX, 3, 72, 17)); - - try { - new SimpleHasher(hasher, shape); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } - - /** - * Tests that iterator returns the proper values. - */ - @Test - public void testGetBits() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - - final SimpleHasher hasher = new SimpleHasher(lst.iterator(), shape); - assertEquals(17, hasher.size()); - final OfInt iter = hasher.iterator(shape); - for (int i = 0; i < 17; i++) { - assertTrue(iter.hasNext()); - assertEquals(i, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that iterator does not return duplicates and orders the indices. 
- */ - @Test - public void testGetBits_DuplicateValues() { - final int[] input = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, 59, - 49, 39, 13, 3, 65, 55, 45, 35, 25}; - final int[] expected = {1, 2, 3, 6, 7, 10, 11, 13, 15, 17, 19, 23, 24, 25, 35, 36, 39, 43, 44, 45, 48, 49, 53, 55, 57, - 59, 61, 62, 63, 65, 69, 70}; - - final SimpleHasher hasher = new SimpleHasher(Arrays.stream(input).iterator(), shape); - - final OfInt iter = hasher.iterator(shape); - for (final int element : expected) { - assertTrue(iter.hasNext()); - assertEquals(element, iter.nextInt()); - } - assertFalse(iter.hasNext()); - } - - /** - * Tests that gitBits is called with the wrong shape an exception is thrown. - */ - @Test - public void testGetBits_WrongShape() { - final List lst = Arrays.asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - final SimpleHasher hasher = new SimpleHasher(lst.iterator(), shape); - - try { - hasher.iterator(new Shape(testFunctionX, 3, 72, 17)); - fail("Should have thrown IllegalArgumentException"); - } catch (final IllegalArgumentException expected) { - // do nothing - } - } -} From 0287831837d23691ee0b2cd75359603045fb1f99 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sun, 3 Oct 2021 17:10:19 +0100 Subject: [PATCH 03/27] Cleaned up hasher collecton processing --- .../bloomfilter/ArrayCountingBloomFilter.java | 7 +- .../collections4/bloomfilter/BitMap.java | 103 +++++++++++ .../bloomfilter/BitMapProducer.java | 51 ++++- .../collections4/bloomfilter/BloomFilter.java | 174 ++++-------------- .../bloomfilter/CountingBloomFilter.java | 11 +- .../bloomfilter/SimpleBloomFilter.java | 5 + .../bloomfilter/SparseBloomFilter.java | 19 +- .../bloomfilter/hasher/Hasher.java | 12 ++ .../bloomfilter/hasher/HasherCollection.java | 9 + .../bloomfilter/hasher/SimpleHasher.java | 8 +- .../collections4/bloomfilter/BitMaptTest.java | 56 +++--- .../bloomfilter/SetOperationsTest.java | 4 +- 12 files changed, 271 
insertions(+), 188 deletions(-) create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 4d9d9d5040..52138eddfa 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -25,7 +25,6 @@ import java.util.function.LongConsumer; import java.util.stream.IntStream; -import org.apache.commons.collections4.bloomfilter.BloomFilter.BitMap; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** @@ -242,7 +241,8 @@ public boolean mergeInPlace(final BloomFilter other) { @Override public boolean mergeInPlace(final Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); - return add( BitCountProducer.Factory.from( shape, hasher )); + hasher.forEach( h -> add( BitCountProducer.Factory.from( shape, h ))); + return isValid(); } @Override @@ -254,7 +254,8 @@ public boolean remove(final BloomFilter other) { @Override public boolean remove(final Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); - return subtract( BitCountProducer.Factory.from( shape, hasher )); + hasher.forEach( h -> subtract( BitCountProducer.Factory.from( shape, h ))); + return isValid(); } @Override diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java new file mode 100644 index 0000000000..dfab37c00c --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -0,0 +1,103 @@ +package org.apache.commons.collections4.bloomfilter; + +/** + * Contains functions to convert {@code int} indices into Bloom filter bit positions. 
+ */ +public class BitMap { + /** A bit shift to apply to an integer to divide by 64 (2^6). */ + private static final int DIVIDE_BY_64 = 6; + + /** Do not instantiate. */ + private BitMap() {} + + /** + * Calculates the number of buckets required for the numberOfBits parameter. + * @param numberOfBits the number of bits to store in the array of buckets. + * @return the number of buckets necessary. + */ + public static int numberOfBuckets( int numberOfBits ) { + int bucket = numberOfBits >> DIVIDE_BY_64; + return bucket+1; + } + + /** + * Checks if the specified index bit is enabled in the array of bit buckets. + * @param buckets The array of bit buckets + * @param idx the index of the bit to locate. + * @return {@code true} if the bit is enabled, {@code false} otherwise. + */ + public static boolean contains( long[] buckets, int idx ) { + return (buckets[ getLongIndex( idx )] & getLongBit( idx )) != 0; + } + + /** + * Check the index is positive. + * + * @param bitIndex the bit index + * @throws IndexOutOfBoundsException if the index is not positive + */ + public static void checkPositive(final int bitIndex) { + if (bitIndex < 0) { + throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); + } + } + + + /** + * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs + * to store bits starting at index 0. + * + *

The index is assumed to be positive. For a positive index the result will match + * {@code bitIndex / 64}. + * + *

The divide is performed using bit shifts. If the input is negative the behavior + * is not defined. + * + * @param bitIndex the bit index (assumed to be positive) + * @return the filter index + * @see #checkPositive(int) + */ + public static int getLongIndex(final int bitIndex) { + // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. + // We do not explicitly check for a negative here. Instead we use a + // a signed shift. Any negative index will produce a negative value + // by sign-extension and if used as an index into an array it will throw an exception. + return bitIndex >> DIVIDE_BY_64; + } + + /** + * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit + * longs to store bits starting at index 0. The returned value is a {@code long} with only + * 1 bit set. + * + *

The index is assumed to be positive. For a positive index the result will match + * {@code 1L << (bitIndex % 64)}. + * + *

If the input is negative the behavior is not defined. + * + * @param bitIndex the bit index (assumed to be positive) + * @return the filter bit + * @see #checkPositive(int) + */ + public static long getLongBit(final int bitIndex) { + // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this + // using 0x3f (63) or compute bitIndex % 64. + // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and + // this will identify an incorrect bit. + return 1L << bitIndex; + } + + /** + * Determines if a cardinality is sparse for the shape. + * Since the size of a bucket is a long and the size of an index is an int, there can be + * 2 indexes for each bucket. Since indexes are evenly distributed sparse is defined as + * {@code numberOfBuckets*2 >= cardinality} + * @param cardinality the cardinality to check. + * @param shape the Shape to check against + * @return true if the cardinality is sparse within the bucket. + */ + public static boolean isSparse( int cardinality, Shape shape ) { + return numberOfBuckets(shape.getNumberOfBits()-1)*2 >= cardinality; + } + +} \ No newline at end of file diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index f7f18ce994..dd0651bac1 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -1,26 +1,59 @@ package org.apache.commons.collections4.bloomfilter; -import java.util.Set; -import java.util.TreeSet; -import java.util.function.Consumer; -import java.util.function.IntConsumer; +import java.util.Arrays; import java.util.function.LongConsumer; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; - /** - * Interface that produces bit map long values in a Bloom filter. + * Produces bit map longs for a Bloom filter.
+ * + * Each bit map is a little-endian long value representing a block of bits of this filter. + * + *

The returned array will have length {@code ceil(m / 64)} where {@code m} is the + * number of bits in the filter and {@code ceil} is the ceiling function. + * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit + * index is enabled. + * + * The producer may produce empty bit maps at the end of the sequence. * */ public interface BitMapProducer { /** - * Performs the given action for each bit map {@code long} that comprise the Bloom filter. + * Performs the given action for each bit map {@code long} that makes up the Bloom filter. * Any exceptions thrown by the action are relayed to the caller. * - * @param consumer the action to be performed for each bit map long + * @param consumer the action to be performed for each bit map long. * @throws NullPointerException if the specified action is null */ void forEachBitMap(LongConsumer consumer); + /** + * A LongConsumer that builds an Array of BitMaps as produced by a BitMapProducer. + * + */ + public class ArrayBuilder implements LongConsumer { + private long[] result; + private int idx=0; + + /** + * Constructor. + * @param shape The shape used to generate the BitMaps.
+ */ + public ArrayBuilder( Shape shape ) { + result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits() )]; + } + @Override + public void accept(long bitmap) { + result[idx++] = bitmap; + } + + /** + * Trims the resulting array so that there are no trailing empty BitMaps + * @return + */ + public long[] trim() { + return Arrays.copyOf( result, idx ); + } + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index a17d89a6f2..ba45bbf4dc 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -23,9 +23,7 @@ import java.util.Objects; import java.util.PrimitiveIterator; import java.util.function.IntConsumer; -import java.util.function.LongConsumer; -import org.apache.commons.collections4.bloomfilter.BloomFilter.BitMap; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** @@ -60,7 +58,7 @@ default long[] getBits() { return new long[0]; } - BitBuilder consumer = new BitBuilder(getShape()); + BitMapProducer.ArrayBuilder consumer = new BitMapProducer.ArrayBuilder(getShape()); forEachBitMap( consumer ); return consumer.trim(); } @@ -163,17 +161,12 @@ default boolean contains(Hasher hasher) { // Modification Operations /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes - * that are enabled in the {@code other} filter will be enabled in this filter. - * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the {@code other} Bloom filter. + * Merges the specified Bloom filter with this Bloom filter creating a new Bloom filter. + * Specifically all bit indexes that are enabled in the {@code other} filter will be + * enabled in the resulting filter. * * @param other the other Bloom filter - * @return true if the merge was successful - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return The new Bloom filter. */ default BloomFilter merge(BloomFilter other) { Objects.requireNonNull( other, "other"); @@ -188,17 +181,13 @@ default BloomFilter merge(BloomFilter other) { } /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all - * bit indexes that are identified by the {@code hasher} will be enabled in this filter. + * Merges the specified Hasher with this Bloom filter and returns a new Bloom filter. + * Specifically all bit indexes that are identified by the {@code hasher} will be enabled + * in the resulting filter. * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified decomposed Bloom filter. * * @param hasher the hasher to provide the indexes - * @return true if the merge was successful - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter + * @return the new Bloom filter. */ default BloomFilter merge(Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); @@ -211,8 +200,30 @@ default BloomFilter merge(Hasher hasher) { return result; } + /** + * Merges the specified Bloom filter into this Bloom filter. Specifically all + * bit indexes that are identified by the {@code other} will be enabled in this filter. + * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter is not ensured to contain + * the specified Bloom filter. + * + * @param other The bloom filter to merge into this one. + * @return true if the merge was successful + */ boolean mergeInPlace(BloomFilter other); + /** + * Merges the specified hasher into this Bloom filter. Specifically all + * bit indexes that are identified by the {@code hasher} will be enabled in this filter. + * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter is not ensured to contain + * the specified hasher. + * + * @param hasher The hasher to merge. + * @return true if the merge was successful + */ default boolean mergeInPlace(Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); Shape shape = getShape(); @@ -273,10 +284,11 @@ default int estimateIntersection( BloomFilter other) { } /** - * Iterates over the enabled bits in an array of bit maps. + * Iterates over the enabled bits in an array of bit maps. Useful for when an + * array of bitmaps is available but an iterator of indices is needed. * */ - class BitIterator implements PrimitiveIterator.OfInt { + public class BitIterator implements PrimitiveIterator.OfInt { private long[] bits; private int bucket; private int offset; @@ -324,122 +336,4 @@ public int nextInt() { } } - /** - * Contains functions to convert {@code int} indices into Bloom filter bit positions. - */ - class BitMap { - /** A bit shift to apply to an integer to divided by 64 (2^6). */ - private static final int DIVIDE_BY_64 = 6; - - /** Do not instantiate. */ - private BitMap() {} - - /** - * Calculates the number of buckets required for the numberOfBits parameter. - * @param numberOfBits the number of bits to store in the array of buckets. - * @return the number of buckets necessary. - */ - public static int numberOfBuckets( int numberOfBits ) { - int bucket = numberOfBits >> DIVIDE_BY_64; - return bucket+1; - } - - /** - * Checks if the specified index bit is enabled in the array of bit buckets. - * @param buckets The array of bit buckets - * @param idx the index of the bit to locate. - * @return {@code true} if the bit is enabled, {@code false} otherwise. - */ - public static boolean contains( long[] buckets, int idx ) { - return (buckets[ getLongIndex( idx )] & getLongBit( idx )) != 0; - } - - /** - * Check the index is positive.
- * - * @param bitIndex the bit index - * @throws IndexOutOfBoundsException if the index is not positive - */ - public static void checkPositive(final int bitIndex) { - if (bitIndex < 0) { - throw new IndexOutOfBoundsException("Negative bitIndex: " + bitIndex); - } - } - - - /** - * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs - * to store bits starting at index 0. - * - *

The index is assumed to be positive. For a positive index the result will match - * {@code bitIndex / 64}. - * - *

The divide is performed using bit shifts. If the input is negative the behavior - * is not defined. - * - * @param bitIndex the bit index (assumed to be positive) - * @return the filter index - * @see #checkPositive(int) - */ - public static int getLongIndex(final int bitIndex) { - // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. - // We do not explicitly check for a negative here. Instead we use a - // a signed shift. Any negative index will produce a negative value - // by sign-extension and if used as an index into an array it will throw an exception. - return bitIndex >> DIVIDE_BY_64; - } - - /** - * Gets the filter bit mask for the specified bit index assuming the filter is using 64-bit - * longs to store bits starting at index 0. The returned value is a {@code long} with only - * 1 bit set. - * - *

The index is assumed to be positive. For a positive index the result will match - * {@code 1L << (bitIndex % 64)}. - * - *

If the input is negative the behavior is not defined. - * - * @param bitIndex the bit index (assumed to be positive) - * @return the filter bit - * @see #checkPositive(int) - */ - public static long getLongBit(final int bitIndex) { - // Bit shifts only use the first 6 bits. Thus it is not necessary to mask this - // using 0x3f (63) or compute bitIndex % 64. - // Note: If the index is negative the shift will be (64 - (bitIndex & 0x3f)) and - // this will identify an incorrect bit. - return 1L << bitIndex; - } - - /** - * Determines id a cardinality is sparse for the shape. - * Since the size of a bucket is a long and the size of an index is an int, there can be - * 2 indexes for each bucket. Since indexes are evenly distributed sparse is defined as - * {@code numberOfBuckets*2 >= cardinality} - * @param cardinality the cardinality to check. - * @param shape the Shape to check against - * @return true if the cardinality is sparse within the bucket. - */ - public static boolean isSparse( int cardinality, Shape shape ) { - return numberOfBuckets(shape.getNumberOfBits()-1)*2 >= cardinality; - } - - } - - public class BitBuilder implements LongConsumer { - private long[] result; - private int idx=0; - public BitBuilder( Shape shape ) { - result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits() )]; - } - @Override - public void accept(long bitmap) { - result[idx++] = bitmap; - } - - public long[] trim() { - return Arrays.copyOf( result, idx ); - } - } - } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index c0d89e3a5e..032c9f8b50 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -16,7 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter; -import 
org.apache.commons.collections4.bloomfilter.BloomFilter.BitMap; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** @@ -55,7 +54,7 @@ */ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { - + // Query Operations @@ -100,10 +99,13 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { boolean remove(BloomFilter other); /** - * Removes the specified decomposed Bloom filter from this Bloom filter. Specifically + * Removes the specified hasher from this Bloom filter. Specifically * all counts for the distinct indexes identified by the {@code hasher} will be * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. * + * For HasherCollections each SimpleHasher will be considered a single item and decremented + * from the counts separately. + * *

This method will return true if the filter is valid after the operation. * * @param hasher the hasher to provide the indexes @@ -165,6 +167,9 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all * bit indexes that are identified by the {@code hasher} will be enabled in this filter. * + * For HasherCollections each SimpleHasher will be considered a single item and increment + * the counts separately. + * *

Note: This method should return {@code true} even if no additional bit indexes were * enabled. A {@code false} result indicates that this filter is not ensured to contain * the specified decomposed Bloom filter. diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index e8ab3bea80..b194b8b095 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -47,6 +47,11 @@ public SimpleBloomFilter(Shape shape) { this.bitSet = new BitSet(); } + /** + * Constructor. + * @param shape The shape for the filter. + * @param hasher the Hasher to initialize the filter with. + */ public SimpleBloomFilter(final Shape shape, Hasher hasher) { this( shape ); Objects.requireNonNull( hasher, "hasher"); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 34d2047fe4..3f369889d3 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -57,16 +57,33 @@ public SparseBloomFilter(Shape shape) { this.indices = new TreeSet(); } + /** + * Constructs a populated Bloom filter. + * @param shape the shape for the bloom filter. + * @param hasher the hasher to provide the initial data. + */ public SparseBloomFilter(final Shape shape, Hasher hasher) { this( shape ); Objects.requireNonNull( hasher, "hasher"); - hasher.iterator(shape).forEachRemaining( (IntConsumer) i -> indices.add( i )); + hasher.forEach( h -> h.iterator(shape).forEachRemaining( (IntConsumer) indices::add )); } + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. 
+ * @param indices a list of indices to enable. + * @throws IllegalArgumentException if indices contains a value greater than the number + * of bits in the shape. + */ public SparseBloomFilter(Shape shape, List indices) { this(shape); Objects.requireNonNull( indices, "indices"); this.indices.addAll( indices ); + if (this.indices.last() >= shape.getNumberOfBits()) { + throw new IllegalArgumentException( + String.format( "Value in list {} is greater than maximum value ({})", + this.indices.last(), shape.getNumberOfBits())); + } } @Override diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index dc2375d7fb..d695a26449 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -17,8 +17,10 @@ package org.apache.commons.collections4.bloomfilter.hasher; import java.util.PrimitiveIterator; +import java.util.function.Consumer; import org.apache.commons.collections4.bloomfilter.Shape; +import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer; /** * A Hasher represents items of arbitrary byte size as a byte representation of @@ -70,5 +72,15 @@ public interface Hasher { */ int size(); + /** + * Performs the given action for each hasher. + * + * For collections of hashers, this method must be called on each hasher in the collection. 
+ * + * @param consumer the action to be performed for each hasher + * @throws NullPointerException if the specified action is null + */ + void forEach(Consumer consumer); + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index 18af931dc7..7b5572cac8 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.NoSuchElementException; import java.util.PrimitiveIterator; +import java.util.function.Consumer; import java.util.stream.Collectors; import org.apache.commons.collections4.bloomfilter.Shape; @@ -88,6 +89,14 @@ public int size() { return i; } + @Override + public void forEach(Consumer consumer) { + for (Hasher h : this.hashers) { + h.forEach(consumer); + } + } + + /** * The iterator of integers. * diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java index 4a6c99934b..a02f4b500a 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -19,13 +19,13 @@ import java.util.NoSuchElementException; import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; +import java.util.function.Consumer; import org.apache.commons.collections4.bloomfilter.Shape; /** - * A Hasher implementation that contains the index for all enabled bits for a specific - * Shape. + * A Hasher that implements combinatorial hashing. 
* @since 4.5 */ public final class SimpleHasher implements Hasher { @@ -64,6 +64,10 @@ public int size() { return 1; } + @Override + public void forEach(Consumer consumer) { + consumer.accept( this ); + } /** * The iterator of integers. diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java index 5c70e5d214..b0dd34658e 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java @@ -11,10 +11,10 @@ public class BitMaptTest { @Test public void checkPositiveTest() { - BloomFilter.BitMap.checkPositive(0); - BloomFilter.BitMap.checkPositive(0); + BitMap.checkPositive(0); + BitMap.checkPositive(0); try { - BloomFilter.BitMap.checkPositive(-1); + BitMap.checkPositive(-1); } catch (IndexOutOfBoundsException expected) { // do nothing @@ -25,65 +25,65 @@ public void checkPositiveTest() { public void containsTest() { long[] ary = new long[1]; - assertFalse( BloomFilter.BitMap.contains(ary, 0) ); + assertFalse( BitMap.contains(ary, 0) ); ary[0] = 0x01; - assertTrue( BloomFilter.BitMap.contains(ary, 0) ); + assertTrue( BitMap.contains(ary, 0) ); - assertFalse( BloomFilter.BitMap.contains(ary, 63) ); + assertFalse( BitMap.contains(ary, 63) ); ary[0] = (1L << 63); - assertTrue( BloomFilter.BitMap.contains(ary, 63) ); + assertTrue( BitMap.contains(ary, 63) ); ary = new long[2]; - assertFalse( BloomFilter.BitMap.contains(ary, 64) ); + assertFalse( BitMap.contains(ary, 64) ); ary[1] = 1; - assertTrue( BloomFilter.BitMap.contains(ary, 64) ); + assertTrue( BitMap.contains(ary, 64) ); } @Test public void getLongBitTest() { - assertEquals( 1, BloomFilter.BitMap.getLongBit(0) ); - assertEquals( 0x8000000000000000L, BloomFilter.BitMap.getLongBit( 63 ) ); - assertEquals( 1, BloomFilter.BitMap.getLongBit( 64) ); - assertEquals( 0x8000000000000000L, 
BloomFilter.BitMap.getLongBit( 127 ) ); - assertEquals( 1, BloomFilter.BitMap.getLongBit( 128 ) ); + assertEquals( 1, BitMap.getLongBit(0) ); + assertEquals( 0x8000000000000000L, BitMap.getLongBit( 63 ) ); + assertEquals( 1, BitMap.getLongBit( 64) ); + assertEquals( 0x8000000000000000L, BitMap.getLongBit( 127 ) ); + assertEquals( 1, BitMap.getLongBit( 128 ) ); } @Test public void getLongIndexTest() { - assertEquals( 0, BloomFilter.BitMap.getLongIndex(0) ); - assertEquals( 0, BloomFilter.BitMap.getLongIndex( 63 ) ); - assertEquals( 1, BloomFilter.BitMap.getLongIndex( 64) ); - assertEquals( 1, BloomFilter.BitMap.getLongIndex( 127 ) ); - assertEquals( 2, BloomFilter.BitMap.getLongIndex( 128 ) ); + assertEquals( 0, BitMap.getLongIndex(0) ); + assertEquals( 0, BitMap.getLongIndex( 63 ) ); + assertEquals( 1, BitMap.getLongIndex( 64) ); + assertEquals( 1, BitMap.getLongIndex( 127 ) ); + assertEquals( 2, BitMap.getLongIndex( 128 ) ); } @Test public void isSparseTest() { Shape shape = new Shape( 17, 64 ); - assertTrue( BloomFilter.BitMap.isSparse(0, shape) ); - assertTrue( BloomFilter.BitMap.isSparse(1, shape) ); - assertTrue( BloomFilter.BitMap.isSparse(2, shape) ); - assertFalse( BloomFilter.BitMap.isSparse(3, shape) ); + assertTrue( BitMap.isSparse(0, shape) ); + assertTrue( BitMap.isSparse(1, shape) ); + assertTrue( BitMap.isSparse(2, shape) ); + assertFalse( BitMap.isSparse(3, shape) ); shape = new Shape( 17, 64*3 ); for (int i=0;i<7; i++) { - assertTrue( BloomFilter.BitMap.isSparse(i, shape) ); + assertTrue( BitMap.isSparse(i, shape) ); } - assertFalse( BloomFilter.BitMap.isSparse(7, shape) ); + assertFalse( BitMap.isSparse(7, shape) ); } @Test public void numberOfBucketsTest() { for (int i = 0;i<64;i++) { - assertEquals( 1, BloomFilter.BitMap.numberOfBuckets(i)); + assertEquals( 1, BitMap.numberOfBuckets(i)); } for (int i = 64;i<128;i++) { - assertEquals( 2, BloomFilter.BitMap.numberOfBuckets(i)); + assertEquals( 2, BitMap.numberOfBuckets(i)); } - assertEquals( 3, 
BloomFilter.BitMap.numberOfBuckets(128)); + assertEquals( 3, BitMap.numberOfBuckets(128)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 8b5388832a..3fac5f9610 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -99,8 +99,8 @@ public final void cosineSimilarityTest() { filter2 = new SimpleBloomFilter(shape, from11); - assertEquals(0.41176470, SetOperations.cosineSimilarity(filter1, filter2), 0.000000001); - assertEquals(0.41176470, SetOperations.cosineSimilarity(filter2, filter1), 0.000000001); + assertEquals(0.41176470, SetOperations.cosineSimilarity(filter1, filter2), 0.00000001); + assertEquals(0.41176470, SetOperations.cosineSimilarity(filter2, filter1), 0.00000001); } /** From c60eb90d36a0a03c1921fbb3b6d167d971201027 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sun, 3 Oct 2021 17:12:13 +0100 Subject: [PATCH 04/27] cleaned up code --- .../bloomfilter/ArrayCountingBloomFilter.java | 12 +- .../collections4/bloomfilter/BloomFilter.java | 30 +- .../bloomfilter/IndexProducer.java | 6 - .../collections4/bloomfilter/Shape.java | 466 +++++++++--------- .../bloomfilter/SimpleBloomFilter.java | 3 +- .../bloomfilter/SparseBloomFilter.java | 11 +- 6 files changed, 255 insertions(+), 273 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 52138eddfa..c542376f55 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -314,12 +314,12 @@ public void forEachIndex(IntConsumer consumer) { @Override public void 
forEachBitMap(LongConsumer consumer) { Objects.requireNonNull( consumer, "consumer"); - if (cardinality() == 0) { - return; - } - BitMapBuilder builder = new BitMapBuilder( consumer ); - forEachIndex( builder ); - builder.finish(); + if (cardinality() == 0) { + return; + } + BitMapBuilder builder = new BitMapBuilder( consumer ); + forEachIndex( builder ); + builder.finish(); } /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index ba45bbf4dc..7b72827ea5 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -16,9 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; import java.util.NoSuchElementException; import java.util.Objects; import java.util.PrimitiveIterator; @@ -69,16 +67,16 @@ default long[] getBits() { * @return an array of indices for bits that are enabled in the filter. */ default int[] getIndices() { - int[] result = new int[ cardinality() ]; - IntConsumer consumer = new IntConsumer() { - int idx = 0; - @Override - public void accept(int i) { - result[idx++] = i; - } - }; - forEachIndex( consumer ); - return result; + int[] result = new int[ cardinality() ]; + IntConsumer consumer = new IntConsumer() { + int idx = 0; + @Override + public void accept(int i) { + result[idx++] = i; + } + }; + forEachIndex( consumer ); + return result; } /** @@ -154,7 +152,7 @@ default boolean contains(Hasher hasher) { Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions()), shape ) ? 
new SparseBloomFilter(getShape(), hasher) : - new SimpleBloomFilter(getShape(), hasher); + new SimpleBloomFilter(getShape(), hasher); return contains( result ); } @@ -173,7 +171,7 @@ default BloomFilter merge(BloomFilter other) { Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (cardinality() + other.cardinality()), getShape() ) ? new SparseBloomFilter(shape) : - new SimpleBloomFilter(shape); + new SimpleBloomFilter(shape); result.mergeInPlace( this ); result.mergeInPlace( other ); @@ -194,7 +192,7 @@ default BloomFilter merge(Hasher hasher) { Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions())+ cardinality(), shape ) ? new SparseBloomFilter(shape, hasher) : - new SimpleBloomFilter(shape, hasher); + new SimpleBloomFilter(shape, hasher); result.mergeInPlace( this ); return result; @@ -229,7 +227,7 @@ default boolean mergeInPlace(Hasher hasher) { Shape shape = getShape(); BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions())+cardinality(),shape ) ? 
new SparseBloomFilter(getShape(), hasher) : - new SimpleBloomFilter(getShape(), hasher); + new SimpleBloomFilter(getShape(), hasher); return mergeInPlace( result ); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index dbf41dbe94..5b1816423b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -1,12 +1,6 @@ package org.apache.commons.collections4.bloomfilter; -import java.util.Set; -import java.util.TreeSet; -import java.util.function.Consumer; import java.util.function.IntConsumer; -import java.util.function.LongConsumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; public interface IndexProducer { diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index ff028fc506..8042d38b0c 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -111,7 +111,7 @@ public boolean equals(final Object o) { if (o instanceof Shape) { final Shape other = (Shape) o; return numberOfBits == other.numberOfBits && - numberOfHashFunctions == other.numberOfHashFunctions; + numberOfHashFunctions == other.numberOfHashFunctions; } return false; } @@ -168,13 +168,13 @@ public double getProbability(int numberOfItems) { return 0; } return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), - numberOfHashFunctions); + numberOfHashFunctions); } @Override public String toString() { return String.format("Shape[ m=%s k=%s ]", - numberOfBits, numberOfHashFunctions); + numberOfBits, numberOfHashFunctions); } public double estimateN( int hammingValue ) { @@ -200,259 +200,259 @@ public double estimateN( int 
hammingValue ) { public static class Factory { - /** - * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. - */ - private static final double LN_2 = Math.log(2.0); - - /** - * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. - * - *

ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) - */ - private static final double DENOMINATOR = -LN_2 * LN_2; - - /** - * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the - * specified number of bits ({@code m}) and hash functions ({@code k}). - * - *

The number of items ({@code n}) to be stored in the filter is computed. - *

n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
- * - *

The actual probability will be approximately equal to the - * desired probability but will be dependent upon the calculated Bloom filter capacity - * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @param numberOfBits The number of bits in the filter - * @param numberOfHashFunctions The number of hash functions in the filter - * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; - * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual - * probability is {@code >= 1.0} - */ - public static Shape fromPMK(final double probability, final int numberOfBits, + /** + * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. + */ + private static final double LN_2 = Math.log(2.0); + + /** + * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. + * + *

ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) + */ + private static final double DENOMINATOR = -LN_2 * LN_2; + + /** + * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the + * specified number of bits ({@code m}) and hash functions ({@code k}). + * + *

The number of items ({@code n}) to be stored in the filter is computed. + *

n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))
+ * + *

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated Bloom filter capacity + * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @param numberOfBits The number of bits in the filter + * @param numberOfHashFunctions The number of hash functions in the filter + * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; + * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual + * probability is {@code >= 1.0} + */ + public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { - checkProbability(probability); - checkNumberOfBits(numberOfBits); - checkNumberOfHashFunctions(numberOfHashFunctions); + checkProbability(probability); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); - // Number of items (n): - // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) - final double n = Math.ceil(numberOfBits / + // Number of items (n): + // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) + final double n = Math.ceil(numberOfBits / (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); - // log of probability is always < 0 - // number of hash functions is >= 1 - // e^x where x < 0 = [0,1) - // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 - // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 - // ceil( >0 ) >= 1 - // so we can not produce a negative value thus we don't check for it. - // - // similarly we can not produce a number greater than numberOfBits so we - // do not have to check for Integer.MAX_VALUE either. 
- - - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); - // check that probability is within range - checkCalculatedProbability(shape.getProbability( (int) n )); - return shape; - } + // log of probability is always < 0 + // number of hash functions is >= 1 + // e^x where x < 0 = [0,1) + // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 + // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 + // ceil( >0 ) >= 1 + // so we can not produce a negative value thus we don't check for it. + // + // similarly we can not produce a number greater than numberOfBits so we + // do not have to check for Integer.MAX_VALUE either. + + + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( (int) n )); + return shape; + } - /** - * Constructs a filter configuration with the specified number of items ({@code n}) and - * desired false-positive probability ({@code p}). - * - *

The number of bits ({@code m}) for the filter is computed. - *

m = ceil(n * ln(p) / ln(1 / 2^ln(2)))
- * - *

The optimal number of hash functions ({@code k}) is computed. - *

k = round((m / n) * ln(2))
- * - *

The actual probability will be approximately equal to the - * desired probability but will be dependent upon the calculated number of bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param numberOfItems Number of items to be placed in the filter - * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability - * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public static Shape fromNP (final int numberOfItems, final double probability) { - checkNumberOfItems(numberOfItems); - checkProbability(probability); - - // Number of bits (m) - final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); - if (m > Integer.MAX_VALUE) { - throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); - } - int numberOfBits = (int) m; - - int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); - // check that probability is within range - checkCalculatedProbability(shape.getProbability( numberOfItems )); - return shape; + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * desired false-positive probability ({@code p}). + * + *

The number of bits ({@code m}) for the filter is computed. + *

m = ceil(n * ln(p) / ln(1 / 2^ln(2)))
+ * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The actual probability will be approximately equal to the + * desired probability but will be dependent upon the calculated number of bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability + * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} + * @see #getProbability() + */ + public static Shape fromNP (final int numberOfItems, final double probability) { + checkNumberOfItems(numberOfItems); + checkProbability(probability); + + // Number of bits (m) + final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); + if (m > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); } + int numberOfBits = (int) m; - /** - * Constructs a filter configuration with the specified number of items ({@code n}) and - * bits ({@code m}). - * - *

The optimal number of hash functions ({@code k}) is computed. - *

k = round((m / n) * ln(2))
- * - *

The false-positive probability is computed using the number of items, bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param numberOfItems Number of items to be placed in the filter - * @param numberOfBits The number of bits in the filter - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if the calculated number of hash function is {@code < 1}; - * or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public static Shape fromNM(final int numberOfItems, final int numberOfBits) { - checkNumberOfItems(numberOfItems); - checkNumberOfBits(numberOfBits); - int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); - // check that probability is within range - checkCalculatedProbability(shape.getProbability( numberOfItems )); - return shape; - } + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( numberOfItems )); + return shape; + } - /** - * Constructs a filter configuration with the specified number of items, bits - * and hash functions. - * - *

The false-positive probability is computed using the number of items, bits and hash - * functions. An exception is raised if this is greater than or equal to 1 (i.e. the - * shape is invalid for use as a Bloom filter). - * - * @param numberOfItems Number of items to be placed in the filter - * @param numberOfBits The number of bits in the filter. - * @param numberOfHashFunctions The number of hash functions in the filter - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} - * @see #getProbability() - */ - public static Shape fromNMK (final int numberOfItems, final int numberOfBits, + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * bits ({@code m}). + * + *

The optimal number of hash functions ({@code k}) is computed. + *

k = round((m / n) * ln(2))
+ * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param numberOfBits The number of bits in the filter + * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; + * if the calculated number of hash function is {@code < 1}; + * or if the actual probability is {@code >= 1.0} + * @see #getProbability() + */ + public static Shape fromNM(final int numberOfItems, final int numberOfBits) { + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( numberOfItems )); + return shape; + } + + /** + * Constructs a filter configuration with the specified number of items, bits + * and hash functions. + * + *

The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfItems Number of items to be placed in the filter + * @param numberOfBits The number of bits in the filter. + * @param numberOfHashFunctions The number of hash functions in the filter + * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; + * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} + * @see #getProbability() + */ + public static Shape fromNMK (final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { - checkNumberOfItems(numberOfItems); - checkNumberOfBits(numberOfBits); - checkNumberOfHashFunctions(numberOfHashFunctions); - // check that probability is within range - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); - // check that probability is within range - checkCalculatedProbability(shape.getProbability( numberOfItems )); - return shape; - } + checkNumberOfItems(numberOfItems); + checkNumberOfBits(numberOfBits); + checkNumberOfHashFunctions(numberOfHashFunctions); + // check that probability is within range + Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + // check that probability is within range + checkCalculatedProbability(shape.getProbability( numberOfItems )); + return shape; + } - /** - * Check number of items is strictly positive. - * - * @param numberOfItems the number of items - * @return the number of items - * @throws IllegalArgumentException if the number of items is {@code < 1} - */ - private static int checkNumberOfItems(final int numberOfItems) { - if (numberOfItems < 1) { - throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); - } - return numberOfItems; + /** + * Check number of items is strictly positive. 
+ * + * @param numberOfItems the number of items + * @return the number of items + * @throws IllegalArgumentException if the number of items is {@code < 1} + */ + private static int checkNumberOfItems(final int numberOfItems) { + if (numberOfItems < 1) { + throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); } + return numberOfItems; + } - /** - * Check number of bits is strictly positive. - * - * @param numberOfBits the number of bits - * @return the number of bits - * @throws IllegalArgumentException if the number of bits is {@code < 1} - */ - private static int checkNumberOfBits(final int numberOfBits) { - if (numberOfBits < 1) { - throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); - } - return numberOfBits; + /** + * Check number of bits is strictly positive. + * + * @param numberOfBits the number of bits + * @return the number of bits + * @throws IllegalArgumentException if the number of bits is {@code < 1} + */ + private static int checkNumberOfBits(final int numberOfBits) { + if (numberOfBits < 1) { + throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); } + return numberOfBits; + } - /** - * Check number of hash functions is strictly positive - * - * @param numberOfHashFunctions the number of hash functions - * @return the number of hash functions - * @throws IllegalArgumentException if the number of hash functions is {@code < 1} - */ - private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { - if (numberOfHashFunctions < 1) { - throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); - } - return numberOfHashFunctions; + /** + * Check number of hash functions is strictly positive + * + * @param numberOfHashFunctions the number of hash functions + * @return the number of hash functions + * @throws IllegalArgumentException if the number of hash functions 
is {@code < 1} + */ + private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); } + return numberOfHashFunctions; + } - /** - * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. - * - * @param probability the probability - * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} - */ - private static void checkProbability(final double probability) { - // Using the negation of within the desired range will catch NaN - if (!(probability > 0.0 && probability < 1.0)) { - throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); - } + /** + * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. + * + * @param probability the probability + * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} + */ + private static void checkProbability(final double probability) { + // Using the negation of within the desired range will catch NaN + if (!(probability > 0.0 && probability < 1.0)) { + throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); } + } - /** - * Check the calculated probability is {@code < 1.0}. - * - *

This function is used to verify that the dynamically calculated probability for the - * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon - * construction. - * - * @param probability the probability - * @throws IllegalArgumentException if the probability is {@code >= 1.0} - */ - private static void checkCalculatedProbability(final double probability) { - // We do not need to check for p <= 0.0 since we only allow positive values for - // parameters and the closest we can come to exp(-kn/m) == 1 is - // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will - // always be 00 - if (probability >= 1.0) { - throw new IllegalArgumentException( + /** + * Check the calculated probability is {@code < 1.0}. + * + *

This function is used to verify that the dynamically calculated probability for the + * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon + * construction. + * + * @param probability the probability + * @throws IllegalArgumentException if the probability is {@code >= 1.0} + */ + private static void checkCalculatedProbability(final double probability) { + // We do not need to check for p <= 0.0 since we only allow positive values for + // parameters and the closest we can come to exp(-kn/m) == 1 is + // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will + // always be 00 + if (probability >= 1.0) { + throw new IllegalArgumentException( String.format("Calculated probability is greater than or equal to 1: " + probability)); - } } + } - /** - * Calculates the number of hash functions given numberOfItems and numberofBits. - * This is a method so that the calculation is consistent across all constructors. - * - * @param numberOfItems the number of items in the filter. - * @param numberOfBits the number of bits in the filter. - * @return the optimal number of hash functions. - * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} - */ - private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { - // k = round((m / n) * ln(2)) We change order so that we use real math rather - // than integer math. - final long k = Math.round(LN_2 * numberOfBits / numberOfItems); - if (k < 1) { - throw new IllegalArgumentException( + /** + * Calculates the number of hash functions given numberOfItems and numberofBits. + * This is a method so that the calculation is consistent across all constructors. + * + * @param numberOfItems the number of items in the filter. + * @param numberOfBits the number of bits in the filter. + * @return the optimal number of hash functions. 
+ * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} + */ + private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { + // k = round((m / n) * ln(2)) We change order so that we use real math rather + // than integer math. + final long k = Math.round(LN_2 * numberOfBits / numberOfItems); + if (k < 1) { + throw new IllegalArgumentException( String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); - } - // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but - // since numberOfBits is at most Integer.MAX_VALUE the numerator of - // numberofHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the - // value of k can not be above Integer.MAX_VALUE. - return (int) k; } + // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but + // since numberOfBits is at most Integer.MAX_VALUE the numerator of + // numberofHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the + // value of k can not be above Integer.MAX_VALUE. 
+ return (int) k; + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index b194b8b095..a17b9ca6ee 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -16,7 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.Arrays; import java.util.BitSet; import java.util.Objects; import java.util.function.IntConsumer; @@ -108,4 +107,4 @@ public void forEachBitMap(LongConsumer consumer) { } } - } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 3f369889d3..7b3a4028c4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -16,23 +16,14 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.Arrays; -import java.util.BitSet; -import java.util.Collections; -import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; -import java.util.Set; import java.util.TreeSet; import java.util.function.IntConsumer; import java.util.function.LongConsumer; -import javax.swing.event.ListSelectionEvent; - import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; /** * A bloom filter using a Java BitSet to track enabled bits. 
This is a standard @@ -150,4 +141,4 @@ public void forEachBitMap(LongConsumer consumer) { } - } +} From f3be5fa3a6233087ae6765e1abf7e019e725f2de Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 4 Oct 2021 17:29:27 +0100 Subject: [PATCH 05/27] added license headers --- .../bloomfilter/BitCountProducer.java | 16 ++++++++++++++++ .../commons/collections4/bloomfilter/BitMap.java | 16 ++++++++++++++++ .../collections4/bloomfilter/BitMapProducer.java | 16 ++++++++++++++++ .../collections4/bloomfilter/IndexProducer.java | 16 ++++++++++++++++ .../collections4/bloomfilter/BitMaptTest.java | 16 ++++++++++++++++ 5 files changed, 80 insertions(+) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java index bd2ccf95e3..089dbeb50d 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.commons.collections4.bloomfilter; import java.util.Set; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index dfab37c00c..793279e35b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.commons.collections4.bloomfilter; /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index dd0651bac1..8cffa0095e 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.commons.collections4.bloomfilter; import java.util.Arrays; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index 5b1816423b..fda053eb86 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.commons.collections4.bloomfilter; import java.util.function.IntConsumer; diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java index b0dd34658e..92e85fedce 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.commons.collections4.bloomfilter; import static org.junit.Assert.assertEquals; From 4f5c8f3b553cbc33c3efcf4a01da5f4f448038da Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Wed, 6 Oct 2021 20:09:52 +0100 Subject: [PATCH 06/27] Refactored and cleaned up Moved to dependency on BitMapProducer, IndexProducer and BitCountProducer to retrieve internal representations of the data. 
--- .../bloomfilter/ArrayCountingBloomFilter.java | 179 +----- .../bloomfilter/BitCountProducer.java | 79 +-- .../bloomfilter/BitMapProducer.java | 73 ++- .../collections4/bloomfilter/BloomFilter.java | 200 ++---- .../bloomfilter/IndexProducer.java | 34 + .../bloomfilter/SetOperations.java | 12 +- .../collections4/bloomfilter/Shape.java | 11 +- .../bloomfilter/SimpleBloomFilter.java | 84 ++- .../bloomfilter/SparseBloomFilter.java | 42 +- .../exceptions/NoMatchException.java | 35 + .../bloomfilter/exceptions/package-info.java | 1 + .../bloomfilter/hasher/Hasher.java | 16 +- .../bloomfilter/hasher/HasherCollection.java | 95 +-- .../bloomfilter/hasher/SimpleHasher.java | 94 ++- .../bloomfilter/AbstractBloomFilterTest.java | 18 +- .../AbstractCountingBloomFilterTest.java | 57 +- .../ArrayCountingBloomFilterTest.java | 15 +- .../bloomfilter/BitMapProducerTest.java | 54 ++ .../{BitMaptTest.java => BitMapTest.java} | 4 +- .../bloomfilter/SetOperationsTest.java | 5 - .../bloomfilter/ShapeFactoryTest.java | 4 - .../collections4/bloomfilter/ShapeTest.java | 597 +++++++++--------- .../bloomfilter/SimpleBloomFilterTest.java | 2 +- .../bloomfilter/SparseBloomFilterTest.java | 4 +- .../hasher/HasherCollectionTest.java | 22 +- .../bloomfilter/hasher/SimpleHasherTest.java | 20 +- 26 files changed, 783 insertions(+), 974 deletions(-) create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitMaptTest.java => BitMapTest.java} (98%) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index c542376f55..2df6ae787d 100644 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -16,15 +16,12 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.BitSet; -import java.util.NoSuchElementException; import java.util.Objects; -import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; import java.util.function.IntConsumer; import java.util.function.LongConsumer; import java.util.stream.IntStream; +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** @@ -39,7 +36,7 @@ * in {@link #isValid()} for details. * *

All the operations in the filter assume the counts are currently valid, - * for example cardinality or contains operations. Behaviour of an invalid + * for example cardinality or contains operations. Behavior of an invalid * filter is undefined. It will no longer function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added * to and not later subtracted from the counting Bloom filter. @@ -56,6 +53,9 @@ */ public class ArrayCountingBloomFilter implements CountingBloomFilter { + /** + * The shape of this Bloom filter. + */ private final Shape shape; /** @@ -89,50 +89,6 @@ public class ArrayCountingBloomFilter implements CountingBloomFilter { */ private int state; - /** - * An iterator of all indexes with non-zero counts. - * - *

In the event that the filter state is invalid any index with a negative count - * will also be produced by the iterator. - */ - private class IndexIterator implements PrimitiveIterator.OfInt { - /** The next non-zero index (or counts.length). */ - private int next; - - /** - * Create an instance. - */ - IndexIterator() { - advance(); - } - - /** - * Advance to the next non-zero index. - */ - void advance() { - while (next < counts.length && counts[next] == 0) { - next++; - } - } - - @Override - public boolean hasNext() { - return next < counts.length; - } - - @Override - public int nextInt() { - if (hasNext()) { - final int result = next++; - advance(); - return result; - } - // Currently unreachable as the iterator is only used by - // the StaticHasher which iterates correctly. - throw new NoSuchElementException(); - } - } - /** * Constructs an empty counting Bloom filter with the specified shape. * @@ -155,60 +111,10 @@ public int cardinality() { return (int) IntStream.range( 0, counts.length ).filter( i -> counts[i] > 0 ).count(); } - @Override - public boolean contains(final BloomFilter other) { - Objects.requireNonNull( other, "other"); - try { - other.forEachIndex( idx -> {if ( this.counts[idx] == 0 ) { throw new ArrayCountingBloomFilter.NoMatchException(); }} ); - } catch (NoMatchException e) { - return false; - } - return true; - } - - @Override - public boolean contains(final Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); - return contains(hasher.iterator(getShape())); - } - /** - * Return true if this filter is has non-zero counts for each index in the iterator. - * - * @param iter the iterator - * @return true if this filter contains all the indexes + * Clones the filter. Used to create merged values. + * @return A clone of this filter. 
*/ - private boolean contains(final OfInt iter) { - while (iter.hasNext()) { - if (counts[iter.nextInt()] == 0) { - return false; - } - } - return true; - } - - @Override - public long[] getBits() { - final BitSet bs = new BitSet(); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0) { - bs.set(i); - } - } - return bs.toLongArray(); - } - - /** - * Returns an iterator over the enabled indexes in this filter. - * Any index with a non-zero count is considered enabled. - * The iterator returns indexes in their natural order. - * - * @return an iterator over the enabled indexes - */ - private PrimitiveIterator.OfInt iterator() { - return new IndexIterator(); - } - protected ArrayCountingBloomFilter makeClone() { ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); filter.add( this ); @@ -220,7 +126,7 @@ protected ArrayCountingBloomFilter makeClone() { public CountingBloomFilter merge(BloomFilter other) { Objects.requireNonNull( other, "other"); CountingBloomFilter filter = makeClone(); - filter.add( BitCountProducer.Factory.simple( other )); + filter.add( BitCountProducer.from(other)); return filter; } @@ -228,34 +134,32 @@ public CountingBloomFilter merge(BloomFilter other) { public CountingBloomFilter merge(Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); ArrayCountingBloomFilter filter = makeClone(); - filter.mergeInPlace( hasher ); + filter.add( BitCountProducer.from( hasher.indices(shape))); return filter; } @Override public boolean mergeInPlace(final BloomFilter other) { Objects.requireNonNull( other, "other"); - return add( BitCountProducer.Factory.simple(other) ); + return add( BitCountProducer.from(other) ); } @Override public boolean mergeInPlace(final Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); - hasher.forEach( h -> add( BitCountProducer.Factory.from( shape, h ))); - return isValid(); + return add( BitCountProducer.from( hasher.indices(shape))); } @Override public boolean remove(final BloomFilter 
other) { Objects.requireNonNull( other, "other"); - return subtract( BitCountProducer.Factory.simple(other)); + return subtract( BitCountProducer.from(other)); } @Override public boolean remove(final Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); - hasher.forEach( h -> subtract( BitCountProducer.Factory.from( shape, h ))); - return isValid(); + return subtract( BitCountProducer.from( hasher.indices(shape))); } @Override @@ -314,12 +218,7 @@ public void forEachIndex(IntConsumer consumer) { @Override public void forEachBitMap(LongConsumer consumer) { Objects.requireNonNull( consumer, "consumer"); - if (cardinality() == 0) { - return; - } - BitMapBuilder builder = new BitMapBuilder( consumer ); - forEachIndex( builder ); - builder.finish(); + BitMapProducer.fromIndexProducer( this, shape).forEachBitMap(consumer); } /** @@ -346,55 +245,25 @@ protected void subtract(final int idx, final int subtrahend) { counts[idx] = updated; } - @Override - public int[] getIndices() { - return IntStream.range( 0, counts.length ).filter( i -> counts[i] > 0 ).toArray(); - } @Override public Shape getShape() { return shape; } - private static class BitMapBuilder implements IntConsumer { - - LongConsumer consumer; - long bucket = 0; - long bucektIdx=0; - - BitMapBuilder( LongConsumer consumer ) { - this.consumer = consumer; - } - - @Override - public void accept( int i ) { - int nextIndex = BitMap.getLongIndex( i ); - while (nextIndex > bucektIdx) - { - consumer.accept(bucket); - bucket =0; - bucektIdx++; - } - bucket |= BitMap.getLongBit( i ); - } - - public void finish() { - if (bucket != 0) { - consumer.accept( bucket ); - } + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex( idx -> {if ( this.counts[idx] == 0 ) { throw new NoMatchException(); }} ); + } catch (NoMatchException e) { + return false; } + return true; } - /** - * An exception throwns when no match was found in the byte buffer. 
- * - */ - private class NoMatchException extends RuntimeException { - - public NoMatchException() { - super(); - } - + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains( IndexProducer.fromBitMapProducer(bitMapProducer)); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java index 089dbeb50d..844942db97 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -16,17 +16,13 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.Set; -import java.util.TreeSet; -import java.util.function.Consumer; - -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import java.util.function.IntConsumer; /** * Produces bit counts for counting type Bloom filters. * */ -public interface BitCountProducer { +public interface BitCountProducer extends IndexProducer { /** * Performs the given action for each {@code } pair where the count is non-zero. @@ -39,64 +35,25 @@ public interface BitCountProducer { */ void forEachCount(BitCountConsumer consumer); + @Override + default void forEachIndex(IntConsumer consumer) { + forEachCount( (i,v) -> consumer.accept( i )); + } + /** - * Factory to construct BitCountProducers from common Bloom filter and Hashers. - * + * Creates a BitCountProducer from an IndexProducer. The resulting + * producer will count each enabled bit once. + * @param idx An index producer. + * @return A BitCountProducer with the same indices as the IndexProducer. */ - public static class Factory { - /** - * Creates a BitCountProducer from a bloom filter. - * - * If the filter implements the BitCountProducer it is returned unchanged. - * If the filter does not implement the BitCountProducer each enabled bit is - * returned with a count of one (1). 
- * - * @param filter the Bloom filter to count. - * @return The BitCountProducer for the Bloom filter. - */ - public static BitCountProducer from( BloomFilter filter ) { - return (filter instanceof BitCountProducer) ? (BitCountProducer) filter : simple( filter ); - } - - /** - * Create a BitCountProducer from a bloom filter without regard to previous BitCountProducer - * implementation. - * - * for each enabled bit a count of 1 is returned. - * - * @param filter The Bloom filter to create the BitCountProducer from. - * @return the BitCountProducer for the Bloom filter. - */ - public static BitCountProducer simple( BloomFilter filter ) { - return new BitCountProducer() { - - @Override - public void forEachCount(BitCountConsumer consumer) { - for (int i : filter.getIndices() ) - { - consumer.accept(i, 1); - } - } - }; - } - - /** - * Creates a Bit count producer from a shape and hasher. - * @param shape The shape to use - * @param hasher the hasher to use. - * @return A BitCountProducer for the hasher produced values. 
- */ - public static BitCountProducer from( Shape shape, Hasher hasher ) { - return new BitCountProducer() { + public static BitCountProducer from( IndexProducer idx ) { + return new BitCountProducer() { + @Override + public void forEachCount(BitCountConsumer consumer) { + idx.forEachIndex( i -> consumer.accept(i, 1 ) ); + } - @Override - public void forEachCount(BitCountConsumer consumer) { - final Set distinct = new TreeSet<>(); - hasher.iterator(shape).forEachRemaining((Consumer) distinct::add ); - distinct.forEach( i -> consumer.accept(i, 1)); - } - }; - } + }; } /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index 8cffa0095e..b371472c31 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -17,10 +17,12 @@ package org.apache.commons.collections4.bloomfilter; import java.util.Arrays; +import java.util.Objects; +import java.util.function.IntConsumer; import java.util.function.LongConsumer; /** - * Produces bit map longs for a Bloom filter. + * Produces BitMap longs for a Bloom filter. * * Each bit map is a little-endian long value representing a block of bits of this filter. * @@ -43,6 +45,42 @@ public interface BitMapProducer { */ void forEachBitMap(LongConsumer consumer); + /** + * Creates a BitMapProducer from an IndexProducer. + * @param producer the IndexProducer that specifies the indexes of the bits to enable. + * @param shape the desired shape. + * @return A BitMapProducer that produces the BitMap equivalent of the Indices from the producer. 
+ */ + public static BitMapProducer fromIndexProducer( IndexProducer producer, Shape shape ) { + + return new BitMapProducer() { + private int maxBucket = -1; + private long[] result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits())]; + + @Override + public void forEachBitMap(LongConsumer consumer) { + /* we can not assume that all the processes ints will be in order + * and not repeated. This is because the HasherCollection does + * not make the guarantee. + */ + // process all the ints into a array of BitMaps + IntConsumer builder = new IntConsumer() { + @Override + public void accept( int i ) { + int bucketIdx = BitMap.getLongIndex( i ); + maxBucket = maxBucket < bucketIdx ? bucketIdx : maxBucket; + result[bucketIdx] |= BitMap.getLongBit(i); + } + }; + producer.forEachIndex( builder ); + // send the bitmaps to the consumer. + for (int bucket=0;bucket<=maxBucket;bucket++) { + consumer.accept( result[bucket] ); + } + } + }; + } + /** * A LongConsumer that builds an Array of BitMaps as produced by a BitMapProducer. * @@ -50,25 +88,48 @@ public interface BitMapProducer { public class ArrayBuilder implements LongConsumer { private long[] result; private int idx=0; + private int bucketCount=0; /** * Constructor. * @param shape The shape used to generate the BitMaps. */ public ArrayBuilder( Shape shape ) { + this( shape, null ); + } + + /** + * Constructor. + * @param shape The shape used to generate the BitMaps. + * @param initialValue an array of BitMap values to initialize the builder with. May be {@code null}. + * @throws IllegalArgumentException is the length of initialValue is greater than the number of + * buckets as specified by the number of bits in the Shape. 
+ */ + public ArrayBuilder( Shape shape, long[] initialValue ) { + Objects.requireNonNull( shape, "shape"); result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits() )]; + if (initialValue != null) { + if (initialValue.length > result.length) { + throw new IllegalArgumentException( String.format( + "initialValue length (%s) is longer than shape length (%s)", initialValue.length, result.length)); + } + bucketCount = initialValue.length; + System.arraycopy(initialValue, 0, result, 0, bucketCount); + } } + @Override public void accept(long bitmap) { - result[idx++] = bitmap; + result[idx++] |= bitmap; + bucketCount = bucketCount>=idx?bucketCount:idx; } /** - * Trims the resulting array so that there are no trailing empty BitMaps - * @return + * Returns the array. + * @return the Array of BitMaps. */ - public long[] trim() { - return Arrays.copyOf( result, idx ); + public long[] getArray() { + return Arrays.copyOf( result, bucketCount ); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index 7b72827ea5..16aa62f121 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -16,11 +16,11 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.Arrays; +import java.util.ArrayList; +import java.util.List; import java.util.NoSuchElementException; import java.util.Objects; import java.util.PrimitiveIterator; -import java.util.function.IntConsumer; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; @@ -30,58 +30,42 @@ */ public interface BloomFilter extends IndexProducer, BitMapProducer { - // Query Operations - /** - * This method is used to determine the best mechod for matching. For `sparse` implementations the `getIndices()` - * method is more efficient. 
Implementers should determine if it is easier for the implementation to return am array of - * Indices (sparse) or a bit map as an array of unsigned longs. - * @return + * Return the Bloom filter data as a BitMap array. + * @param filter the filter to get the data from. + * @return An array of BitMap long. */ - boolean isSparse(); + public static long[] asBitMapArray( BloomFilter filter ) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(filter.getShape()); + filter.forEachBitMap( builder ); + return builder.getArray(); + } /** - * Gets an array of little-endian long values representing the bits of this filter. - * - *

The returned array will have length {@code ceil(m / 64)} where {@code m} is the - * number of bits in the filter and {@code ceil} is the ceiling function. - * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit - * index is enabled. - * - * @return the {@code long[]} representation of this filter + * Return the Bloom filter data as an array of indices for the enabled bits. + * @param filter the Filter to get the data from. + * @return An array of indices for enabled bits in the Bloom filter. */ - default long[] getBits() { + public static int[] asIndexArray( BloomFilter filter ) { + List lst = new ArrayList(); + filter.forEachIndex( lst::add ); + return lst.stream().mapToInt( Integer::intValue ).toArray(); + } - if (cardinality() == 0) { - return new long[0]; - } - BitMapProducer.ArrayBuilder consumer = new BitMapProducer.ArrayBuilder(getShape()); - forEachBitMap( consumer ); - return consumer.trim(); - } + // Query Operations /** - * Gets an array of indices of bits that are enabled. - * Array must be in sorted order. - * @return an array of indices for bits that are enabled in the filter. + * This method is used to determine the best method for matching. For `sparse` implementations the `getIndices()` + * method is more efficient. Implementers should determine if it is easier for the implementation to return am array of + * Indices (sparse) or a bit map as an array of unsigned longs. + * @return */ - default int[] getIndices() { - int[] result = new int[ cardinality() ]; - IntConsumer consumer = new IntConsumer() { - int idx = 0; - @Override - public void accept(int i) { - result[idx++] = i; - } - }; - forEachIndex( consumer ); - return result; - } + boolean isSparse(); /** * Gets the shape that was used when the filter was built. - * @return The shape the flter was built with. + * @return The shape the filter was built with. 
*/ Shape getShape(); @@ -96,50 +80,14 @@ public void accept(int i) { */ default boolean contains(BloomFilter other) { Objects.requireNonNull( other, "other"); - if (isSparse()) { - int[] myIndicies = getIndices(); - if (other.isSparse()) { - int[] otherIndicies = other.getIndices(); - if (otherIndicies.length > myIndicies.length) { - return false; - } - return Arrays.stream( otherIndicies ).allMatch( i -> Arrays.binarySearch( myIndicies, i) >= 0); - } else { - BitIterator iter = new BitIterator( other.getBits() ); - while (iter.hasNext()) - { - if (Arrays.binarySearch( myIndicies, iter.next()) < 0) { - return false; - } - } - return true; - } - } else { - long[] myBits = getBits(); - if (other.isSparse()) { - return Arrays.stream( other.getIndices() ).allMatch( i -> BitMap.contains( myBits, i )); - } else { - long[] otherBits = other.getBits(); - if (myBits.length != otherBits.length) - { - return false; - } - for (int i=0;i=64) - { - offset=0; - bucket++; - } - if (bucket < bits.length && 0 != (bits[bucket] & (1L << offset))) { - next = (bucket*64)+offset; - } - } - return next >= 0; - } - - - @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. - @Override - public int nextInt() { - if (hasNext()) { - try { - return next; - } finally { - next = -1; - } - } - throw new NoSuchElementException(); - } - } - } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index fda053eb86..c10ad18e87 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -16,8 +16,14 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Objects; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; +/** + * An object that produces indices for or of a Bloom filter. 
+ * + */ public interface IndexProducer { /** @@ -29,4 +35,32 @@ public interface IndexProducer { */ void forEachIndex(IntConsumer consumer); + /** + * Creates an IndexProducer from a @{code BitMapProducer}. + * @param producer the @{code BitMapProducer} + * @return a new @{code IndexProducer}. + */ + public static IndexProducer fromBitMapProducer( BitMapProducer producer ) { + Objects.requireNonNull( producer, "producer"); + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + LongConsumer longConsumer = new LongConsumer(){ + int wordIdx = 0; + @Override + public void accept(long word) { + for (int i = 0;i<64;i++) + { + long mask = 1L< bitSet.set(i)); + this.shape = shape; + + BitMapProducer producer = BitMapProducer.fromIndexProducer( hasher.indices(shape), shape); + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); + producer.forEachBitMap( builder ); + this.bitMap = builder.getArray(); + this.cardinality = 0; + forEachBitMap( w -> this.cardinality += Long.bitCount(w)); } @Override public boolean mergeInPlace(BloomFilter other) { Objects.requireNonNull( other, "other"); - if (other.isSparse()) { - other.forEachIndex( bitSet::set ); - } else { - bitSet.or( BitSet.valueOf(other.getBits() )); - } + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape, this.bitMap); + other.forEachBitMap( builder ); + this.bitMap = builder.getArray(); + this.cardinality = 0; + forEachBitMap( w -> this.cardinality += Long.bitCount(w)); return true; } @@ -80,31 +97,48 @@ public boolean isSparse() { @Override public int cardinality() { - return bitSet.cardinality(); - } - - @Override - public long[] getBits() { - return bitSet.toLongArray(); + return this.cardinality; } @Override public void forEachIndex(IntConsumer consumer) { Objects.requireNonNull( consumer, "consumer"); - for (int i = bitSet.nextSetBit(0); i >= 0; i = bitSet.nextSetBit(i+1)) { - consumer.accept(i); - if (i == 
Integer.MAX_VALUE) { - break; // or (i+1) would overflow - } - } + IndexProducer.fromBitMapProducer(this).forEachIndex(consumer); } @Override public void forEachBitMap(LongConsumer consumer) { Objects.requireNonNull( consumer, "consumer"); - for ( long l : getBits() ) { + for ( long l : bitMap ) { consumer.accept(l); } } + @Override + public boolean contains(IndexProducer indexProducer) { + return contains( BitMapProducer.fromIndexProducer(indexProducer, shape)); + } + + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + LongConsumer consumer = new LongConsumer() { + int i=0; + @Override + public void accept(long w) { + if ((bitMap[i++] & w) != w) + { throw new NoMatchException(); + } + }}; + try { + bitMapProducer.forEachBitMap( consumer ); + return true; + } + catch(NoMatchException e) + { + return false; + } + + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 7b3a4028c4..7dc140b3fa 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -18,16 +18,16 @@ import java.util.List; import java.util.Objects; -import java.util.PrimitiveIterator; import java.util.TreeSet; import java.util.function.IntConsumer; import java.util.function.LongConsumer; +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** - * A bloom filter using a Java BitSet to track enabled bits. This is a standard - * implementation and should work well for most Bloom filters. + * A bloom filter using a TreeSet of integers to track enabled bits. This is a standard + * implementation and should work well for most low cardinality Bloom filters. 
* @since 4.5 */ public class SparseBloomFilter implements BloomFilter { @@ -56,7 +56,7 @@ public SparseBloomFilter(Shape shape) { public SparseBloomFilter(final Shape shape, Hasher hasher) { this( shape ); Objects.requireNonNull( hasher, "hasher"); - hasher.forEach( h -> h.iterator(shape).forEachRemaining( (IntConsumer) indices::add )); + hasher.indices(shape).forEachIndex( this.indices::add ); } /** @@ -80,10 +80,7 @@ public SparseBloomFilter(Shape shape, List indices) { @Override public boolean mergeInPlace(Hasher hasher) { Objects.requireNonNull( hasher, "hasher"); - PrimitiveIterator.OfInt iter = hasher.iterator(shape); - while (iter.hasNext()) { - indices.add( iter.next() ); - } + hasher.indices(shape).forEachIndex( this.indices::add ); return true; } @@ -123,22 +120,23 @@ public void forEachBitMap(LongConsumer consumer) { if (cardinality() == 0) { return; } - long bucket = 0; - long bucektIdx=0; - for (int i : indices ) { - int nextIndex = BitMap.getLongIndex( i ); - while (nextIndex > bucektIdx) - { - consumer.accept(bucket); - bucket =0; - bucektIdx++; - } - bucket |= BitMap.getLongBit( i ); - } - if (bucket != 0) { - consumer.accept( bucket ); + BitMapProducer.fromIndexProducer( this, shape).forEachBitMap(consumer); + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex( idx -> { if (!indices.contains(idx)) { throw new NoMatchException(); }}); + return true; + } catch (NoMatchException e) { + return false; } } + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains( IndexProducer.fromBitMapProducer(bitMapProducer)); + } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java new file mode 100644 index 0000000000..c14b984145 --- /dev/null +++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.exceptions; + +/** + * An exception to short circuit Bloom filter match functionality using producers. + * + */ +public class NoMatchException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = 1L; + + /** + * Constructor. 
+ */ + public NoMatchException() { + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java new file mode 100644 index 0000000000..15120df803 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java @@ -0,0 +1 @@ +package org.apache.commons.collections4.bloomfilter.exceptions; \ No newline at end of file diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index d695a26449..1250f02877 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -16,11 +16,10 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; -import java.util.PrimitiveIterator; import java.util.function.Consumer; import org.apache.commons.collections4.bloomfilter.Shape; -import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; /** * A Hasher represents items of arbitrary byte size as a byte representation of @@ -49,22 +48,19 @@ public interface Hasher { /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. + * Creates an IndexProducer that for this hasher based on the Shape. * - *

The iterator will create indexes within the range defined by the number of bits in - * the shape. The total number of indexes will respect the number of hash functions per item + *

The iterator will create indices within the range defined by the number of bits in + * the shape. The total number of indices will respect the number of hash functions per item * defined by the shape. However the count of indexes may not be a multiple of the number of * hash functions if the implementation has removed duplicates. * *

No guarantee is made as to order of values. * - * @param shape the shape of the desired Bloom filter + * @param shape the shape of the desired Bloom filter. * @return the iterator of integers - * @throws IllegalArgumentException if the hasher cannot generate indexes for - * the specified @{@code shape} */ - PrimitiveIterator.OfInt iterator(Shape shape); + IndexProducer indices(Shape shape); /** * Gets the number of items that will be hashed by the iterator. diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index 7b5572cac8..c4d7f9da87 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -20,11 +20,10 @@ import java.util.Arrays; import java.util.Collection; import java.util.List; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; +import java.util.Objects; import java.util.function.Consumer; -import java.util.stream.Collectors; - +import java.util.function.IntConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; /** @@ -33,12 +32,10 @@ */ public class HasherCollection implements Hasher { - /** - * The list of hashers to be used to generate the iterator. - * Package private for access by the iterator. + * The list of hashers to be used to generate the indices. */ - final List hashers; + private final List hashers; /** * Constructs an empty HasherCollection. @@ -48,16 +45,17 @@ public HasherCollection() { } /** - * Constructs a DynamicHasher. + * Constructs a HasherCollection from a collection of Hasher objects. * - * @param hashers A collections of Hashers to build the iterator with. + * @param hashers A collections of Hashers to build the indices with. 
*/ public HasherCollection(final Collection hashers) { + Objects.requireNonNull( hashers, "hashers"); this.hashers = new ArrayList<>(hashers); } /** - * Constructs a DynamicHasher. + * Constructor. * * @param function the function to use. * @param buffers the byte buffers that will be hashed. @@ -66,17 +64,35 @@ public HasherCollection(Hasher... hashers) { this( Arrays.asList(hashers)); } + /** + * Adds a hasher to the collection. + * @param hasher The hasher to add. + */ public void add(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); hashers.add(hasher); } + /** + * Add all the Hashers in a collection to this HasherCollection. + * @param hashers The hashers to add. + */ public void add(Collection hashers) { + Objects.requireNonNull( hashers, "hashers"); hashers.addAll(hashers); } @Override - public PrimitiveIterator.OfInt iterator(final Shape shape) { - return new Iterator(shape); + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull( shape, "shape"); + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + for (Hasher hasher : hashers) { + hasher.indices( shape ).forEachIndex(consumer); + } + } + }; } @Override @@ -91,60 +107,9 @@ public int size() { @Override public void forEach(Consumer consumer) { + Objects.requireNonNull( consumer, "consumer"); for (Hasher h : this.hashers) { h.forEach(consumer); } } - - - /** - * The iterator of integers. - * - *

This assumes that the list of buffers is not empty. - */ - private class Iterator implements PrimitiveIterator.OfInt { - - /** The iterator over the hashers */ - private final java.util.Iterator wrappedIterator; - - /** The shape of the filter we are createing */ - private final Shape shape; - - /** The iterator over the internal hasher */ - private PrimitiveIterator.OfInt current; - - - /** - * Constructs iterator with the specified shape. - * - * @param shape - */ - private Iterator(final Shape shape) { - this.shape = shape; - wrappedIterator = hashers.iterator(); - current = null; - } - - @Override - public boolean hasNext() { - if (current == null || !current.hasNext()) { - if (wrappedIterator.hasNext()) { - current = wrappedIterator.next().iterator(shape); - } else { - current = null; - } - } - return current != null && current.hasNext(); - } - - @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. - @Override - public int nextInt() { - if (hasNext()) { - return current.nextInt(); - } - throw new NoSuchElementException(); - } - } - } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java index a02f4b500a..9686f53553 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -17,20 +17,31 @@ package org.apache.commons.collections4.bloomfilter.hasher; import java.util.NoSuchElementException; +import java.util.Objects; import java.util.PrimitiveIterator; import java.util.PrimitiveIterator.OfInt; +import java.util.TreeSet; import java.util.function.Consumer; +import java.util.function.IntConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; /** - * A Hasher ithat implemente combinatorial hashing. 
+ * A Hasher that implements combinatorial hashing. * @since 4.5 */ public final class SimpleHasher implements Hasher { + /** + * The initial hash value. + */ private final long initial; + + /** + * The value to increment the hash value by. + */ private final long increment; @@ -46,8 +57,8 @@ public SimpleHasher(long initial, long increment) { /** - * Gets an iterator of integers that are the bits to enable in the Bloom - * filter based on the shape. The iterator will not return the same value multiple + * Gets an IndexProducer that produces indices based on the shape. + * The iterator will not return the same value multiple * times. Values will be returned in ascending order. * * @param shape {@inheritDoc} @@ -55,8 +66,32 @@ public SimpleHasher(long initial, long increment) { * @throws IllegalArgumentException {@inheritDoc} */ @Override - public OfInt iterator(final Shape shape) { - return new Iterator(shape); + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull( shape, "shape"); + + return new IndexProducer() { + /** The number of hash functions per item. */ + private final int k = shape.getNumberOfHashFunctions(); + /** The number of bits in the shape. */ + private final long m = shape.getNumberOfBits(); + + /** The index of the next item. */ + private long next = SimpleHasher.this.initial; + /** The count of hash functions for the current item. 
*/ + private int functionCount = 0; + + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + TreeSet seen = new TreeSet(); + while (functionCount < k) { + seen.add((int) Long.remainderUnsigned( next, m )); + functionCount++; + next += SimpleHasher.this.increment; + } + seen.stream().mapToInt( s -> s.intValue() ).forEach(consumer); + } + }; } @Override @@ -66,54 +101,7 @@ public int size() { @Override public void forEach(Consumer consumer) { + Objects.requireNonNull( consumer, "consumer"); consumer.accept( this ); } - - /** - * The iterator of integers. - * - *

This assumes that the list of buffers is not empty. - */ - private class Iterator implements PrimitiveIterator.OfInt { - /** The number of hash functions per item. */ - private final int k; - /** The number of bits in the shape. */ - private final long m; - - /** The index of the next item. */ - private long next; - /** The count of hash functions for the current item. */ - private int functionCount; - - /** - * Constructs iterator with the specified shape. - * - * @param shape - */ - private Iterator(final Shape shape) { - // Assumes that shape returns non-zero positive values for hash functions and bits - k = shape.getNumberOfHashFunctions(); - m = shape.getNumberOfBits(); - next = SimpleHasher.this.initial; - functionCount = 0; - } - - @Override - public boolean hasNext() { - return functionCount < k; - } - - @SuppressWarnings("cast") // Cast to long to workaround a bug in animal-sniffer. - @Override - public int nextInt() { - if (hasNext()) { - int result = (int) Long.remainderUnsigned( next, m ); - functionCount++; - next += SimpleHasher.this.increment; - return result; - } - throw new NoSuchElementException(); - } - } - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 7901e8f6e1..076b098b17 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -19,18 +19,6 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -import java.util.List; -import java.util.PrimitiveIterator.OfInt; -import 
java.util.function.BiFunction; -import java.util.function.IntConsumer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; - import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; @@ -181,7 +169,7 @@ public void estimateNTest() { public final void constructorTest_Empty() { final BloomFilter bf = createEmptyFilter(shape); - final long[] lb = bf.getBits(); + final long[] lb = BloomFilter.asBitMapArray( bf ); assertEquals(0, lb.length); } @@ -193,7 +181,7 @@ public final void constructorTest_Hasher() { Hasher hasher = new SimpleHasher(0,1); final BloomFilter bf = createFilter(shape, hasher); - final long[] lb = bf.getBits(); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(0x1FFFF, lb[0]); assertEquals(1, lb.length); } @@ -207,7 +195,7 @@ public final void getBitsTest_SpanLong() { final SimpleHasher hasher = new SimpleHasher(63,1); final BloomFilter bf = createFilter(new Shape(2, 72), hasher ); - final long[] lb = bf.getBits(); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(2, lb.length); assertEquals(0x8000000000000000L, lb[0]); assertEquals(0x1, lb[1]); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java index cf64ec0833..8a75bc998d 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -19,20 +19,8 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static 
org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Arrays; import java.util.HashMap; import java.util.Map; -import java.util.concurrent.ThreadLocalRandom; -import java.util.function.BiPredicate; -import java.util.function.Function; -import java.util.function.IntConsumer; -import java.util.function.ToIntBiFunction; - -import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.junit.jupiter.api.Test; /** @@ -55,43 +43,6 @@ public void forEachCount(BitCountProducer.BitCountConsumer consumer) { } }; - -// /** -// * Function to convert int arrays to BloomFilters for testing. -// */ -// private final Function converter = counts -> { -// final BloomFilter testingFilter = new SimpleBloomFilter(shape); -// testingFilter.merge(new FixedIndexesTestHasher(shape, counts)); -// return testingFilter; -// }; - -// @Override -// protected ArrayCountingBloomFilter createEmptyFilter(final Shape shape) { -// return new ArrayCountingBloomFilter(shape); -// } -// -// @Override -// protected ArrayCountingBloomFilter createFilter(final Hasher hasher, final Shape shape) { -// final ArrayCountingBloomFilter result = new ArrayCountingBloomFilter(shape); -// result.merge( hasher ); -// return result; -// } - -// private ArrayCountingBloomFilter createFromCounts(final int[] counts) { -// // Use a dummy filter to add the counts to an empty filter -// final CountingBloomFilter dummy = new ArrayCountingBloomFilter(shape) { -// @Override -// public void forEachCount(final BitCountConsumer action) { -// for (int i = 0; i < counts.length; i++) { -// action.accept(i, counts[i]); -// } -// } -// }; -// final ArrayCountingBloomFilter bf = new ArrayCountingBloomFilter(shape); -// bf.add(dummy); -// return bf; -// } - /** * Assert the counts match the expected values. Values are for indices starting * at 0. Assert the cardinality equals the number of non-zero counts. 
@@ -122,9 +73,9 @@ private static void assertCounts(final CountingBloomFilter bf, final int[] expec public void constructorTest_Hasher_Duplicates() { // bit hasher has duplicates for 11, 12,13,14,15,16, and 17 final CountingBloomFilter bf = createFilter( shape, from1); - bf.add( BitCountProducer.Factory.from( shape , from11) ); + bf.add( BitCountProducer.from( from11.indices(shape)) ); - final long[] lb = bf.getBits(); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(1, lb.length); assertEquals(bigHashValue, lb[0]); @@ -196,7 +147,7 @@ public void addTest_overflow() { @Test public void subtractTest() { final CountingBloomFilter bf1 = createFilter( shape, from1); - bf1.add( BitCountProducer.Factory.from( shape , from11) ); + bf1.add( BitCountProducer.from( from11.indices(shape))); final CountingBloomFilter bf2 = createFilter( shape, from11); @@ -235,7 +186,7 @@ public void subtractTest_underflow() { @Test public void removeTest() { final CountingBloomFilter bf1 = createFilter( shape, from1); - bf1.add( BitCountProducer.Factory.from( shape , from11) ); + bf1.add( BitCountProducer.from( from11.indices(shape))); assertTrue( "Remove should work", bf1.remove(new SimpleBloomFilter( shape, from11)) ); assertFalse( "Should not contain", bf1.contains( from11 )); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index 5b1a83e8b6..0113e1ba52 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -16,20 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import 
java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.ThreadLocalRandom; -import java.util.function.BiPredicate; -import java.util.function.Function; -import java.util.function.ToIntBiFunction; - import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.junit.jupiter.api.Test; /** * Tests for the {@link ArrayCountingBloomFilter}. @@ -44,7 +31,7 @@ protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { @Override protected ArrayCountingBloomFilter createFilter(Shape shape, Hasher hasher) { ArrayCountingBloomFilter filter = createEmptyFilter( shape ); - filter.add( BitCountProducer.Factory.from(shape, hasher)); + filter.add( BitCountProducer.from( hasher.indices(shape))); return filter; } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java new file mode 100644 index 0000000000..5d26a8fdc5 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertTrue; +import java.util.ArrayList; +import java.util.List; +import java.util.function.IntConsumer; + +import org.junit.Test; + +public class BitMapProducerTest { + + @Test + public void fromIndexProducer() { + IndexProducer iProducer = new IndexProducer() { + + @Override + public void forEachIndex(IntConsumer consumer) { + consumer.accept( 0 ); + consumer.accept( 1 ); + consumer.accept( 63 ); + consumer.accept( 64 ); + consumer.accept( 127 ); + consumer.accept( 128 ); + } + }; + BitMapProducer producer = BitMapProducer.fromIndexProducer(iProducer, new Shape( 1, 200 )); + List lst = new ArrayList(); + producer.forEachBitMap( lst::add ); + long[] buckets = lst.stream().mapToLong( l -> l.longValue()).toArray(); + assertTrue( BitMap.contains( buckets, 0)); + assertTrue( BitMap.contains( buckets, 1)); + assertTrue( BitMap.contains( buckets, 63)); + assertTrue( BitMap.contains( buckets, 64)); + assertTrue( BitMap.contains( buckets, 127)); + assertTrue( BitMap.contains( buckets, 128)); + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java similarity index 98% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java index 92e85fedce..beba34b979 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMaptTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java @@ -19,11 +19,9 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - import org.junit.Test; -public class BitMaptTest { +public class BitMapTest { @Test public void checkPositiveTest() { diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 3fac5f9610..4c793ca0ae 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -17,11 +17,6 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.fail; - -import java.util.List; -import java.util.Arrays; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java index ecc18e5db5..9fc23e2db2 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java @@ -17,13 +17,9 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.fail; -import java.util.ArrayList; - -import org.apache.commons.collections4.bloomfilter.Shape; import org.junit.jupiter.api.Test; /** diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java index d9d4b82334..fddf9f6ece 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -21,9 +21,6 
@@ import static org.junit.jupiter.api.Assertions.fail; -import java.util.ArrayList; - -import org.apache.commons.collections4.bloomfilter.Shape; import org.junit.jupiter.api.Test; /** @@ -59,45 +56,45 @@ public void constructor_items_bits_BadNumberOfBitsTest() { } } -// /** -// * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. -// */ -// @Test -// public void constructor_items_bits_BadNumberOfHashFunctionsTest() { -// try { -// new Shape( 16, 8); -// -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } + // /** + // * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_bits_BadNumberOfHashFunctionsTest() { + // try { + // new Shape( 16, 8); + // + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } -// /** -// * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. -// */ -// @Test -// public void constructor_items_bits_BadNumberOfItemsTest() { -// try { -// new Shape(testFunction, 0, 24); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } + // /** + // * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. 
+ // */ + // @Test + // public void constructor_items_bits_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 24); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } -// /** -// * Tests that if the number of bits is less than 1 an exception is thrown -// */ -// @Test -// public void constructor_items_bits_hash_BadNumberOfBitsTest() { -// try { -// new Shape(testFunction, 5, 0, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } + // /** + // * Tests that if the number of bits is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_items_bits_hash_BadNumberOfBitsTest() { + // try { + // new Shape(testFunction, 5, 0, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } /** * Tests that if the number of hash functions is less than 1 an exception is thrown. @@ -112,269 +109,269 @@ public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { } } -// /** -// * Tests that if the number of items is less than 1 an exception is thrown. -// */ -// @Test -// public void constructor_items_bits_hash_BadNumberOfItemsTest() { -// try { -// new Shape(testFunction, 0, 24, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } + // /** + // * Tests that if the number of items is less than 1 an exception is thrown. 
+ // */ + // @Test + // public void constructor_items_bits_hash_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } -// /** -// * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown -// */ -// @Test -// public void constructor_items_bits_hash_BadProbabilityTest() { -// try { -// new Shape(testFunction, 4000, 8, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } + // /** + // * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown + // */ + // @Test + // public void constructor_items_bits_hash_BadProbabilityTest() { + // try { + // new Shape(testFunction, 4000, 8, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } -// /** -// * Tests that when the number of items, number of bits and number of hash functions is passed the values are -// * calculated correctly. -// */ -// @Test -// public void constructor_items_bits_hashTest() { -// /* -// * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 -// */ -// final Shape filterConfig = new Shape(testFunction, 5, 24, 4); -// -// assertEquals(24, filterConfig.getNumberOfBits()); -// assertEquals(4, filterConfig.getNumberOfHashFunctions()); -// assertEquals(5, filterConfig.getNumberOfItems()); -// assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); -// } + // /** + // * Tests that when the number of items, number of bits and number of hash functions is passed the values are + // * calculated correctly. 
+ // */ + // @Test + // public void constructor_items_bits_hashTest() { + // /* + // * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + // */ + // final Shape filterConfig = new Shape(testFunction, 5, 24, 4); + // + // assertEquals(24, filterConfig.getNumberOfBits()); + // assertEquals(4, filterConfig.getNumberOfHashFunctions()); + // assertEquals(5, filterConfig.getNumberOfItems()); + // assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); + // } -// /** -// * Tests that the number of items and number of bits is passed the other values are calculated correctly. -// */ -// @Test -// public void constructor_items_bitsTest() { -// /* -// * values from https://hur.st/bloomfilter/?n=5&m=24 -// */ -// final Shape filterConfig = new Shape(testFunction, 5, 24); -// -// assertEquals(24, filterConfig.getNumberOfBits()); -// assertEquals(3, filterConfig.getNumberOfHashFunctions()); -// assertEquals(5, filterConfig.getNumberOfItems()); -// assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); -// } -// -// /** -// * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. -// */ -// @Test -// public void constructor_items_probability_BadNumberOfItemsTest() { -// try { -// new Shape(testFunction, 0, 1.0 / 10); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // do nothing. -// } -// } -// -// /** -// * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. -// */ -// @Test -// public void constructor_items_probability_BadProbabilityTest() { -// try { -// new Shape(testFunction, 10, 0.0); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // do nothing. 
-// } -// try { -// new Shape(testFunction, 10, 1.0); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // do nothing. -// } -// try { -// new Shape(testFunction, 10, Double.NaN); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // do nothing. -// } -// } -// -// /** -// * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. -// */ -// @Test -// public void constructor_items_probability_NumberOfBitsOverflowTest() { -// try { -// new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // do nothing. -// } -// } -// -// /** -// * Tests the the probability is calculated correctly. -// */ -// @Test -// public void constructor_items_probability_Test() { -// -// assertEquals(24, shape.getNumberOfBits()); -// assertEquals(3, shape.getNumberOfHashFunctions()); -// assertEquals(5, shape.getNumberOfItems()); -// assertEquals(0.100375138, shape.getProbability(), 0.000001); -// } -// -// /** -// * Tests that the constructor with a null name, number of items and size of filter fails. -// */ -// @Test -// public void constructor_nm_noName() { -// try { -// new Shape(null, 5, 72); -// fail("Should throw NullPointerException"); -// } catch (final NullPointerException expected) { -// // do nothing -// } -// } -// -// /** -// * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. -// */ -// @Test -// public void constructor_nmk_noName() { -// try { -// new Shape(null, 5, 72, 17); -// fail("Should throw NullPointerException"); -// } catch (final NullPointerException expected) { -// // do nothing -// } -// } -// -// /** -// * Tests that the constructor with a null name, number of items, and probability fails. 
-// */ -// @Test -// public void constructor_np_noName() { -// try { -// new Shape(null, 5, 0.1); -// fail("Should throw NullPointerException"); -// } catch (final NullPointerException expected) { -// // do nothing -// } -// } -// -// /** -// * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. -// */ -// @Test -// public void constructor_pmk_noName() { -// try { -// new Shape(null, 0.1, 72, 17); -// fail("Should throw NullPointerException"); -// } catch (final NullPointerException expected) { -// // do nothing -// } -// } -// -// /** -// * Tests that if the number of bits is less than 1 an exception is thrown -// */ -// @Test -// public void constructor_probability_bits_hash_BadNumberOfBitsTest() { -// try { -// new Shape(testFunction, 0.5, 0, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } -// -// /** -// * Tests that if the number of functions is less than 1 an exception is thrown -// */ -// @Test -// public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { -// try { -// new Shape(testFunction, 0.5, 24, 0); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } -// -// /** -// * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
-// */ -// @Test -// public void constructor_probability_bits_hash_BadProbabilityTest() { -// // probability should not be 0 -// try { -// new Shape(testFunction, 0.0, 24, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// -// // probability should not be = -1 -// try { -// new Shape(testFunction, -1.0, 24, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// -// // probability should not be < -1 -// try { -// new Shape(testFunction, -1.5, 24, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// -// // probability should not be = 1 -// try { -// new Shape(testFunction, 1.0, 24, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// -// // probability should not be > 1 -// try { -// new Shape(testFunction, 2.0, 24, 1); -// fail("Should have thrown IllegalArgumentException"); -// } catch (final IllegalArgumentException expected) { -// // expected -// } -// } -// -// /** -// * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash -// * functions. -// */ -// @Test -// public void constructor_probability_bits_hashTest() { -// /* -// * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 -// */ -// final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); -// -// assertEquals(24, filterConfig.getNumberOfBits()); -// assertEquals(3, filterConfig.getNumberOfHashFunctions()); -// assertEquals(5, filterConfig.getNumberOfItems()); -// assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); -// } -// + // /** + // * Tests that the number of items and number of bits is passed the other values are calculated correctly. 
+ // */ + // @Test + // public void constructor_items_bitsTest() { + // /* + // * values from https://hur.st/bloomfilter/?n=5&m=24 + // */ + // final Shape filterConfig = new Shape(testFunction, 5, 24); + // + // assertEquals(24, filterConfig.getNumberOfBits()); + // assertEquals(3, filterConfig.getNumberOfHashFunctions()); + // assertEquals(5, filterConfig.getNumberOfItems()); + // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); + // } + // + // /** + // * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_probability_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 1.0 / 10); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // } + // + // /** + // * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_probability_BadProbabilityTest() { + // try { + // new Shape(testFunction, 10, 0.0); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // try { + // new Shape(testFunction, 10, 1.0); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // try { + // new Shape(testFunction, 10, Double.NaN); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // } + // + // /** + // * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. 
+ // */ + // @Test + // public void constructor_items_probability_NumberOfBitsOverflowTest() { + // try { + // new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // } + // + // /** + // * Tests the the probability is calculated correctly. + // */ + // @Test + // public void constructor_items_probability_Test() { + // + // assertEquals(24, shape.getNumberOfBits()); + // assertEquals(3, shape.getNumberOfHashFunctions()); + // assertEquals(5, shape.getNumberOfItems()); + // assertEquals(0.100375138, shape.getProbability(), 0.000001); + // } + // + // /** + // * Tests that the constructor with a null name, number of items and size of filter fails. + // */ + // @Test + // public void constructor_nm_noName() { + // try { + // new Shape(null, 5, 72); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } + // + // /** + // * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. + // */ + // @Test + // public void constructor_nmk_noName() { + // try { + // new Shape(null, 5, 72, 17); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } + // + // /** + // * Tests that the constructor with a null name, number of items, and probability fails. + // */ + // @Test + // public void constructor_np_noName() { + // try { + // new Shape(null, 5, 0.1); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } + // + // /** + // * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. 
+ // */ + // @Test + // public void constructor_pmk_noName() { + // try { + // new Shape(null, 0.1, 72, 17); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } + // + // /** + // * Tests that if the number of bits is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_probability_bits_hash_BadNumberOfBitsTest() { + // try { + // new Shape(testFunction, 0.5, 0, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } + // + // /** + // * Tests that if the number of functions is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { + // try { + // new Shape(testFunction, 0.5, 24, 0); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } + // + // /** + // * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
+ // */ + // @Test + // public void constructor_probability_bits_hash_BadProbabilityTest() { + // // probability should not be 0 + // try { + // new Shape(testFunction, 0.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // + // // probability should not be = -1 + // try { + // new Shape(testFunction, -1.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // + // // probability should not be < -1 + // try { + // new Shape(testFunction, -1.5, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // + // // probability should not be = 1 + // try { + // new Shape(testFunction, 1.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // + // // probability should not be > 1 + // try { + // new Shape(testFunction, 2.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } + // + // /** + // * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash + // * functions. + // */ + // @Test + // public void constructor_probability_bits_hashTest() { + // /* + // * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 + // */ + // final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); + // + // assertEquals(24, filterConfig.getNumberOfBits()); + // assertEquals(3, filterConfig.getNumberOfHashFunctions()); + // assertEquals(5, filterConfig.getNumberOfItems()); + // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); + // } + // /** * Test equality of shape. 
*/ diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java index b06bf60183..760c307e4c 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -21,7 +21,7 @@ /** * Tests for the {@link SimpleBloomFilter}. */ -public class SimpleBloomFilterTest extends AbstractBloomFilterTest { +public class SimpleBloomFilterTest extends AbstractBloomFilterTest { @Override protected SimpleBloomFilter createEmptyFilter(final Shape shape) { return new SimpleBloomFilter(shape); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java index a65f23834d..9d45a7c2e5 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -19,9 +19,9 @@ import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** - * Tests for the {@link SimpleBloomFilter}. + * Tests for the {@link SparseBloomFilter}. 
*/ -public class SparseBloomFilterTest extends AbstractBloomFilterTest { +public class SparseBloomFilterTest extends AbstractBloomFilterTest { @Override protected SparseBloomFilter createEmptyFilter(final Shape shape) { return new SparseBloomFilter(shape); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java index acd225d9d4..f505452c81 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -18,12 +18,9 @@ import static org.junit.Assert.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; - -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; - +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; import org.junit.jupiter.api.Test; @@ -44,15 +41,18 @@ public void sizeTest() { } + @Test - public void testIterator() { + public void testIndices() { Shape shape = new Shape( 5, 10 ); - int[] expected = { 1,2,3,4,5,2,4,6,8,0 }; - OfInt iter = hasher.iterator(shape); + Integer[] expected = { 1,2,3,4,5,0,2,4,6,8 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], iter.next() ); + assertEquals( expected[i], lst.get(i) ); } - assertFalse( iter.hasNext()); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java 
b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java index 09ae011123..259d5a91e9 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java @@ -18,12 +18,9 @@ import static org.junit.Assert.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; - -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; - +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; import org.junit.jupiter.api.Test; @@ -42,11 +39,14 @@ public void sizeTest() { @Test public void testIterator() { Shape shape = new Shape( 5, 10 ); - OfInt iter = hasher.iterator(shape); - for (int i=1;i<6;i++) { - assertEquals( i, iter.next() ); + Integer[] expected = { 1,2,3,4,5}; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); } - assertFalse( iter.hasNext()); } } From 0777870e847aaef2ec9129f566cb7843c4ff1ee3 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Wed, 6 Oct 2021 20:14:52 +0100 Subject: [PATCH 07/27] Added license header. 
--- .../bloomfilter/exceptions/package-info.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java index 15120df803..ca3f809ecc 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java @@ -1 +1,17 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.commons.collections4.bloomfilter.exceptions; \ No newline at end of file From 891cd4de66e551fc26288d18f714d45604061131 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Thu, 7 Oct 2021 18:57:30 +0100 Subject: [PATCH 08/27] Updated documentation --- .../collections4/bloomfilter/BitMap.java | 3 + .../bloomfilter/BitMapProducer.java | 11 +- .../collections4/bloomfilter/BloomFilter.java | 3 - .../bloomfilter/CountingBloomFilter.java | 54 +++---- .../bloomfilter/IndexProducer.java | 4 +- .../bloomfilter/SetOperations.java | 139 +++++++++++++++--- .../collections4/bloomfilter/Shape.java | 4 - .../bloomfilter/hasher/Hasher.java | 49 ++---- .../bloomfilter/hasher/HasherCollection.java | 14 +- .../bloomfilter/hasher/SimpleHasher.java | 50 +++++-- .../bloomfilter/hasher/package-info.java | 19 ++- .../bloomfilter/package-info.java | 115 ++++++--------- .../ArrayCountingBloomFilterTest.java | 2 +- .../bloomfilter/SetOperationsTest.java | 62 ++++++++ .../bloomfilter/ShapeFactoryTest.java | 1 - .../bloomfilter/hasher/SimpleHasherTest.java | 82 +++++++++++ 16 files changed, 416 insertions(+), 196 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 793279e35b..0d746ffb19 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -16,6 +16,8 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Objects; + /** * Contains functions to convert {@code int} indices into Bloom filter bit positions. */ @@ -113,6 +115,7 @@ public static long getLongBit(final int bitIndex) { * @return true if the cardinality is sparse within the bucket. 
*/ public static boolean isSparse( int cardinality, Shape shape ) { + Objects.requireNonNull( shape, "shape"); return numberOfBuckets(shape.getNumberOfBits()-1)*2 >= cardinality; } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index b371472c31..10f4620bf3 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -37,11 +37,11 @@ public interface BitMapProducer { /** - * Performs the given action for each {@code index} that represents an enabled bit. + * Each BitMap is passed to the consumer in order. * Any exceptions thrown by the action are relayed to the caller. * - * @param consumer the action to be performed for each non-zero bit index. - * @throws NullPointerException if the specified action is null + * @param consumer the consumer of the BitMaps. + * @throws NullPointerException if the specified consumer is null */ void forEachBitMap(LongConsumer consumer); @@ -52,6 +52,8 @@ public interface BitMapProducer { * @return A BitMapProducer that produces the BitMap equivalent of the Indices from the producer. */ public static BitMapProducer fromIndexProducer( IndexProducer producer, Shape shape ) { + Objects.requireNonNull( producer, "producer"); + Objects.requireNonNull( shape, "shape"); return new BitMapProducer() { private int maxBucket = -1; @@ -59,7 +61,8 @@ public static BitMapProducer fromIndexProducer( IndexProducer producer, Shape sh @Override public void forEachBitMap(LongConsumer consumer) { - /* we can not assume that all the processes ints will be in order + Objects.requireNonNull( consumer, "consumer"); + /* we can not assume that all the ints will be in order * and not repeated. This is because the HasherCollection does * not make the guarantee. 
*/ diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index 16aa62f121..b14763478b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -18,10 +18,7 @@ import java.util.ArrayList; import java.util.List; -import java.util.NoSuchElementException; import java.util.Objects; -import java.util.PrimitiveIterator; - import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 032c9f8b50..049f91755b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -118,66 +118,60 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be incremented - * by their corresponding counts in the {@code other} filter. + * Adds the specified BitCountProducer to this Bloom filter. Specifically + * all counts for the indexes identified by the {@code other} will be incremented + * by their corresponding values in the {@code other}. * - *

This method will return true if the filter is valid after the operation. + *

This method will return true if the filter is valid after the operation.

* - * @param other the other counting Bloom filter + * @param other the BitCountProducer to add. * @return true if the addition was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter * @see #isValid() */ boolean add(BitCountProducer other); /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented - * by their corresponding counts in the {@code other} filter. + * Subtracts the specified BitCountProducer from this Bloom filter. Specifically + * all counts for the indexes identified by the {@code other} will be decremented + * by their corresponding values in the {@code other}. * - *

This method will return true if the filter is valid after the operation. + *

This method will return true if the filter is valid after the operation.

* - * @param other the other counting Bloom filter + * @param other the BitCountProducer to subtract. * @return true if the subtraction was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter * @see #isValid() */ boolean subtract(BitCountProducer other); /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes - * that are enabled in the {@code other} filter will be enabled in this filter. + * Merges the specified Bloom filter into this Bloom filter to produce a new CountingBloomFilter. + * Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes that are enabled in the {@code other} filter will be incremented + * by one in the new filter. * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the {@code other} Bloom filter. + *

Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.

* * @param other the other Bloom filter - * @return true if the merge was successful - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return A new CountingBloomFilter instance. */ @Override CountingBloomFilter merge(BloomFilter other); /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all - * bit indexes that are identified by the {@code hasher} will be enabled in this filter. + * Merges the specified hasher with this Bloom filter to create a new CountingBloomFilter. + * Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes specified by the {@code hasher} will be incremented + * by one in the new filter. * * For HasherCollections each SimpleHasher will be considered a single item and increment * the counts separately. * - *

Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified decomposed Bloom filter. + *

Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.

* * @param hasher the hasher to provide the indexes - * @return true if the merge was successful - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter + * @return A new CountingBloomFilter instance. */ @Override CountingBloomFilter merge(Hasher hasher); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index c10ad18e87..cb79ba0b8a 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -21,13 +21,13 @@ import java.util.function.LongConsumer; /** - * An object that produces indices for or of a Bloom filter. + * An object that produces indices of a Bloom filter. * */ public interface IndexProducer { /** - * Performs the given action for each {@code index} that represents an enabled bit. + * Each index is passed to the consumer. * Any exceptions thrown by the action are relayed to the caller. * * @param consumer the action to be performed for each non-zero bit index. diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java index 1f03aa85e4..7a2801dee6 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -16,7 +16,9 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.BitSet; +import java.util.function.LongBinaryOperator; +import java.util.function.LongConsumer; +import java.util.function.LongUnaryOperator; /** * Implementations of set operations on Bloom filters. @@ -24,6 +26,117 @@ */ public final class SetOperations { + + /** + * Calculates cardinality from BitMaps. 
+ * + * When there are 2 words to compare the op2 is executed and then the cardinality + * of the resulting word is calculated. + * + * When there is only one word to execute on the op1 is executed and the cardinality + * of the resulting word is caluclated. + * + * The calculated cardinalities are summed to return the cardinality of the operation. + * + */ + private static class CardCounter implements LongConsumer { + /** + * The calculated cardinality + */ + private int cardinality = 0; + /** + * The index into the array of words + */ + private int idx=0; + /** + * The array of words + */ + private long[] words; + /** + * The operator to execute for 2 words + */ + private LongBinaryOperator op2; + /** + * The operator to execute for a single word; + */ + private LongUnaryOperator op1; + + /** + * Constructor. + * @param words The array of BitMap words for a Bloom filter + * @param op2 The operation to execute when there are two words to compare. + * @param op1 The operation to execute when there is only one word to cmpare. + */ + public CardCounter( long[] words, LongBinaryOperator op2, LongUnaryOperator op1 ) { + this.words = words; + this.op2 = op2; + this.op1 = op1; + } + + @Override + public void accept(long word) { + if (idxx&y, (x)->0); + second.forEachBitMap(lc); + return lc.getCardinality(); + } + + /** + * Calculates the cardinality of the logical OR of the BitMaps for the two filters. + * @param first the first filter. + * @param second the second filter + * @return the cardinality of the OR of the filters. + */ + public static int orCardinality(final BloomFilter first, final BloomFilter second) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder( first.getShape()); + first.forEachBitMap( builder ); + CardCounter lc = new CardCounter(builder.getArray(), (x,y)->x|y, (x)->x); + second.forEachBitMap(lc); + return lc.getCardinality(); + } + + /** + * Calculates the cardinality of the logical XOR of the BitMaps for the two filters. 
+ * @param first the first filter. + * @param second the second filter + * @return the cardinality of the XOR of the filters. + */ + public static int xorCardinality(final BloomFilter first, final BloomFilter second) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder( first.getShape()); + first.forEachBitMap( builder ); + CardCounter lc = new CardCounter(builder.getArray(), (x,y)->x^y, (x)->x); + second.forEachBitMap(lc); + return lc.getCardinality(); + } + /** * Calculates the Cosine distance between two Bloom filters. * @@ -37,24 +150,6 @@ public static double cosineDistance(final BloomFilter first, final BloomFilter s return 1.0 - cosineSimilarity(first, second); } - private static BitSet and(final BloomFilter first, final BloomFilter second) { - BitSet result = BitSet.valueOf(BloomFilter.asBitMapArray(first)); - result.and(BitSet.valueOf(BloomFilter.asBitMapArray(second))); - return result; - } - - private static BitSet or(final BloomFilter first, final BloomFilter second) { - BitSet result = BitSet.valueOf(BloomFilter.asBitMapArray(first)); - result.or(BitSet.valueOf(BloomFilter.asBitMapArray(second))); - return result; - } - - private static BitSet xor(final BloomFilter first, final BloomFilter second) { - BitSet result = BitSet.valueOf(BloomFilter.asBitMapArray(first)); - result.xor(BitSet.valueOf(BloomFilter.asBitMapArray(second))); - return result; - } - /** * Calculates the Cosine similarity between two Bloom filters. *

Also known as Orchini similarity and the Tucker coefficient of congruence or @@ -67,7 +162,7 @@ private static BitSet xor(final BloomFilter first, final BloomFilter second) { * @return the Cosine similarity. */ public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { - final int numerator = and( first, second).cardinality(); + final int numerator = andCardinality( first, second); return numerator == 0 ? 0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); } @@ -81,7 +176,7 @@ public static double cosineSimilarity(final BloomFilter first, final BloomFilter * @return the Hamming distance. */ public static int hammingDistance(final BloomFilter first, final BloomFilter second) { - return xor(first,second).cardinality(); + return xorCardinality(first,second); } /** @@ -107,7 +202,7 @@ public static double jaccardDistance(final BloomFilter first, final BloomFilter * @return the Jaccard similarity. */ public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { - final int orCard = or(first,second).cardinality(); + final int orCard = orCardinality(first,second); // if the orCard is zero then the hamming distance will also be zero. return orCard == 0 ? 0 : hammingDistance(first, second) / (double) orCard; } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 1caf6b8b07..3f11188c0f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -34,10 +34,6 @@ *

{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}
Number of * Functions ({@code k})
{@code k = round((m / n) * ln(2))}
* - *

Comparisons

For purposes of equality checking and hashCode - * calculations a {@code Shape} is defined by the hashing function identity, the number of - * bits ({@code m}), and the number of functions ({@code k}).

- * * @see Bloom Filter calculator * @see Bloom filter * [Wikipedia] diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 1250f02877..c6f5322293 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -16,46 +16,27 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; -import java.util.function.Consumer; - import org.apache.commons.collections4.bloomfilter.Shape; import org.apache.commons.collections4.bloomfilter.IndexProducer; /** - * A Hasher represents items of arbitrary byte size as a byte representation of - * fixed size (a hash). The hash representations can be used to create indexes - * for a Bloom filter. - * - *

The hash for each item is created using a hash function; use of different - * seeds allows generation of different hashes for the same item. The hashes can - * be dynamically converted into the bit index representation used by a Bloom - * filter. The shape of the Bloom filter defines the number of indexes per item - * and the range of the indexes. The hasher can generate the correct number of - * indexes in the range required by the Bloom filter for each item it - * represents. - * - *

Note that the process of generating hashes and mapping them to a Bloom - * filter shape may create duplicate indexes. The hasher may generate fewer than - * the required number of hash functions per item if duplicates have been - * removed. Implementations of {@code iterator()} may return duplicate values - * and may return values in a random order. See implementation javadoc notes as - * to the guarantees provided by the specific implementation. - * - *

Hashers have an identity based on the hashing algorithm used. + * A Hasher creates an IndexProducer based on the hash implementation and the + * provided Shape. * * @since 4.5 */ public interface Hasher { /** - * Creates an IndexProducer that for this hasher based on the Shape. + * Creates an IndexProducer for this hasher based on the Shape. * - *

The iterator will create indices within the range defined by the number of bits in + *

The {@code IndexProducer} will create indices within the range defined by the number of bits in * the shape. The total number of indices will respect the number of hash functions per item - * defined by the shape. However the count of indexes may not be a multiple of the number of - * hash functions if the implementation has removed duplicates. + * defined by the shape. However the count of indices may not be a multiple of the number of + * hash functions once the implementation has removed duplicates.

* - *

No guarantee is made as to order of values. + *

No guarantee is made as to order of indices.

+ *

Duplicate indices for a single item must be removed.

* * @param shape the shape of the desired Bloom filter. * @return the iterator of integers @@ -63,20 +44,10 @@ public interface Hasher { IndexProducer indices(Shape shape); /** - * Gets the number of items that will be hashed by the iterator. - * @return The number of items that will be hashed by the iterator. + * Gets the number of items that will be hashed by the {@code IndexProducer}. + * @return The number of items that will be hashed by the {@code IndexProducer}. */ int size(); - /** - * Performs the given action for each hasher. - * - * For collections of hashers, this method must be called on each hasher in the collection. - * - * @param consumer the action to be performed for each hasher - * @throws NullPointerException if the specified action is null - */ - void forEach(Consumer consumer); - } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index c4d7f9da87..4363cc4026 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -21,13 +21,15 @@ import java.util.Collection; import java.util.List; import java.util.Objects; -import java.util.function.Consumer; import java.util.function.IntConsumer; import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; /** - * The class that performs hashing on demand. + * A collection of Hashers. Useful when the generation of a Bloom filter depends upon + * multiple items. Hashers for each item are added to the HasherCollection and then + * the collection is used wherever a Hasher can be used in the API. 
+ * * @since 4.5 */ public class HasherCollection implements Hasher { @@ -104,12 +106,4 @@ public int size() { } return i; } - - @Override - public void forEach(Consumer consumer) { - Objects.requireNonNull( consumer, "consumer"); - for (Hasher h : this.hashers) { - h.forEach(consumer); - } - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java index 9686f53553..630d7fb384 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -16,12 +16,8 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; -import java.util.NoSuchElementException; import java.util.Objects; -import java.util.PrimitiveIterator; -import java.util.PrimitiveIterator.OfInt; import java.util.TreeSet; -import java.util.function.Consumer; import java.util.function.IntConsumer; import org.apache.commons.collections4.bloomfilter.IndexProducer; @@ -30,6 +26,10 @@ /** * A Hasher that implements combinatorial hashing. + *

+ * Common use for this hasher is to generate a byte array as the output of a hashing + * or MessageDigest algorithm.

+ * * @since 4.5 */ public final class SimpleHasher implements Hasher { @@ -45,6 +45,42 @@ public final class SimpleHasher implements Hasher { private final long increment; + /** + * Convert bytes to long. + * @param byteArray the byte array to extract the values from. + * @param offset the offset to start extraction from. + * @param len the length of the extraction, may be longer than 8. + * @return + */ + private static final long toLong (byte[] byteArray, int offset, int len) + { + long val = 0; + len = Math.min(len, Long.BYTES); + for (int i = 0;i consumer) { - Objects.requireNonNull( consumer, "consumer"); - consumer.accept( this ); - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index e5f96bd271..8c2a46592c 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -16,7 +16,24 @@ */ /** - * Hasher definition and examples for the Bloom filter implementation. + * + * With the exception of the HasherCollection, a Hasher represents an item of arbitrary + * byte size as multiple byte representations of fixed size (multiple hashes). The hashers + * are be used to create indices for a Bloom filter.

+ * + *

Hashers create {@code IndexProducer} instances for hashed items based + * on a {@code Shape}.

+ * + *

The method used to generate the multiple hashes is dependent upon the Hasher + * implementation. The SimpleHasher uses a combinatorial strategy to create the + * multiple hashes from a single starting hash.

+ * + *

Note that the process of generating hashes and mapping them to a Bloom + * filter shape may create duplicate indexes. The Hasher implementation is required to + * remove all duplicate values for a single item. Thus the hasher may generate fewer + * than the required number of hash values per item after duplicates have been + * removed.

+ * * * @since 4.5 */ diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index bfc3d67abe..541af51169 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -21,13 +21,14 @@ *

* Background:

*

- * A Bloom filter is conceptually a bit vector. It is used to - * tell you where things are not. Basically, you create a Bloom filter by creating hashes - * and converting those to enabled bits in a vector. You can merge the Bloom filters - * together with logical "or" (call this filter "B"). You can then check to see if filter - * "A" was "or"ed into "B" by testing A & B == A. if the statement is false then "A" was - * not merged into "B", otherwise it _might_ have. They are generally used where hash - * tables would be too large or as a filter front end for longer processes. For example + * The Bloom filter is a probabilistic data structure that indicates where things are not. + * Conceptually it is a bit vector. You create a Bloom filter by creating hashes + * and converting those to enabled bits in the vector. Multiple Bloom filters may be merged + * together into one Bloom filter. It is possible to test if a filter {@code B} was merged into + * another filter {@code A} by verifying that {@code (A & B) == B}. + *

+ *

Bloom filters are generally used where hash + * tables would be too large, or as a filter front end for longer processes. For example * most browsers have a Bloom filter that is built from all known bad URLs (ones that * serve up malware). When you enter a URL the browser builds a Bloom filter and checks to * see if it is "in" the bad URL filter. If not the URL is good, if it matches, then the @@ -36,32 +37,38 @@ * gateway for a longer operation.

*

* BloomFilter

- *

+ * *

* The bloom filter code is - * an abstract class that requires implementation of 4 methods:

    - *
  • - * getBits() which - * returns the set bits as a buffer encoded into an array of long.
  • - *
  • - * getHasher() - * which returns a list of integers that are indexes of the bits that are enabled. These - * are returned in a Hasher construct.
  • - *
  • - * merge( BloomFilter ) to merge another - * Bloom filter into this one.
  • - *
  • - * merge( Hasher ) to merge the values in a hasher - * into this Bloom filter.
  • - *
- * There are 3 implementations of Bloom filter - * provided:
    - *
  • - * BitSetBloomFilter - based on the Java BitSet class.
  • + * an interface that requires implementation of 6 methods:
      + *
    • + * {@code cardinality()} which + * returns the number of bits enabled in the Bloom filter.
    • + *
    • + * {@code contains(BitMapProducer)} which + * returns true if the bits specified by the BitMaps generated by the BitMapProducer are enabled in the Bloom filter.
    • *
    • + * {@code contains(IndexProducer)} which + * returns true if the bits specified by the Indices generated by IndexProducer are enabled in the Bloom filter.
    • + *
    • + * {@code getShape()} which + * returns the shape the Bloom filter was created with.
    • + *
    • + * {@code isSparse()} which + * returns true if the implementation tracks indices natively, false if BitMaps are used. In cases where + * neither are used the {@code isSparse} return value should reflect which is faster to produce.
    • + *
    • + * {@code mergeInPlace(BloomFilter)} which + * utilizes either the {@code BitMapProducer} or {@code IndexProducer} from the argument to enable extra bits + * in the internal representation of the Bloom filter.
    • + *
    + *

    + * Other methods should be implemented where they can be done so more efficiently than the default implementations. + *

    * - * CountingBloomFilter - uses a sparse array of integers (Map) to implement a counting - * Bloom filter. This filter also implements remove() methods as that is the great - * advantage of a counting Bloom filter. + * <h3>CountingBloomFilter</h3> + *

    The counting bloom filter extends the Bloom filter by counting the number of times a specific bit has been + * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional + * overhead.

    *
  • * HasherBloomFilter - implements bloom * filter on a Hasher. A rather slow implementation but convenient in some @@ -71,48 +78,18 @@ *

    * Shape

    *

    - * Describes the Bloom filter using the - * standard number of bits, number of hash functions and number of items along with a - * description of the HashFunction. It is this description that has caused the most issues - * of late.

    - *

    - * Hasher

    - *

    - * converts byte buffers into an iterator if int based - * on a Shape. There are 2 implementations of Hasher provided

      - *
    • - * Dynamic - calls - * the HashFunction for each value required in the Bloom filter.
    • - *
    • - * Static - based - * on a pre-calculated list of Bloom filter index values. It is also limited to generating - * values for a specific Shape.
    • - *
    + * The Shape describes the Bloom filter using the number of bits and the number of hash functions

    * *

    - * Hash Functions

    - *

    - * Hash - * functions generate individual index values for the filter from a byte buffer. There are - * four implementations provided.

    - *

    - * HashFunctionIdentity

    + * Hasher *

    - * The - * HashFunctionIdentity is the base interface for the HashFunction. It tracks three (3) - * properties:

      - *
    • - * The Hashing algorithm
    • - *
    • - * Whether the contents of the - * resulting hash buffer are read as signed or unsigned values.
    • - *
    • - * Whether the hash - * function uses an iterative or cyclic method. In traditional iterative methods this is - * done by calling the selected hash function with a different seed for each hash - * required. The second method described by Adam Kirsch and Micheal Mitzenmacher[1] has - * become more common and is used in applications like Cassandra[2].
    • - *
    + * A Hasher converts bytes into a series of integers based on a Shape. With the exception of the HasherCollection, + * each hasher represents one item being added to the Bloom filter. The HasherCollection represents the + * number of items as the sum of the number of items represented by Hashers in the collection.

    + *

    The SimpleHasher uses a combinatorial generation technique to create the integers. It is easily + * initialized by using a standard {@code MessageDigest} or other Hash function to hash the item to insert and + * then splitting the hash bytes in half and considering each as a long value. + * Other implementations of the Hasher are easy to implement.

    * *

    References

    * diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index 0113e1ba52..0cc70c459c 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -21,7 +21,7 @@ /** * Tests for the {@link ArrayCountingBloomFilter}. */ -public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTest { +public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTest { @Override protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 4c793ca0ae..c9d225ea6b 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -17,6 +17,9 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.Arrays; + import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; import org.junit.jupiter.api.Test; @@ -200,4 +203,63 @@ public final void jaccardSimilarityTest_NoValues() { assertEquals(1.0, SetOperations.jaccardSimilarity(filter1, filter3), 0.0001); assertEquals(1.0, SetOperations.jaccardSimilarity(filter3, filter1), 0.0001); } + + @Test + public final void orCardinalityTest() { + Shape shape = new Shape( 3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + 
assertEquals( 5, SetOperations.orCardinality(filter1, filter2) ); + assertEquals( 5, SetOperations.orCardinality(filter2, filter1) ); + + filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); + filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 5, SetOperations.orCardinality(filter1, filter2) ); + assertEquals( 5, SetOperations.orCardinality(filter2, filter1) ); + + filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); + filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 4, SetOperations.orCardinality(filter1, filter2) ); + assertEquals( 4, SetOperations.orCardinality(filter2, filter1) ); + } + + @Test + public final void andCardinalityTest() { + Shape shape = new Shape( 3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 1, SetOperations.andCardinality(filter1, filter2) ); + assertEquals( 1, SetOperations.andCardinality(filter2, filter1) ); + + filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); + filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 0, SetOperations.andCardinality(filter1, filter2) ); + assertEquals( 0, SetOperations.andCardinality(filter2, filter1) ); + + filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); + filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 1, SetOperations.andCardinality(filter1, filter2) ); + assertEquals( 1, SetOperations.andCardinality(filter2, filter1) ); + + } + + @Test + public final void xorCardinalityTest() { + Shape shape = new Shape( 3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 4, SetOperations.xorCardinality(filter1, filter2) ); + assertEquals( 4, 
SetOperations.xorCardinality(filter2, filter1) ); + + filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); + filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 5, SetOperations.xorCardinality(filter1, filter2) ); + assertEquals( 5, SetOperations.xorCardinality(filter2, filter1) ); + + filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); + filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); + assertEquals( 3, SetOperations.xorCardinality(filter1, filter2) ); + assertEquals( 3, SetOperations.xorCardinality(filter2, filter1) ); + + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java index 9fc23e2db2..5cab5cd6c3 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java @@ -40,7 +40,6 @@ public class ShapeFactoryTest { * k = 3 */ - private final Shape shape = new Shape(3, 24 ); /** * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. 
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java index 259d5a91e9..7c1f29edff 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java @@ -49,4 +49,86 @@ public void testIterator() { } } + @Test + public void constructorBufferTest() { + Shape shape = new Shape( 5, 10 ); + byte[] buffer = { 1, 1}; + SimpleHasher hasher = new SimpleHasher( buffer ); + Integer[] expected = { 1,2,3,4,5}; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + + buffer = new byte[] { 1 }; + hasher = new SimpleHasher( buffer ); + expected = new Integer[]{ 0,1,2,3,4 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + + + buffer = new byte[] { 1, 0, 1 }; + hasher = new SimpleHasher( buffer ); + expected = new Integer[]{ 1,2,3,4,5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + + buffer = new byte[] { 0, 1, 0, 1 }; + hasher = new SimpleHasher( buffer ); + expected = new Integer[]{ 1,2,3,4,5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + + buffer = new byte[] { 
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1}; + hasher = new SimpleHasher( buffer ); + expected = new Integer[]{ 1,2,3,4,5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + + buffer = new byte[] { 0,0,0,0,0,0,0,1,5,5,0,0,0,0,0,0,0,1,5,5}; + hasher = new SimpleHasher( buffer ); + expected = new Integer[]{ 1,2,3,4,5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + + buffer = new byte[] { 0,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0,1,5,5}; + hasher = new SimpleHasher( buffer ); + expected = new Integer[]{ 1,2,3,4,5 }; + lst = new ArrayList(); + producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } +} + } From df3742f4ad3c3c52e0cd9275db1cf43ca0413dfb Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sat, 9 Oct 2021 13:56:24 +0100 Subject: [PATCH 09/27] Fixed bug and added tests --- .../bloomfilter/IndexProducer.java | 7 +- .../bloomfilter/SparseBloomFilter.java | 18 ++++- .../bloomfilter/BitCountProducerTest.java | 62 +++++++++++++++ .../bloomfilter/IndexProducerTest.java | 78 +++++++++++++++++++ 4 files changed, 161 insertions(+), 4 deletions(-) create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index cb79ba0b8a..df016fa17e 100644 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -56,9 +56,10 @@ public void accept(long word) { consumer.accept( (wordIdx*64)+i) ; } } - }}; - producer.forEachBitMap( longConsumer::accept ); - + wordIdx++; + } + }; + producer.forEachBitMap( longConsumer::accept ); } }; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 7dc140b3fa..03152a3a70 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -120,7 +120,23 @@ public void forEachBitMap(LongConsumer consumer) { if (cardinality() == 0) { return; } - BitMapProducer.fromIndexProducer( this, shape).forEachBitMap(consumer); + // because our indices are always in order we can + // shorten the time necessary to create the longs for the + // consumer + long bitMap =0; + int idx=0; + for (int i : indices) { + if (BitMap.getLongIndex(i) != idx) { + consumer.accept( bitMap ); + bitMap = 0; + idx = BitMap.getLongIndex(i); + } + bitMap |= BitMap.getLongBit(i); + } + if (bitMap != 0) + { + consumer.accept( bitMap ); + } } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java new file mode 100644 index 0000000000..707f02c5db --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.IntConsumer; + +import org.junit.Test; + +public class BitCountProducerTest { + + @Test + public void fromIndexProducer() { + IndexProducer iProducer = new IndexProducer() { + + @Override + public void forEachIndex(IntConsumer consumer) { + consumer.accept( 0 ); + consumer.accept( 1 ); + consumer.accept( 63 ); + consumer.accept( 64 ); + consumer.accept( 127 ); + consumer.accept( 128 ); + } + }; + BitCountProducer producer = BitCountProducer.from(iProducer); + Map m = new HashMap(); + + producer.forEachCount( (i,v) -> m.put(i, v)); + + assertEquals( 6, m.size()); + assertEquals( Integer.valueOf(1), m.get(0)); + assertEquals( Integer.valueOf(1), m.get(1)); + assertEquals( Integer.valueOf(1), m.get(63)); + assertEquals( Integer.valueOf(1), m.get(64)); + assertEquals( Integer.valueOf(1), m.get(127)); + assertEquals( Integer.valueOf(1), m.get(128)); + + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java new file mode 100644 index 
0000000000..79c31171e7 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.LongConsumer; + +import org.junit.jupiter.api.Test; + +public class IndexProducerTest { + + public IndexProducerTest() { + + } + + + + @Test + public void fromBitMapProducerTest() { + TestingBitMapProducer producer = new TestingBitMapProducer( new long[] { 1L, 2L, 3L } ); + IndexProducer underTest = IndexProducer.fromBitMapProducer( producer ); + List lst = new ArrayList(); + + underTest.forEachIndex( lst::add ); + assertEquals( 4, lst.size() ); + assertEquals( Integer.valueOf(0), lst.get(0) ); + assertEquals( Integer.valueOf(1+64), lst.get(1) ); + assertEquals( Integer.valueOf(0+128), lst.get(2) ); + assertEquals( Integer.valueOf(1+128), lst.get(3) ); + + producer = new TestingBitMapProducer( new long[] { 0xFFFFFFFFFFFFFFFFL } ); + underTest = IndexProducer.fromBitMapProducer( producer ); + lst = new ArrayList(); + + underTest.forEachIndex( lst::add 
); + + assertEquals( 64, lst.size() ); + for (int i=0;i<64;i++) { + assertEquals( Integer.valueOf(i), lst.get(i) ); + } + + } + + + private class TestingBitMapProducer implements BitMapProducer { + long[] values; + + TestingBitMapProducer(long[] values) { + this.values = values; + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + for (long l : values) { + consumer.accept(l); + } + } + } + + + +} From e8157bfe80dfbd66e1af36a499b1f09d5d618e6f Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sat, 9 Oct 2021 14:01:10 +0100 Subject: [PATCH 10/27] Added "@since 4.5" where necessary --- .../commons/collections4/bloomfilter/BitCountProducer.java | 3 ++- .../org/apache/commons/collections4/bloomfilter/BitMap.java | 1 + .../commons/collections4/bloomfilter/BitMapProducer.java | 1 + .../apache/commons/collections4/bloomfilter/IndexProducer.java | 1 + .../apache/commons/collections4/bloomfilter/SetOperations.java | 1 + .../collections4/bloomfilter/exceptions/NoMatchException.java | 1 + 6 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java index 844942db97..e220e1f592 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -21,7 +21,8 @@ /** * Produces bit counts for counting type Bloom filters. 
* - */ + * @since 4.5 +*/ public interface BitCountProducer extends IndexProducer { /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 0d746ffb19..7253b014eb 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -20,6 +20,7 @@ /** * Contains functions to convert {@code int} indices into Bloom filter bit positions. + * @since 4.5 */ public class BitMap { /** A bit shift to apply to an integer to divided by 64 (2^6). */ diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index 10f4620bf3..d79eed7b1c 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -33,6 +33,7 @@ * * The producer may produce empty bit maps at the end of the sequence. * + * @since 4.5 */ public interface BitMapProducer { diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index df016fa17e..6c8b4c51ca 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -23,6 +23,7 @@ /** * An object that produces indices of a Bloom filter. 
* + * @since 4.5 */ public interface IndexProducer { diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java index 7a2801dee6..4752949111 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -23,6 +23,7 @@ /** * Implementations of set operations on Bloom filters. * + * @since 4.5 */ public final class SetOperations { diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java index c14b984145..b0efff37f4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/NoMatchException.java @@ -19,6 +19,7 @@ /** * An exception to short circuit Bloom filter match functionality using producers. * + * @since 4.5 */ public class NoMatchException extends RuntimeException { From 83482f040e1cb48c39200407c958185e136339c0 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sun, 7 Nov 2021 11:17:38 +0000 Subject: [PATCH 11/27] Added BitMapProducer constructor to SimpleBloomFilter --- .../collections4/bloomfilter/SimpleBloomFilter.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index 7c4fe17644..9be1a39739 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -62,11 +62,19 @@ public SimpleBloomFilter(Shape shape) { * @param hasher the Hasher to initialize the filter with. 
*/ public SimpleBloomFilter(final Shape shape, Hasher hasher) { + this( shape, BitMapProducer.fromIndexProducer( Objects.requireNonNull( hasher, "hasher").indices(shape), shape)); + } + + /** + * Constructor. + * @param shape The shape for the filter. + * @param producer the BitMap Producer to initialize the filter with. + */ + public SimpleBloomFilter(final Shape shape, BitMapProducer producer ) { Objects.requireNonNull( shape, "shape"); - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull( producer, "producer"); this.shape = shape; - BitMapProducer producer = BitMapProducer.fromIndexProducer( hasher.indices(shape), shape); BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); producer.forEachBitMap( builder ); this.bitMap = builder.getArray(); From 2a1e27d810be9a5d57fa2396cc0e0657cda0f5a0 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 8 Nov 2021 12:54:34 +0000 Subject: [PATCH 12/27] added BitMapProducer.fromLongArray() and Hasher.isEmpty() --- .../bloomfilter/BitMapProducer.java | 20 +++++++++++++++++++ .../bloomfilter/hasher/Hasher.java | 8 +++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index d79eed7b1c..a97ba66142 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -46,6 +46,26 @@ public interface BitMapProducer { */ void forEachBitMap(LongConsumer consumer); + + /** + * Creates a BitMapProducer from an array of Long. + * @param bitMaps the bitMaps to return. + * @return a BitMapProducer. 
+ */ + public static BitMapProducer fromLongArray( long[] bitMaps ) { + return new BitMapProducer() { + + @Override + public void forEachBitMap(LongConsumer consumer) { + for (long word : bitMaps) + { + consumer.accept(word); + } + } + + }; + } + /** * Creates a BitMapProducer from an IndexProducer. * @param producer the IndexProducer that specifies the indexes of the bits to enable. diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index c6f5322293..956da7b589 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -49,5 +49,11 @@ public interface Hasher { */ int size(); - + /** + * Returns true if there are no items to be hashed. + * @return {@code true} if there are no items to be hashed. + */ + default boolean isEmpty() { + return size() == 0; + } } From ebb1ac977ed07163fed9996561c0dbbe8a38f5cd Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Wed, 10 Nov 2021 16:53:25 +0000 Subject: [PATCH 13/27] Changes to speed up Simple filter processing --- .../collections4/bloomfilter/BitMap.java | 11 ++++- .../bloomfilter/BitMapProducer.java | 2 +- .../bloomfilter/SimpleBloomFilter.java | 44 ++++++++++++++++--- .../bloomfilter/SparseBloomFilter.java | 4 +- .../bloomfilter/hasher/Hasher.java | 2 + 5 files changed, 53 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 7253b014eb..7199130b42 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -46,7 +46,16 @@ public static int numberOfBuckets( int numberOfBits ) { * @return {@code true} if the bit is enabled, {@code false} otherwise. 
*/ public static boolean contains( long[] buckets, int idx ) { - return (buckets[ getLongIndex( idx )] & getLongBit( idx )) != 0; + return (getLongIndex(idx) < buckets.length && (buckets[ getLongIndex( idx )] & getLongBit( idx )) != 0); + } + + /** + * Sets the bit in the buckets + * @param buckets The array of bit buckets + * @param idx the index of the bit to set. + */ + public static void set( long[] buckets, int idx ) { + buckets[ getLongIndex( idx )] |= getLongBit( idx ); } /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index a97ba66142..bc6ccd10b0 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -138,7 +138,7 @@ public ArrayBuilder( Shape shape, long[] initialValue ) { "initialValue length (%s) is longer than shape length (%s)", initialValue.length, result.length)); } bucketCount = initialValue.length; - System.arraycopy(initialValue, 0, result, 0, bucketCount); + System.arraycopy(initialValue, 0, result, 0, initialValue.length); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index 9be1a39739..c6ef8e79eb 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -62,7 +62,11 @@ public SimpleBloomFilter(Shape shape) { * @param hasher the Hasher to initialize the filter with. 
*/ public SimpleBloomFilter(final Shape shape, Hasher hasher) { - this( shape, BitMapProducer.fromIndexProducer( Objects.requireNonNull( hasher, "hasher").indices(shape), shape)); + Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull( hasher, "hasher"); + this.shape = shape; + this.bitMap = new long[0]; + mergeInPlace( hasher ); } /** @@ -78,8 +82,24 @@ public SimpleBloomFilter(final Shape shape, BitMapProducer producer ) { BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); producer.forEachBitMap( builder ); this.bitMap = builder.getArray(); - this.cardinality = 0; - forEachBitMap( w -> this.cardinality += Long.bitCount(w)); + this.cardinality = -1; + } + + @Override + public boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); + Shape shape = getShape(); + + hasher.indices(shape).forEachIndex( idx -> { + if (bitMap.length <= BitMap.getLongIndex(idx)) { + long[] newMap = new long[BitMap.numberOfBuckets( idx )]; + System.arraycopy( bitMap, 0, newMap, 0, bitMap.length); + bitMap = newMap; + } + BitMap.set( bitMap, idx); + }); + this.cardinality = -1; + return true; } @Override @@ -88,8 +108,7 @@ public boolean mergeInPlace(BloomFilter other) { BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape, this.bitMap); other.forEachBitMap( builder ); this.bitMap = builder.getArray(); - this.cardinality = 0; - forEachBitMap( w -> this.cardinality += Long.bitCount(w)); + this.cardinality = -1; return true; } @@ -105,6 +124,14 @@ public boolean isSparse() { @Override public int cardinality() { + if (this.cardinality == -1) { + synchronized( this ) { + if (this.cardinality == -1) { + this.cardinality = 0; + forEachBitMap( w -> this.cardinality += Long.bitCount(w)); + } + } + } return this.cardinality; } @@ -124,7 +151,12 @@ public void forEachBitMap(LongConsumer consumer) { @Override public boolean contains(IndexProducer indexProducer) { - return contains( 
BitMapProducer.fromIndexProducer(indexProducer, shape)); + try { + indexProducer.forEachIndex( idx -> {if (!BitMap.contains( bitMap, idx)) { throw new NoMatchException(); }} ); + return true; + } catch (NoMatchException e) { + return false; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 03152a3a70..5837d890dc 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -126,10 +126,10 @@ public void forEachBitMap(LongConsumer consumer) { long bitMap =0; int idx=0; for (int i : indices) { - if (BitMap.getLongIndex(i) != idx) { + while (BitMap.getLongIndex(i) != idx) { consumer.accept( bitMap ); bitMap = 0; - idx = BitMap.getLongIndex(i); + idx++; } bitMap |= BitMap.getLongBit(i); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 956da7b589..b3efa0f26f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -35,6 +35,8 @@ public interface Hasher { * defined by the shape. However the count of indices may not be a multiple of the number of * hash functions once implementation has removed duplicates.

    * + *

    This IndexProducer must be deterministic in that it mustreturn the same indices for the + * same Shape.

    *

    No guarantee is made as to order of indices.

    *

    Duplicates indices for a single item must be removed.

    * From 60b4f51477339baa07d2cc6231ea2009e46088cf Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 22 Nov 2021 07:52:12 +0000 Subject: [PATCH 14/27] Null hasher used when a hasher is required but no values are available. --- .../bloomfilter/hasher/NullHasher.java | 53 +++++++++++++++++++ .../bloomfilter/hasher/NullHasherTest.java | 49 +++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java new file mode 100644 index 0000000000..a27033f4c8 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.Objects; +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + +/** + * A Hasher that returns no values. + * + * @since 4.5 + */ +public final class NullHasher implements Hasher { + + public static final NullHasher INSTANCE = new NullHasher(); + + private NullHasher() { + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + // do nothing + } + }; + } + + @Override + public int size() { + return 0; + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java new file mode 100644 index 0000000000..3f5a1e1f93 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link NullHasher}. + */ +public class NullHasherTest { + + private Hasher hasher = NullHasher.INSTANCE; + + @Test + public void sizeTest() { + assertEquals( 0, hasher.size() ); + } + + @Test + public void testIterator() { + Shape shape = new Shape( 5, 10 ); + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( 0, lst.size()); + } + + +} From a9cd499d5f8e2b7d9d50f5d9993324fbfdb05b46 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 22 Nov 2021 07:54:04 +0000 Subject: [PATCH 15/27] Added Hasher.Filter and Hasher.FilteredIntConsumer --- .../bloomfilter/hasher/Hasher.java | 86 ++++++++++++++++++- .../bloomfilter/hasher/HasherFilterTest.java | 32 +++++++ 2 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index b3efa0f26f..3c4912138b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -17,10 +17,14 @@ package org.apache.commons.collections4.bloomfilter.hasher; import org.apache.commons.collections4.bloomfilter.Shape; + +import java.util.function.IntConsumer; + +import org.apache.commons.collections4.bloomfilter.BitMap; import org.apache.commons.collections4.bloomfilter.IndexProducer; /** - * 
A Hasher create IndexProducer based on the hash implementation and the + * A Hasher creates IndexProducer based on the hash implementation and the * provided Shape. * * @since 4.5 @@ -35,8 +39,9 @@ public interface Hasher { * defined by the shape. However the count of indices may not be a multiple of the number of * hash functions once implementation has removed duplicates.

    * - *

    This IndexProducer must be deterministic in that it mustreturn the same indices for the + *

    This IndexProducer must be deterministic in that it must return the same indices for the * same Shape.

    + * *

    No guarantee is made as to order of indices.

    *

    Duplicates indices for a single item must be removed.

    * @@ -58,4 +63,81 @@ public interface Hasher { default boolean isEmpty() { return size() == 0; } + + /** + * A convenience class for Hasher implementations to filter out duplicate indices. + * + *

    If the index is negative the behavior is not defined.

    + * + *

    This is conceptually a unique filter implemented as a {@code Predicate}.

    + * @since 4.5 + */ + public class Filter { + private long[] bits; + private int size; + + /** + * Constructor. + * + * @param size The number of numbers to track. Values from 0 to size-1 will be tracked. + */ + public Filter(int size) { + bits = new long[BitMap.numberOfBitMaps(size)]; + this.size = size; + } + + /** + * Test if the number has not been seen. + * + *

    The first time a number is tested the method returns {@code true} and returns + * {@code false} for every time after that.

    + * + *

    If the input is negative the behavior is not defined.

    + * + *

    Note: only non-negative numbers less than {@code size} are accepted. + * @param number the number to check. + * @return {@code true} if the number has not been seen, {@code false} otherwise. + */ + public boolean test(int number) { + BitMap.checkPositive(number); + if (number >= size) { + throw new IndexOutOfBoundsException(String.format("number to large %d >= %d", number, size)); + } + boolean retval = !BitMap.contains(bits, number); + BitMap.set(bits, number); + return retval; + } + } + + /** + * Class to wrap an {@code IntConsumer} so that it only receives each integer value once. + * + *

    If the index is negative the behavior is not defined.

    + * + * @since 4.5 + */ + public class FilteredIntConsumer implements IntConsumer { + private Hasher.Filter filter; + private IntConsumer consumer; + + /** + * Constructor. + * @param shape The shape of the output. + * @param consumer to wrap. + */ + public FilteredIntConsumer(int maxIntegerValue, IntConsumer consumer) { + this.filter = new Hasher.Filter(maxIntegerValue); + this.consumer = consumer; + } + + @Override + public void accept(int value) { + if (filter.test(value)) { + consumer.accept(value); + ; + } + + } + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java new file mode 100644 index 0000000000..b5aa62f8a4 --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java @@ -0,0 +1,32 @@ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Test; + +public class HasherFilterTest { + + @Test + public void testBasicFiltering() { + Hasher.Filter filter = new Hasher.Filter(10); + + for (int i=0;i<10;i++) { + assertTrue( filter.test(i)); + } + + for (int i=0;i<10;i++) { + assertFalse( filter.test(i)); + } + + try { + filter.test(10); + fail( "Should have thrown IndexOutOfBounds exception"); + } + catch (IndexOutOfBoundsException expected) { + // do nothing. + } + } + +} From c21194b27360eed3e213ad25d96c2cc8c182a5b0 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 22 Nov 2021 07:54:36 +0000 Subject: [PATCH 16/27] Updated documentation + formatted. 
--- .../bloomfilter/ArrayCountingBloomFilter.java | 73 ++++---- .../bloomfilter/BitCountProducer.java | 10 +- .../collections4/bloomfilter/BitMap.java | 104 +++++++---- .../bloomfilter/BitMapProducer.java | 64 +++---- .../collections4/bloomfilter/BloomFilter.java | 165 +++++++++++------- .../bloomfilter/CountingBloomFilter.java | 86 ++++----- .../bloomfilter/IndexProducer.java | 20 ++- .../bloomfilter/SetOperations.java | 108 ++++++------ .../collections4/bloomfilter/Shape.java | 100 ++++++----- .../bloomfilter/SimpleBloomFilter.java | 72 ++++---- .../bloomfilter/SparseBloomFilter.java | 57 +++--- .../bloomfilter/exceptions/package-info.java | 3 + .../bloomfilter/hasher/HasherCollection.java | 27 ++- .../bloomfilter/hasher/SimpleHasher.java | 62 +++---- .../hasher/SingleItemHasherCollection.java | 81 +++++++++ .../bloomfilter/hasher/package-info.java | 50 +++++- .../bloomfilter/package-info.java | 95 +++++----- .../collections4/bloomfilter/BitMapTest.java | 81 ++++++--- .../bloomfilter/SetOperationsTest.java | 36 ++-- .../hasher/HasherCollectionTest.java | 6 +- .../SingleItemHasherCollectionTest.java | 58 ++++++ 21 files changed, 830 insertions(+), 528 deletions(-) create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 2df6ae787d..092e1460d6 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -25,7 +25,7 @@ import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** - * A counting Bloom filter using an array to track counts for each enabled 
bit + * A counting Bloom filter using an int array to track counts for each enabled bit * index. * *

    Any operation that results in negative counts or integer overflow of @@ -33,13 +33,13 @@ * The operation is completed in full, no exception is raised and the state is * set to invalid. This allows the counts for the filter immediately prior to the * operation that created the invalid state to be recovered. See the documentation - * in {@link #isValid()} for details. + * in {@link #isValid()} for details.

    * *

    All the operations in the filter assume the counts are currently valid, - * for example cardinality or contains operations. Behavior of an invalid + * for example {@code cardinality} or {@code contains} operations. Behavior of an invalid * filter is undefined. It will no longer function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added - * to and not later subtracted from the counting Bloom filter. + * to and not later subtracted from the counting Bloom filter.

    * *

    The maximum supported number of items that can be stored in the filter is * limited by the maximum array size combined with the {@link Shape}. For @@ -64,20 +64,20 @@ public class ArrayCountingBloomFilter implements CountingBloomFilter { private final int[] counts; /** - * The state flag. This is a bitwise OR of the entire history of all updated + * The state flag. This is a bitwise {@code OR} of the entire history of all updated * counts. If negative then a negative count or integer overflow has occurred on * one or more counts in the history of the filter and the state is invalid. * *

    Maintenance of this state flag is branch-free for improved performance. It * eliminates a conditional check for a negative count during remove/subtract * operations and a conditional check for integer overflow during merge/add - * operations. + * operations.

    * *

    Note: Integer overflow is unlikely in realistic usage scenarios. A count * that overflows indicates that the number of items in the filter exceeds the * maximum possible size (number of bits) of any Bloom filter constrained by * integer indices. At this point the filter is most likely full (all bits are - * non-zero) and thus useless. + * non-zero) and thus useless.

    * *

    Negative counts are a concern if the filter is used incorrectly by * removing an item that was never added. It is expected that a user of a @@ -85,7 +85,7 @@ public class ArrayCountingBloomFilter implements CountingBloomFilter { * Enabling an explicit recovery path for negative or overflow counts is a major * performance burden not deemed necessary for the unlikely scenarios when an * invalid state is created. Maintenance of the state flag is a concession to - * flag improper use that should not have a major performance impact. + * flag improper use that should not have a major performance impact.

    */ private int state; @@ -96,19 +96,19 @@ public class ArrayCountingBloomFilter implements CountingBloomFilter { * */ public ArrayCountingBloomFilter(final Shape shape) { - Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull(shape, "shape"); this.shape = shape; counts = new int[shape.getNumberOfBits()]; } @Override public boolean isSparse() { - return BitMap.isSparse( cardinality(), shape); + return true; } @Override public int cardinality() { - return (int) IntStream.range( 0, counts.length ).filter( i -> counts[i] > 0 ).count(); + return (int) IntStream.range(0, counts.length).filter(i -> counts[i] > 0).count(); } /** @@ -117,61 +117,61 @@ public int cardinality() { */ protected ArrayCountingBloomFilter makeClone() { ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); - filter.add( this ); + filter.add(this); filter.state = this.state; return filter; } @Override public CountingBloomFilter merge(BloomFilter other) { - Objects.requireNonNull( other, "other"); + Objects.requireNonNull(other, "other"); CountingBloomFilter filter = makeClone(); - filter.add( BitCountProducer.from(other)); + filter.add(BitCountProducer.from(other)); return filter; } @Override public CountingBloomFilter merge(Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull(hasher, "hasher"); ArrayCountingBloomFilter filter = makeClone(); - filter.add( BitCountProducer.from( hasher.indices(shape))); + filter.add(BitCountProducer.from(hasher.indices(shape))); return filter; } @Override public boolean mergeInPlace(final BloomFilter other) { - Objects.requireNonNull( other, "other"); - return add( BitCountProducer.from(other) ); + Objects.requireNonNull(other, "other"); + return add(BitCountProducer.from(other)); } @Override public boolean mergeInPlace(final Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); - return add( BitCountProducer.from( hasher.indices(shape))); + Objects.requireNonNull(hasher, "hasher"); + return 
add(BitCountProducer.from(hasher.indices(shape))); } @Override public boolean remove(final BloomFilter other) { - Objects.requireNonNull( other, "other"); - return subtract( BitCountProducer.from(other)); + Objects.requireNonNull(other, "other"); + return subtract(BitCountProducer.from(other)); } @Override public boolean remove(final Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); - return subtract( BitCountProducer.from( hasher.indices(shape))); + Objects.requireNonNull(hasher, "hasher"); + return subtract(BitCountProducer.from(hasher.indices(shape))); } @Override public boolean add(final BitCountProducer other) { - Objects.requireNonNull( other, "other"); + Objects.requireNonNull(other, "other"); other.forEachCount(this::add); return isValid(); } @Override public boolean subtract(final BitCountProducer other) { - Objects.requireNonNull( other, "other"); + Objects.requireNonNull(other, "other"); other.forEachCount(this::subtract); return isValid(); } @@ -181,14 +181,14 @@ public boolean subtract(final BitCountProducer other) { * *

    Implementation note * - *

    The state transition to invalid is permanent. + *

    The state transition to invalid is permanent.

    * *

    This implementation does not correct negative counts to zero or integer * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that * generated invalid counts can be reversed by using the complement of the * original operation with the same Bloom filter. This will restore the counts * to the state prior to the invalid operation. Counts can then be extracted - * using {@link #forEachCount(BitCountConsumer)}. + * using {@link #forEachCount(BitCountConsumer)}.

    */ @Override public boolean isValid() { @@ -197,7 +197,7 @@ public boolean isValid() { @Override public void forEachCount(final BitCountProducer.BitCountConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); + Objects.requireNonNull(consumer, "consumer"); for (int i = 0; i < counts.length; i++) { if (counts[i] != 0) { consumer.accept(i, counts[i]); @@ -207,7 +207,7 @@ public void forEachCount(final BitCountProducer.BitCountConsumer consumer) { @Override public void forEachIndex(IntConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); + Objects.requireNonNull(consumer, "consumer"); for (int i = 0; i < counts.length; i++) { if (counts[i] != 0) { consumer.accept(i); @@ -217,8 +217,8 @@ public void forEachIndex(IntConsumer consumer) { @Override public void forEachBitMap(LongConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); - BitMapProducer.fromIndexProducer( this, shape).forEachBitMap(consumer); + Objects.requireNonNull(consumer, "consumer"); + BitMapProducer.fromIndexProducer(this, shape).forEachBitMap(consumer); } /** @@ -245,7 +245,6 @@ protected void subtract(final int idx, final int subtrahend) { counts[idx] = updated; } - @Override public Shape getShape() { return shape; @@ -254,7 +253,11 @@ public Shape getShape() { @Override public boolean contains(IndexProducer indexProducer) { try { - indexProducer.forEachIndex( idx -> {if ( this.counts[idx] == 0 ) { throw new NoMatchException(); }} ); + indexProducer.forEachIndex(idx -> { + if (this.counts[idx] == 0) { + throw new NoMatchException(); + } + }); } catch (NoMatchException e) { return false; } @@ -263,7 +266,7 @@ public boolean contains(IndexProducer indexProducer) { @Override public boolean contains(BitMapProducer bitMapProducer) { - return contains( IndexProducer.fromBitMapProducer(bitMapProducer)); + return contains(IndexProducer.fromBitMapProducer(bitMapProducer)); } } diff --git 
a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java index e220e1f592..e4f88ecc26 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -22,14 +22,14 @@ * Produces bit counts for counting type Bloom filters. * * @since 4.5 -*/ + */ public interface BitCountProducer extends IndexProducer { /** * Performs the given action for each {@code } pair where the count is non-zero. * Any exceptions thrown by the action are relayed to the caller. * - * Must only process each index once, and must process indexes in order. + *

    Must only process each index once, and must process indexes in order.

    * * @param consumer the action to be performed for each non-zero bit count * @throws NullPointerException if the specified action is null @@ -38,7 +38,7 @@ public interface BitCountProducer extends IndexProducer { @Override default void forEachIndex(IntConsumer consumer) { - forEachCount( (i,v) -> consumer.accept( i )); + forEachCount((i, v) -> consumer.accept(i)); } /** @@ -47,11 +47,11 @@ default void forEachIndex(IntConsumer consumer) { * @param idx An index producer. * @return A BitCountProducer with the same indices as the IndexProducer. */ - public static BitCountProducer from( IndexProducer idx ) { + public static BitCountProducer from(IndexProducer idx) { return new BitCountProducer() { @Override public void forEachCount(BitCountConsumer consumer) { - idx.forEachIndex( i -> consumer.accept(i, 1 ) ); + idx.forEachIndex(i -> consumer.accept(i, 1)); } }; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 7199130b42..070c91b65f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -19,7 +19,11 @@ import java.util.Objects; /** - * Contains functions to convert {@code int} indices into Bloom filter bit positions. + * Contains functions to convert {@code int} indices into Bloom filter bit positions and visa versa. + * + *

    The functions view an array of longs as a collection of bitmaps each containing 64 bits. The bits are arranged + * in memory as a little-endian long value. This matches the requirements of the BitMapProducer interface.

    + * * @since 4.5 */ public class BitMap { @@ -27,39 +31,48 @@ public class BitMap { private static final int DIVIDE_BY_64 = 6; /** Do not instantiate. */ - private BitMap() {} + private BitMap() { + } /** - * Calculates the number of buckets required for the numberOfBits parameter. - * @param numberOfBits the number of bits to store in the array of buckets. - * @return the number of buckets necessary. + * Calculates the number of bitmaps (longs) required for the numberOfBits parameter. + * + *

    If the input is negative the behavior is not defined.

    + + * @param numberOfBits the number of bits to store in the array of bitmaps. + * @return the number of bitmaps necessary. */ - public static int numberOfBuckets( int numberOfBits ) { - int bucket = numberOfBits >> DIVIDE_BY_64; - return bucket+1; + public static int numberOfBitMaps(int numberOfBits) { + return numberOfBits == 0 ? 0 : ((numberOfBits - 1) >> DIVIDE_BY_64) + 1; } /** - * Checks if the specified index bit is enabled in the array of bit buckets. - * @param buckets The array of bit buckets + * Checks if the specified index bit is enabled in the array of bit bitmaps. + * + * If the bit specified by idx is not in the bitMap false is returned. + * + * @param bitMaps The array of bit maps. * @param idx the index of the bit to locate. * @return {@code true} if the bit is enabled, {@code false} otherwise. */ - public static boolean contains( long[] buckets, int idx ) { - return (getLongIndex(idx) < buckets.length && (buckets[ getLongIndex( idx )] & getLongBit( idx )) != 0); + public static boolean contains(long[] bitMaps, int idx) { + return (idx >= 0 && getLongIndex(idx) < bitMaps.length && (bitMaps[getLongIndex(idx)] & getLongBit(idx)) != 0); } /** - * Sets the bit in the buckets - * @param buckets The array of bit buckets + * Sets the bit in the bitmaps. + *

    Performs range checking on {@code idx}.

    + * + * @param bitMaps The array of bit maps.. * @param idx the index of the bit to set. + * @throws IndexOutOfBoundsException if idx specifies a bit not in the range being tracked. */ - public static void set( long[] buckets, int idx ) { - buckets[ getLongIndex( idx )] |= getLongBit( idx ); + public static void set(long[] bitMaps, int idx) { + bitMaps[checkRange(bitMaps.length, idx)] |= getLongBit(idx); } /** - * Check the index is positive. + * Checks that the index is positive. * * @param bitIndex the bit index * @throws IndexOutOfBoundsException if the index is not positive @@ -70,26 +83,43 @@ public static void checkPositive(final int bitIndex) { } } + /** + * Checks that the bitIndex produces a value in the range of a collection. + * + * @param limit the number of bitmaps in a collection. + * @param bitIndex the bit index + * @throws IndexOutOfBoundsException if the index is not positive + */ + public static int checkRange(final int limit, final int bitIndex) { + checkPositive(bitIndex); + int idx = getLongIndex(bitIndex); + if (limit <= idx) { + throw new IndexOutOfBoundsException("bitIndex to large: " + bitIndex); + } + return idx; + } /** * Gets the filter index for the specified bit index assuming the filter is using 64-bit longs * to store bits starting at index 0. * *

    The index is assumed to be positive. For a positive index the result will match - * {@code bitIndex / 64}. + * {@code bitIndex / 64}.

    * - *

    The divide is performed using bit shifts. If the input is negative the behavior - * is not defined. + *

    The divide is performed using bit shifts. If the input is negative the behavior + * is not defined.

    * * @param bitIndex the bit index (assumed to be positive) - * @return the filter index + * @return the index of the BitMap in an array of BitMaps. * @see #checkPositive(int) */ public static int getLongIndex(final int bitIndex) { - // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is positive. + // An integer divide by 64 is equivalent to a shift of 6 bits if the integer is + // positive. // We do not explicitly check for a negative here. Instead we use a // a signed shift. Any negative index will produce a negative value - // by sign-extension and if used as an index into an array it will throw an exception. + // by sign-extension and if used as an index into an array it will throw an + // exception. return bitIndex >> DIVIDE_BY_64; } @@ -99,9 +129,9 @@ public static int getLongIndex(final int bitIndex) { * 1 bit set. * *

    The index is assumed to be positive. For a positive index the result will match - * {@code 1L << (bitIndex % 64)}. + * {@code 1L << (bitIndex % 64)}.

    * - *

    If the input is negative the behavior is not defined. + *

    If the input is negative the behavior is not defined.

    * * @param bitIndex the bit index (assumed to be positive) * @return the filter bit @@ -116,17 +146,25 @@ public static long getLongBit(final int bitIndex) { } /** - * Determines id a cardinality is sparse for the shape. - * Since the size of a bucket is a long and the size of an index is an int, there can be - * 2 indexes for each bucket. Since indexes are evenly distributed sparse is defined as - * {@code numberOfBuckets*2 >= cardinality} + * Determines if a cardinality is sparse based on the shape. + *

    This method assumes that BitMaps are 64 bits and indexes are 32 bits. If the memory + * necessary to store the cardinality as indexes is less than or equal to the estimated memory for BitMaps, + * the cardinality is determined to be {@code sparse}.

    * @param cardinality the cardinality to check. * @param shape the Shape to check against - * @return true if the cardinality is sparse within the bucket. + * @return true if the cardinality is sparse within the shape. */ - public static boolean isSparse( int cardinality, Shape shape ) { - Objects.requireNonNull( shape, "shape"); - return numberOfBuckets(shape.getNumberOfBits()-1)*2 >= cardinality; + public static boolean isSparse(int cardinality, Shape shape) { + /* + * Since the size of a BitMap is a long and the size of an index is an int, + * there can be 2 indexes for each bitmap. In Bloom filters indexes are evenly + * distributed across the range of possible values, Thus if the cardinality + * (number of indexes) is less than or equal to 2*number of BitMaps the + * cardinality is sparse within the shape. + */ + + Objects.requireNonNull(shape, "shape"); + return cardinality <= (numberOfBitMaps(shape.getNumberOfBits()) * 2); } } \ No newline at end of file diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index bc6ccd10b0..dded8351c2 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -24,14 +24,15 @@ /** * Produces BitMap longs for a Bloom filter. * - * Each bit map is a little-endian long value representing a block of bits of this filter. + * Each bit map is a little-endian long value representing a block of bits of this filter. * *

    The returned array will have length {@code ceil(m / 64)} where {@code m} is the * number of bits in the filter and {@code ceil} is the ceiling function. * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit * index is enabled. * - * The producer may produce empty bit maps at the end of the sequence. + * The producer may stop at the last non zero BitMap or may produce zero value bit maps to the limit determined by + * a shape.. * * @since 4.5 */ @@ -46,19 +47,17 @@ public interface BitMapProducer { */ void forEachBitMap(LongConsumer consumer); - /** * Creates a BitMapProducer from an array of Long. * @param bitMaps the bitMaps to return. * @return a BitMapProducer. */ - public static BitMapProducer fromLongArray( long[] bitMaps ) { + public static BitMapProducer fromLongArray(long[] bitMaps) { return new BitMapProducer() { @Override public void forEachBitMap(LongConsumer consumer) { - for (long word : bitMaps) - { + for (long word : bitMaps) { consumer.accept(word); } } @@ -72,34 +71,34 @@ public void forEachBitMap(LongConsumer consumer) { * @param shape the desired shape. * @return A BitMapProducer that produces the BitMap equivalent of the Indices from the producer. 
*/ - public static BitMapProducer fromIndexProducer( IndexProducer producer, Shape shape ) { - Objects.requireNonNull( producer, "producer"); - Objects.requireNonNull( shape, "shape"); + public static BitMapProducer fromIndexProducer(IndexProducer producer, Shape shape) { + Objects.requireNonNull(producer, "producer"); + Objects.requireNonNull(shape, "shape"); return new BitMapProducer() { private int maxBucket = -1; - private long[] result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits())]; + private long[] result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; @Override public void forEachBitMap(LongConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); - /* we can not assume that all the ints will be in order - * and not repeated. This is because the HasherCollection does - * not make the guarantee. + Objects.requireNonNull(consumer, "consumer"); + /* + * we can not assume that all the ints will be in order and not repeated. This + * is because the HasherCollection does not make the guarantee. */ // process all the ints into a array of BitMaps IntConsumer builder = new IntConsumer() { @Override - public void accept( int i ) { - int bucketIdx = BitMap.getLongIndex( i ); + public void accept(int i) { + int bucketIdx = BitMap.getLongIndex(i); maxBucket = maxBucket < bucketIdx ? bucketIdx : maxBucket; result[bucketIdx] |= BitMap.getLongBit(i); } }; - producer.forEachIndex( builder ); + producer.forEachIndex(builder); // send the bitmaps to the consumer. - for (int bucket=0;bucket<=maxBucket;bucket++) { - consumer.accept( result[bucket] ); + for (int bucket = 0; bucket <= maxBucket; bucket++) { + consumer.accept(result[bucket]); } } }; @@ -111,31 +110,32 @@ public void accept( int i ) { */ public class ArrayBuilder implements LongConsumer { private long[] result; - private int idx=0; - private int bucketCount=0; + private int idx = 0; + private int bucketCount = 0; /** - * Constructor. 
+ * Constructor that creates an empty ArrayBuilder. * @param shape The shape used to generate the BitMaps. */ - public ArrayBuilder( Shape shape ) { - this( shape, null ); + public ArrayBuilder(Shape shape) { + this(shape, null); } /** - * Constructor. + * Constructor that creates an array builder with an initial value. * @param shape The shape used to generate the BitMaps. * @param initialValue an array of BitMap values to initialize the builder with. May be {@code null}. * @throws IllegalArgumentException is the length of initialValue is greater than the number of * buckets as specified by the number of bits in the Shape. */ - public ArrayBuilder( Shape shape, long[] initialValue ) { - Objects.requireNonNull( shape, "shape"); - result = new long[ BitMap.numberOfBuckets( shape.getNumberOfBits() )]; + public ArrayBuilder(Shape shape, long[] initialValue) { + Objects.requireNonNull(shape, "shape"); + result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; if (initialValue != null) { if (initialValue.length > result.length) { - throw new IllegalArgumentException( String.format( - "initialValue length (%s) is longer than shape length (%s)", initialValue.length, result.length)); + throw new IllegalArgumentException( + String.format("initialValue length (%s) is longer than shape length (%s)", + initialValue.length, result.length)); } bucketCount = initialValue.length; System.arraycopy(initialValue, 0, result, 0, initialValue.length); @@ -145,7 +145,7 @@ public ArrayBuilder( Shape shape, long[] initialValue ) { @Override public void accept(long bitmap) { result[idx++] |= bitmap; - bucketCount = bucketCount>=idx?bucketCount:idx; + bucketCount = bucketCount >= idx ? bucketCount : idx; } /** @@ -153,7 +153,7 @@ public void accept(long bitmap) { * @return the Array of BitMaps. 
*/ public long[] getArray() { - return Arrays.copyOf( result, bucketCount ); + return Arrays.copyOf(result, bucketCount); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index b14763478b..65bc8b7e9f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Objects; + import org.apache.commons.collections4.bloomfilter.hasher.Hasher; /** @@ -32,9 +33,9 @@ public interface BloomFilter extends IndexProducer, BitMapProducer { * @param filter the filter to get the data from. * @return An array of BitMap long. */ - public static long[] asBitMapArray( BloomFilter filter ) { + public static long[] asBitMapArray(BloomFilter filter) { BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(filter.getShape()); - filter.forEachBitMap( builder ); + filter.forEachBitMap(builder); return builder.getArray(); } @@ -43,20 +44,24 @@ public static long[] asBitMapArray( BloomFilter filter ) { * @param filter the Filter to get the data from. * @return An array of indices for enabled bits in the Bloom filter. */ - public static int[] asIndexArray( BloomFilter filter ) { + public static int[] asIndexArray(BloomFilter filter) { List lst = new ArrayList(); - filter.forEachIndex( lst::add ); - return lst.stream().mapToInt( Integer::intValue ).toArray(); + filter.forEachIndex(lst::add); + return lst.stream().mapToInt(Integer::intValue).toArray(); } - // Query Operations /** - * This method is used to determine the best method for matching. For `sparse` implementations the `getIndices()` - * method is more efficient. 
Implementers should determine if it is easier for the implementation to return am array of - * Indices (sparse) or a bit map as an array of unsigned longs. - * @return + * This method is used to determine the best method for matching. + * + *

    For `sparse` implementations + * the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non-`sparse` implementations + * the {@code forEachBitMap(LongConsumer consumer)} method is more efficient. Implementers should determine if it is easier + * for the implementation to produce indexes or BitMap blocks.

    + * + * @return {@code true} if the implementation is sparse {@code false} otherwise. + * @see BitMap */ boolean isSparse(); @@ -67,44 +72,45 @@ public static int[] asIndexArray( BloomFilter filter ) { Shape getShape(); /** - * Returns {@code true} if this filter contains the specified filter. Specifically this + * Returns {@code true} if this filter contains the specified filter. + * + *

    Specifically this * returns {@code true} if this filter is enabled for all bits that are enabled in the * {@code other} filter. Using the bit representations this is - * effectively {@code (this AND other) == other}. + * effectively {@code (this AND other) == other}.

    * * @param other the other Bloom filter * @return true if all enabled bits in the other filter are enabled in this filter. */ default boolean contains(BloomFilter other) { - Objects.requireNonNull( other, "other"); - return isSparse() ? contains( (IndexProducer) other) : - contains( (BitMapProducer) other ); + Objects.requireNonNull(other, "other"); + return isSparse() ? contains((IndexProducer) other) : contains((BitMapProducer) other); } /** * Returns {@code true} if this filter contains the bits specified in the hasher. - * Specifically this returns {@code true} if this filter is enabled for all bit indexes + * + *

    Specifically this returns {@code true} if this filter is enabled for all bit indexes * identified by the {@code hasher}. Using the BitMap representations this is - * effectively {@code (this AND hasher) == hasher}. + * effectively {@code (this AND hasher) == hasher}.

    * * @param hasher the hasher to provide the indexes * @return true if this filter is enabled for all bits specified by the hasher - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ default boolean contains(Hasher hasher) { - Objects.requireNonNull( hasher, "Hasher"); + Objects.requireNonNull(hasher, "Hasher"); Shape shape = getShape(); - return contains( hasher.indices(shape)); + return contains(hasher.indices(shape)); } /** * Returns {@code true} if this filter contains the indices specified IndexProducer. - * Specifically this returns {@code true} if this filter is enabled for all bit indexes - * identified by the {@code IndexProducer}. + * + *

    Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code IndexProducer}.

    * * @param indexProducer the IndexProducer to provide the indexes - * @return true if this filter is enabled for all bits specified by the IndexProducer + * @return {@code true} if this filter is enabled for all bits specified by the IndexProducer */ boolean contains(IndexProducer indexProducer); @@ -113,55 +119,60 @@ default boolean contains(Hasher hasher) { * bitMapProducer. * * @param bitMapProducer the the {@code BitMapProducer} to provide the BitMaps. - * @return true if this filter is enabled for all bits specified by the BitMaps + * @return {@code true} if this filter is enabled for all bits specified by the BitMaps */ boolean contains(BitMapProducer bitMapProducer); /** * Merges the specified Bloom filter with this Bloom filter creating a new Bloom filter. - * Specifically all bit indexes that are enabled in the {@code other} and in @code this} filter will be - * enabled in the resulting filter. + * + *

    Specifically all bit indexes that are enabled in the {@code other} and in {@code this} filter will be + * enabled in the resulting filter.

    * * @param other the other Bloom filter * @return The new Bloom filter. */ default BloomFilter merge(BloomFilter other) { - Objects.requireNonNull( other, "other"); + Objects.requireNonNull(other, "other"); Shape shape = getShape(); - BloomFilter result = BitMap.isSparse( (cardinality() + other.cardinality()), getShape() ) ? - new SparseBloomFilter(shape) : - new SimpleBloomFilter(shape); + BloomFilter result = BitMap.isSparse((cardinality() + other.cardinality()), getShape()) + ? new SparseBloomFilter(shape) + : new SimpleBloomFilter(shape); - result.mergeInPlace( this ); - result.mergeInPlace( other ); + result.mergeInPlace(this); + result.mergeInPlace(other); return result; } /** * Merges the specified Hasher with this Bloom filter and returns a new Bloom filter. - * Specifically all bit indexes that are identified by the {@code hasher} and in {@code this} Bloom filter - * be enabled in the resulting filter. + * + *

    Specifically all bit indexes that are identified by the {@code hasher} and in {@code this} Bloom filter + * will be enabled in the resulting filter.

    * * @param hasher the hasher to provide the indices * @return the new Bloom filter. */ default BloomFilter merge(Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull(hasher, "hasher"); Shape shape = getShape(); - BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions())+ cardinality(), shape ) ? - new SparseBloomFilter(shape, hasher) : - new SimpleBloomFilter(shape, hasher); - result.mergeInPlace( this ); + BloomFilter result = BitMap.isSparse((hasher.size() * shape.getNumberOfHashFunctions()) + cardinality(), shape) + ? new SparseBloomFilter(shape, hasher) + : new SimpleBloomFilter(shape, hasher); + result.mergeInPlace(this); return result; } /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all - * bit indexes that are identified by the {@code other} will be enabled in this filter. + * Merges the specified Bloom filter into this Bloom filter. + * + *

    Specifically all + * bit indexes that are identified by the {@code other} will be enabled in this filter.

    * - *

    Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified Bloom filter. + *

    Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the {@code other} Bloom filter. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

    * * @param other The bloom filter to merge into this one. * @return true if the merge was successful @@ -172,30 +183,33 @@ default BloomFilter merge(Hasher hasher) { * Merges the specified hasher into this Bloom filter. Specifically all * bit indexes that are identified by the {@code hasher} will be enabled in this filter. * - *

    Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified Bloom filter. + *

    Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the bit indexes identified by the {@code hasher}. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

    * * @param hasher The hasher to merge. * @return true if the merge was successful */ default boolean mergeInPlace(Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull(hasher, "hasher"); Shape shape = getShape(); - BloomFilter result = BitMap.isSparse( (hasher.size() * shape.getNumberOfHashFunctions())+cardinality(),shape ) ? - new SparseBloomFilter(getShape(), hasher) : - new SimpleBloomFilter(getShape(), hasher); - return mergeInPlace( result ); + BloomFilter result = BitMap.isSparse((hasher.size() * shape.getNumberOfHashFunctions()) + cardinality(), shape) + ? new SparseBloomFilter(getShape(), hasher) + : new SimpleBloomFilter(getShape(), hasher); + return mergeInPlace(result); } /** - * Determines if the bloom filter is "full". Full is defined as having no unset - * bits. + * Determines if the bloom filter is "full". * - * @return true if the filter is full. + *

    Full is defined as having no unset + * bits.

    + * + * @return {@code true} if the filter is full, {@code false} otherwise. */ default boolean isFull(Shape shape) { - Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull(shape, "shape"); return cardinality() == shape.getNumberOfBits(); } @@ -204,7 +218,7 @@ default boolean isFull(Shape shape) { /** * Gets the cardinality (number of enabled bits) of this Bloom filter. * - *

    This is also known as the Hamming value.

    + *

    This is also known as the Hamming value or Hamming number.

    * * @return the cardinality of this filter */ @@ -212,29 +226,50 @@ default boolean isFull(Shape shape) { /** * Estimates the number of items in the Bloom filter. + * + *

    By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the + * shape and cardinality of this filter.

    + * + *

    The number of items is roughly equivalent to the number of Hashers that have been merged. As the Bloom filter + * is a probabilistic structure this value is an estimate.

    + * * @return an estimate of the number of items in the bloom filter. + * @see Shape#estimateN(int) */ default int estimateN() { - return (int) Math.round( getShape().estimateN( cardinality() )); + return (int) Math.round(getShape().estimateN(cardinality())); } /** * Estimates the number of items in the union of this Bloom filter with the other bloom filter. + * + *

    By default this is the {@code estimateN()} of the merging of this filter with the {@code other} filter.

    + * + *

    The number of items is roughly equivalent to the number of Hashers that have been merged. As the Bloom filter + * is a probabilistic structure this value is an estimate.

    + * * @param other The other Bloom filter * @return an estimate of the number of items in the union. + * @see #estimateN() */ - default int estimateUnion( BloomFilter other) { - Objects.requireNonNull( other, "other"); - return this.merge( other ).estimateN(); + default int estimateUnion(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return this.merge(other).estimateN(); } /** * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter. + * + *

    By default this is the {@code estimateN() + other.estimateN() - estimateUnion(other)}

    + * + *

    The number of items is roughly equivalent to the number of Hashers that have been merged. As the Bloom filter + * is a probabilistic structure this value is an estimate.

    + * * @param other The other Bloom filter * @return an estimate of the number of items in the intersection. */ - default int estimateIntersection( BloomFilter other) { - Objects.requireNonNull( other, "other"); - return estimateN() + other.estimateN() - estimateUnion( other ); + default int estimateIntersection(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return estimateN() + other.estimateN() - estimateUnion(other); } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 049f91755b..6891ac2213 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -27,7 +27,7 @@ * to and not later subtracted from the counting Bloom filter. The functional * state of a CountingBloomFilter at the start and end of a series of merge and * subsequent remove operations of the same Bloom filters, irrespective of - * remove order, is expected to be the same. + * remove order, is expected to be the same.

    * *

    Removal of a filter that has not previously been merged results in an * invalid state where the counts no longer represent a sum of merged Bloom @@ -36,118 +36,121 @@ * undetected. The CountingBloomFilter maintains a state flag that is used as a * warning that an operation was performed that resulted in invalid counts and * thus an invalid state. For example this may occur if a count for an index was - * set to negative following a remove operation. + * set to negative following a remove operation.

    * *

    Implementations should document the expected state of the filter after an * operation that generates invalid counts, and any potential recovery options. * An implementation may support a reversal of the operation to restore the * state to that prior to the operation. In the event that invalid counts are * adjusted to a valid range then it should be documented if there has been - * irreversible information loss. + * irreversible information loss.

    * *

    Implementations may choose to throw an exception during an operation that * generates invalid counts. Implementations should document the expected state * of the filter after such an operation. For example are the counts not updated, - * partially updated or updated entirely before the exception is raised. + * partially updated or updated entirely before the exception is raised.

    * * @since 4.5 */ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { - - // Query Operations /** - * Returns true if the internal state is valid. This flag is a warning that an addition or + * Returns {@code true} if the internal state is valid. + * + *

    This flag is a warning that an addition or * subtraction of counts from this filter resulted in an invalid count for one or more * indexes. For example this may occur if a count for an index was * set to negative following a subtraction operation, or overflows an {@code int} following an - * addition operation. + * addition operation.

    * *

    A counting Bloom filter that has an invalid state is no longer ensured to function * identically to a standard Bloom filter instance that is the merge of all the Bloom filters - * that have been added to and not later subtracted from this counting Bloom filter. + * that have been added to and not later subtracted from this counting Bloom filter.

    * *

    Note: The change to an invalid state may or may not be reversible. Implementations * are expected to document their policy on recovery from an addition or removal operation - * that generated an invalid state. + * that generated an invalid state.

    * - * @return true if the state is valid + * @return {@code true} if the state is valid */ boolean isValid(); // Modification Operations - /** - * Removes the specified Bloom filter from this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented by 1. + * Removes the specified Bloom filter from this Bloom filter. + * + *

    Specifically + * all counts for the indexes identified by the {@code other} filter will be decremented by 1.

    * *

    Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used. + * the enabled indexes are used.

    * - *

    This method will return true if the filter is valid after the operation. + *

    This method will return {@code true} if the filter is valid after the operation.

    * * @param other the other Bloom filter - * @return true if the removal was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return {@code true} if the removal was successful and the state is valid * @see #isValid() * @see #subtract(CountingBloomFilter) */ boolean remove(BloomFilter other); /** - * Removes the specified hasher from the Bloom filter from this Bloom filter. Specifically + * Removes the specified hasher from the Bloom filter from this Bloom filter. + * + *

    Specifically * all counts for the distinct indexes identified by the {@code hasher} will be - * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. + * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored.

    * - * For HasherCollections each SimpleHasher will be considered a single item and decremented - * from the counts separately. + *

    For HasherCollections each enclosed Hasher will be considered a single item and decremented + * from the counts separately.

    * - *

    This method will return true if the filter is valid after the operation. + *

    This method will return {@code true} if the filter is valid after the operation.

    * * @param hasher the hasher to provide the indexes - * @return true if the removal was successful and the state is valid - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter + * @return {@code true} if the removal was successful and the state is valid * @see #isValid() */ boolean remove(Hasher hasher); - /** - * Adds the specified BitCountProducer to this Bloom filter. Specifically + * Adds the specified BitCountProducer to this Bloom filter. + * + *

    Specifically * all counts for the indexes identified by the {@code other} will be incremented - * by their corresponding values in the {@code other}. + * by their corresponding values in the {@code other}.

    * - *

    This method will return true if the filter is valid after the operation.

    + *

    This method will return {@code true} if the filter is valid after the operation.

    * * @param other the BitCountProducer to add. - * @return true if the addition was successful and the state is valid + * @return {@code true} if the addition was successful and the state is valid * @see #isValid() */ boolean add(BitCountProducer other); /** - * Adds the specified BitCountProducer to this Bloom filter. Specifically + * Subtracts the specified BitCountProducer from this Bloom filter. + * + *

    Specifically * all counts for the indexes identified by the {@code other} will be decremented - * by their corresponding values in the {@code other}. + * by their corresponding values in the {@code other}.

    * *

    This method will return true if the filter is valid after the operation.

    * * @param other the BitCountProducer to subtract. - * @return true if the subtraction was successful and the state is valid + * @return {@code true} if the subtraction was successful and the state is valid * @see #isValid() */ boolean subtract(BitCountProducer other); /** * Merges the specified Bloom filter into this Bloom filter to produce a new CountingBloomFilter. - * Specifically the new Bloom filter will contain all the counts of this filter and in addition + * + *

    Specifically the new Bloom filter will contain all the counts of this filter and in addition * all bit indexes that are enabled in the {@code other} filter will be incremented - * by one in the new filter. + * by one in the new filter.

    * *

    Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} * should be called on the new filter.

    @@ -160,12 +163,13 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { /** * Merges the specified hasher with this Bloom filter to create a new CountingBloomFilter. - * Specifically the new Bloom filter will contain all the counts of this filter and in addition + * + *

    Specifically the new Bloom filter will contain all the counts of this filter and in addition * all bit indexes specified by the {@code hasher} will be incremented - * by one in the new filter. + * by one in the new filter.

    * - * For HasherCollections each SimpleHasher will be considered a single item and increment - * the counts separately. + *

    For HasherCollections each enclosed Hasher will be considered a single item and increment + * the counts separately.

    * *

    Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} * should be called on the new filter.

    diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index 6c8b4c51ca..97681e7b88 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -29,7 +29,9 @@ public interface IndexProducer { /** * Each index is passed to the consumer. - * Any exceptions thrown by the action are relayed to the caller. + *

    Any exceptions thrown by the action are relayed to the caller.

    + * + *

    The ordering of the indices is not guaranteed.

    * * @param consumer the action to be performed for each non-zero bit index. * @throws NullPointerException if the specified action is null @@ -41,26 +43,26 @@ public interface IndexProducer { * @param producer the @{code BitMapProducer} * @return a new @{code IndexProducer}. */ - public static IndexProducer fromBitMapProducer( BitMapProducer producer ) { - Objects.requireNonNull( producer, "producer"); + public static IndexProducer fromBitMapProducer(BitMapProducer producer) { + Objects.requireNonNull(producer, "producer"); return new IndexProducer() { @Override public void forEachIndex(IntConsumer consumer) { - LongConsumer longConsumer = new LongConsumer(){ + LongConsumer longConsumer = new LongConsumer() { int wordIdx = 0; + @Override public void accept(long word) { - for (int i = 0;i<64;i++) - { - long mask = 1L<The first array is build in the constructor. The second array is processed as a LongConsumer. Whenever there are + * two values the op2 operation is used. Whenever the one array is longer than the other the op1 operation is used on the + * bitMaps that do not have matching entries.

    * - * The calculated cardinalities are summed to return the cardinality of the operation. + *

    The calculated cardinalities are summed to return the cardinality of the operation.

    * */ private static class CardCounter implements LongConsumer { @@ -46,40 +43,42 @@ private static class CardCounter implements LongConsumer { */ private int cardinality = 0; /** - * The index into the array of words + * The index into the array of BitMaps */ - private int idx=0; + private int idx = 0; /** - * The array of words + * The array of BitMaps */ - private long[] words; + private long[] bitMaps; /** - * The operator to execute for 2 words + * The operator to execute for 2 BitMaps */ private LongBinaryOperator op2; /** - * The operator to execute for a single word; + * The operator to execute for a single BitMap; */ private LongUnaryOperator op1; /** * Constructor. - * @param words The array of BitMap words for a Bloom filter - * @param op2 The operation to execute when there are two words to compare. - * @param op1 The operation to execute when there is only one word to cmpare. + * @param BitMaps The array of BitMap BitMaps for a Bloom filter + * @param op2 The operation to execute when there are two BitMaps to compare. + * @param op1 The operation to execute when there is only one BitMap to cmpare. */ - public CardCounter( long[] words, LongBinaryOperator op2, LongUnaryOperator op1 ) { - this.words = words; + public CardCounter(BitMapProducer producer, Shape shape, LongBinaryOperator op2, LongUnaryOperator op1) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); + producer.forEachBitMap(builder); + this.bitMaps = builder.getArray(); this.op2 = op2; this.op1 = op1; } @Override - public void accept(long word) { - if (idxx&y, (x)->0); + public static int andCardinality(final Shape shape, final BitMapProducer first, final BitMapProducer second) { + CardCounter lc = new CardCounter(first, shape, (x, y) -> x & y, (x) -> 0); second.forEachBitMap(lc); return lc.getCardinality(); } /** - * Calculates the cardinality of the logical OR of the BitMaps for the two filters. - * @param first the first filter. 
- * @param second the second filter - * @return the cardinality of the OR of the filters. + * Calculates the cardinality of the logical {@code OR} of the BitMaps for the two filters. + * @param shape the shape of the filter + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code OR} of the filters. */ - public static int orCardinality(final BloomFilter first, final BloomFilter second) { - BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder( first.getShape()); - first.forEachBitMap( builder ); - CardCounter lc = new CardCounter(builder.getArray(), (x,y)->x|y, (x)->x); + public static int orCardinality(final Shape shape, final BitMapProducer first, final BitMapProducer second) { + CardCounter lc = new CardCounter(first, shape, (x, y) -> x | y, (x) -> x); second.forEachBitMap(lc); return lc.getCardinality(); } /** - * Calculates the cardinality of the logical XOR of the BitMaps for the two filters. - * @param first the first filter. - * @param second the second filter - * @return the cardinality of the XOR of the filters. + * Calculates the cardinality of the logical {@code XOR} of the BitMaps for the two filters. + * @param shape the shape of the filter + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code XOR} of the filters. 
*/ - public static int xorCardinality(final BloomFilter first, final BloomFilter second) { - BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder( first.getShape()); - first.forEachBitMap( builder ); - CardCounter lc = new CardCounter(builder.getArray(), (x,y)->x^y, (x)->x); + public static int xorCardinality(final Shape shape, final BitMapProducer first, final BitMapProducer second) { + CardCounter lc = new CardCounter(first, shape, (x, y) -> x ^ y, (x) -> x); second.forEachBitMap(lc); return lc.getCardinality(); } @@ -163,12 +158,10 @@ public static double cosineDistance(final BloomFilter first, final BloomFilter s * @return the Cosine similarity. */ public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { - final int numerator = andCardinality( first, second); + final int numerator = andCardinality(first.getShape(), first, second); return numerator == 0 ? 0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); } - - /** * Calculates the Hamming distance between two Bloom filters. * @@ -177,7 +170,7 @@ public static double cosineSimilarity(final BloomFilter first, final BloomFilter * @return the Hamming distance. */ public static int hammingDistance(final BloomFilter first, final BloomFilter second) { - return xorCardinality(first,second); + return xorCardinality(first.getShape(), first, second); } /** @@ -203,15 +196,14 @@ public static double jaccardDistance(final BloomFilter first, final BloomFilter * @return the Jaccard similarity. */ public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { - final int orCard = orCardinality(first,second); + final int orCard = orCardinality(first.getShape(), first, second); // if the orCard is zero then the hamming distance will also be zero. return orCard == 0 ? 0 : hammingDistance(first, second) / (double) orCard; } - /** * Do not instantiate. 
*/ - private SetOperations() {} + private SetOperations() { + } } - diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 3f11188c0f..c68c79fe5f 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -39,7 +39,7 @@ * [Wikipedia] * @since 4.5 */ -public final class Shape { +public final class Shape implements Comparable { /** * Number of hash functions to create a filter ({@code k}). @@ -94,19 +94,21 @@ private static int checkNumberOfBits(final int numberOfBits) { */ private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { if (numberOfHashFunctions < 1) { - throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); + throw new IllegalArgumentException( + "Number of hash functions must be greater than 0: " + numberOfHashFunctions); } return numberOfHashFunctions; } + @Override + public int compareTo(Shape other) { + int i = Integer.compare(numberOfBits, other.numberOfBits); + return i == 0 ? Integer.compare(numberOfHashFunctions, other.numberOfHashFunctions) : i; + } + @Override public boolean equals(final Object o) { - if (o instanceof Shape) { - final Shape other = (Shape) o; - return numberOfBits == other.numberOfBits && - numberOfHashFunctions == other.numberOfHashFunctions; - } - return false; + return (o instanceof Shape) ? compareTo((Shape) o) == 0 : false; } @Override @@ -124,7 +126,6 @@ public int getNumberOfBits() { return numberOfBits; } - /** * Gets the number of hash functions used to construct the filter. * This is also known as {@code k}. 
@@ -135,19 +136,18 @@ public int getNumberOfHashFunctions() { return numberOfHashFunctions; } - /** * Calculates the probability of false positives ({@code p}) given * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). *
    p = pow(1 - exp(-k / (m / n)), k)
    * *

    This is the probability that a Bloom filter will return true for the presence of an item - * when it does not contain the item. + * when it does not contain the item.

    * *

    The probability assumes that the Bloom filter is filled with the expected number of * items. If the filter contains fewer items then the actual probability will be lower. - * Thus this returns the worst-case false positive probability for a filter that has not - * exceeded its expected number of items. + * Thus, this returns the worst-case false positive probability for a filter that has not + * exceeded its expected number of items.

    * * @param numberOfItems the number of items hashed into the Bloom filter. * @return the probability of false positives. @@ -166,8 +166,7 @@ public double getProbability(int numberOfItems) { @Override public String toString() { - return String.format("Shape[ m=%s k=%s ]", - numberOfBits, numberOfHashFunctions); + return String.format("Shape[ m=%s k=%s ]", numberOfBits, numberOfHashFunctions); } /** @@ -175,17 +174,17 @@ public String toString() { * @param hammingValue the number of enabled bits. * @return An estimate of the number of items in the Bloom filter. */ - public double estimateN( int hammingValue ) { + public double estimateN(int hammingValue) { double c = hammingValue; double m = numberOfBits; double k = numberOfHashFunctions; - return -(m / k) * Math.log(1.0 - (c / m)); + return -(m / k) * Math.log(1.0 - (c / m)); } /** * The factory to assist in the creation of proper Shapes. * - * In the methods of this factory the `fraom` names are appended with the standard variable + * In the methods of this factory the `from` names are appended with the standard variable * names in the order expected: * *
    @@ -197,7 +196,6 @@ public double estimateN( int hammingValue ) { */ public static class Factory { - /** * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. */ @@ -225,20 +223,19 @@ public static class Factory { * @param probability The desired false-positive probability in the range {@code (0, 1)} * @param numberOfBits The number of bits in the filter * @param numberOfHashFunctions The number of hash functions in the filter - * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}; - * if {@code numberOfBits < 1}; if {@code numberOfHashFunctions < 1}; or if the actual + * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, + * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual * probability is {@code >= 1.0} */ - public static Shape fromPMK(final double probability, final int numberOfBits, - final int numberOfHashFunctions) { + public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { checkProbability(probability); checkNumberOfBits(numberOfBits); checkNumberOfHashFunctions(numberOfHashFunctions); // Number of items (n): // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) - final double n = Math.ceil(numberOfBits / - (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); + final double n = Math.ceil(numberOfBits + / (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); // log of probability is always < 0 // number of hash functions is >= 1 @@ -251,10 +248,9 @@ public static Shape fromPMK(final double probability, final int numberOfBits, // similarly we can not produce a number greater than numberOfBits so we // do not have to check for Integer.MAX_VALUE either. 
- - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(shape.getProbability( (int) n )); + checkCalculatedProbability(shape.getProbability((int) n)); return shape; } @@ -275,25 +271,26 @@ public static Shape fromPMK(final double probability, final int numberOfBits, * * @param numberOfItems Number of items to be placed in the filter * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if the desired probability - * is not in the range {@code (0, 1)}; or if the actual probability is {@code >= 1.0} + * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability + * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. * @see #getProbability() */ - public static Shape fromNP (final int numberOfItems, final double probability) { + public static Shape fromNP(final int numberOfItems, final double probability) { checkNumberOfItems(numberOfItems); checkProbability(probability); // Number of bits (m) final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); if (m > Integer.MAX_VALUE) { - throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); + throw new IllegalArgumentException( + "Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); } int numberOfBits = (int) m; int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(shape.getProbability( numberOfItems )); + checkCalculatedProbability(shape.getProbability(numberOfItems)); return shape; } @@ -310,18 
+307,17 @@ public static Shape fromNP (final int numberOfItems, final double probability) { * * @param numberOfItems Number of items to be placed in the filter * @param numberOfBits The number of bits in the filter - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if the calculated number of hash function is {@code < 1}; - * or if the actual probability is {@code >= 1.0} + * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, + * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} * @see #getProbability() */ public static Shape fromNM(final int numberOfItems, final int numberOfBits) { checkNumberOfItems(numberOfItems); checkNumberOfBits(numberOfBits); int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(shape.getProbability( numberOfItems )); + checkCalculatedProbability(shape.getProbability(numberOfItems)); return shape; } @@ -336,19 +332,18 @@ public static Shape fromNM(final int numberOfItems, final int numberOfBits) { * @param numberOfItems Number of items to be placed in the filter * @param numberOfBits The number of bits in the filter. * @param numberOfHashFunctions The number of hash functions in the filter - * @throws IllegalArgumentException if {@code numberOfItems < 1}; if {@code numberOfBits < 1}; - * if {@code numberOfHashFunctions < 1}; or if the actual probability is {@code >= 1.0} + * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, + * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. 
* @see #getProbability() */ - public static Shape fromNMK (final int numberOfItems, final int numberOfBits, - final int numberOfHashFunctions) { + public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { checkNumberOfItems(numberOfItems); checkNumberOfBits(numberOfBits); checkNumberOfHashFunctions(numberOfHashFunctions); // check that probability is within range - Shape shape = new Shape( numberOfHashFunctions, numberOfBits ); + Shape shape = new Shape(numberOfHashFunctions, numberOfBits); // check that probability is within range - checkCalculatedProbability(shape.getProbability( numberOfItems )); + checkCalculatedProbability(shape.getProbability(numberOfItems)); return shape; } @@ -357,7 +352,7 @@ public static Shape fromNMK (final int numberOfItems, final int numberOfBits, * * @param numberOfItems the number of items * @return the number of items - * @throws IllegalArgumentException if the number of items is {@code < 1} + * @throws IllegalArgumentException if the number of items is {@code < 1}. */ private static int checkNumberOfItems(final int numberOfItems) { if (numberOfItems < 1) { @@ -371,7 +366,7 @@ private static int checkNumberOfItems(final int numberOfItems) { * * @param numberOfBits the number of bits * @return the number of bits - * @throws IllegalArgumentException if the number of bits is {@code < 1} + * @throws IllegalArgumentException if the number of bits is {@code < 1}. */ private static int checkNumberOfBits(final int numberOfBits) { if (numberOfBits < 1) { @@ -385,11 +380,12 @@ private static int checkNumberOfBits(final int numberOfBits) { * * @param numberOfHashFunctions the number of hash functions * @return the number of hash functions - * @throws IllegalArgumentException if the number of hash functions is {@code < 1} + * @throws IllegalArgumentException if the number of hash functions is {@code < 1}. 
*/ private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { if (numberOfHashFunctions < 1) { - throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); + throw new IllegalArgumentException( + "Number of hash functions must be greater than 0: " + numberOfHashFunctions); } return numberOfHashFunctions; } @@ -403,7 +399,8 @@ private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { private static void checkProbability(final double probability) { // Using the negation of within the desired range will catch NaN if (!(probability > 0.0 && probability < 1.0)) { - throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); + throw new IllegalArgumentException( + "Probability must be greater than 0 and less than 1: " + probability); } } @@ -415,7 +412,7 @@ private static void checkProbability(final double probability) { * construction. * * @param probability the probability - * @throws IllegalArgumentException if the probability is {@code >= 1.0} + * @throws IllegalArgumentException if the probability is {@code >= 1.0}. 
*/ private static void checkCalculatedProbability(final double probability) { // We do not need to check for p <= 0.0 since we only allow positive values for @@ -452,4 +449,5 @@ private static int calculateNumberOfHashFunctions(final int numberOfItems, final return (int) k; } } + } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index c6ef8e79eb..fdf79a7415 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -50,7 +50,7 @@ public class SimpleBloomFilter implements BloomFilter { * */ public SimpleBloomFilter(Shape shape) { - Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull(shape, "shape"); this.shape = shape; this.bitMap = new long[0]; this.cardinality = 0; @@ -62,11 +62,11 @@ public SimpleBloomFilter(Shape shape) { * @param hasher the Hasher to initialize the filter with. */ public SimpleBloomFilter(final Shape shape, Hasher hasher) { - Objects.requireNonNull( shape, "shape"); - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull(shape, "shape"); + Objects.requireNonNull(hasher, "hasher"); this.shape = shape; this.bitMap = new long[0]; - mergeInPlace( hasher ); + mergeInPlace(hasher); } /** @@ -74,29 +74,30 @@ public SimpleBloomFilter(final Shape shape, Hasher hasher) { * @param shape The shape for the filter. * @param producer the BitMap Producer to initialize the filter with. 
*/ - public SimpleBloomFilter(final Shape shape, BitMapProducer producer ) { - Objects.requireNonNull( shape, "shape"); - Objects.requireNonNull( producer, "producer"); + public SimpleBloomFilter(final Shape shape, BitMapProducer producer) { + Objects.requireNonNull(shape, "shape"); + Objects.requireNonNull(producer, "producer"); this.shape = shape; BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); - producer.forEachBitMap( builder ); + producer.forEachBitMap(builder); this.bitMap = builder.getArray(); this.cardinality = -1; } @Override public boolean mergeInPlace(Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull(hasher, "hasher"); Shape shape = getShape(); - hasher.indices(shape).forEachIndex( idx -> { - if (bitMap.length <= BitMap.getLongIndex(idx)) { - long[] newMap = new long[BitMap.numberOfBuckets( idx )]; - System.arraycopy( bitMap, 0, newMap, 0, bitMap.length); + hasher.indices(shape).forEachIndex(idx -> { + int lidx = BitMap.getLongIndex(idx); + if (bitMap.length <= lidx) { + long[] newMap = new long[lidx + 1]; + System.arraycopy(bitMap, 0, newMap, 0, bitMap.length); bitMap = newMap; } - BitMap.set( bitMap, idx); + BitMap.set(bitMap, idx); }); this.cardinality = -1; return true; @@ -104,9 +105,9 @@ public boolean mergeInPlace(Hasher hasher) { @Override public boolean mergeInPlace(BloomFilter other) { - Objects.requireNonNull( other, "other"); + Objects.requireNonNull(other, "other"); BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape, this.bitMap); - other.forEachBitMap( builder ); + other.forEachBitMap(builder); this.bitMap = builder.getArray(); this.cardinality = -1; return true; @@ -125,10 +126,10 @@ public boolean isSparse() { @Override public int cardinality() { if (this.cardinality == -1) { - synchronized( this ) { + synchronized (this) { if (this.cardinality == -1) { this.cardinality = 0; - forEachBitMap( w -> this.cardinality += Long.bitCount(w)); + 
forEachBitMap(w -> this.cardinality += Long.bitCount(w)); } } } @@ -137,14 +138,14 @@ public int cardinality() { @Override public void forEachIndex(IntConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); + Objects.requireNonNull(consumer, "consumer"); IndexProducer.fromBitMapProducer(this).forEachIndex(consumer); } @Override public void forEachBitMap(LongConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); - for ( long l : bitMap ) { + Objects.requireNonNull(consumer, "consumer"); + for (long l : bitMap) { consumer.accept(l); } } @@ -152,32 +153,35 @@ public void forEachBitMap(LongConsumer consumer) { @Override public boolean contains(IndexProducer indexProducer) { try { - indexProducer.forEachIndex( idx -> {if (!BitMap.contains( bitMap, idx)) { throw new NoMatchException(); }} ); + indexProducer.forEachIndex(idx -> { + if (!BitMap.contains(bitMap, idx)) { + throw new NoMatchException(); + } + }); return true; } catch (NoMatchException e) { return false; } } - @Override public boolean contains(BitMapProducer bitMapProducer) { LongConsumer consumer = new LongConsumer() { - int i=0; + int i = 0; + @Override public void accept(long w) { - if ((bitMap[i++] & w) != w) - { throw new NoMatchException(); + if ((bitMap[i++] & w) != w) { + throw new NoMatchException(); } - }}; - try { - bitMapProducer.forEachBitMap( consumer ); - return true; - } - catch(NoMatchException e) - { - return false; } + }; + try { + bitMapProducer.forEachBitMap(consumer); + return true; + } catch (NoMatchException e) { + return false; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 5837d890dc..7a43ded059 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -36,6 +36,10 @@ public class 
SparseBloomFilter implements BloomFilter { * The bitSet that defines this BloomFilter. */ private final TreeSet indices; + + /** + * The shape of this BloomFilter + */ private final Shape shape; /** @@ -43,7 +47,7 @@ public class SparseBloomFilter implements BloomFilter { * */ public SparseBloomFilter(Shape shape) { - Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull(shape, "shape"); this.shape = shape; this.indices = new TreeSet(); } @@ -54,9 +58,9 @@ public SparseBloomFilter(Shape shape) { * @param hasher the hasher to provide the initial data. */ public SparseBloomFilter(final Shape shape, Hasher hasher) { - this( shape ); - Objects.requireNonNull( hasher, "hasher"); - hasher.indices(shape).forEachIndex( this.indices::add ); + this(shape); + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this.indices::add); } /** @@ -68,26 +72,25 @@ public SparseBloomFilter(final Shape shape, Hasher hasher) { */ public SparseBloomFilter(Shape shape, List indices) { this(shape); - Objects.requireNonNull( indices, "indices"); - this.indices.addAll( indices ); + Objects.requireNonNull(indices, "indices"); + this.indices.addAll(indices); if (this.indices.last() >= shape.getNumberOfBits()) { - throw new IllegalArgumentException( - String.format( "Value in list {} is greater than maximum value ({})", - this.indices.last(), shape.getNumberOfBits())); + throw new IllegalArgumentException(String.format("Value in list {} is greater than maximum value ({})", + this.indices.last(), shape.getNumberOfBits())); } } @Override public boolean mergeInPlace(Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); - hasher.indices(shape).forEachIndex( this.indices::add ); + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this.indices::add); return true; } @Override public boolean mergeInPlace(BloomFilter other) { - Objects.requireNonNull( other, "other"); - other.forEachIndex( indices::add ); + 
Objects.requireNonNull(other, "other"); + other.forEachIndex(indices::add); return true; } @@ -108,41 +111,44 @@ public int cardinality() { @Override public void forEachIndex(IntConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); - for (int value : indices ) { - consumer.accept( value ); + Objects.requireNonNull(consumer, "consumer"); + for (int value : indices) { + consumer.accept(value); } } @Override public void forEachBitMap(LongConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); + Objects.requireNonNull(consumer, "consumer"); if (cardinality() == 0) { return; } // because our indices are always in order we can // shorten the time necessary to create the longs for the // consumer - long bitMap =0; - int idx=0; + long bitMap = 0; + int idx = 0; for (int i : indices) { while (BitMap.getLongIndex(i) != idx) { - consumer.accept( bitMap ); + consumer.accept(bitMap); bitMap = 0; idx++; } bitMap |= BitMap.getLongBit(i); } - if (bitMap != 0) - { - consumer.accept( bitMap ); + if (bitMap != 0) { + consumer.accept(bitMap); } } @Override public boolean contains(IndexProducer indexProducer) { try { - indexProducer.forEachIndex( idx -> { if (!indices.contains(idx)) { throw new NoMatchException(); }}); + indexProducer.forEachIndex(idx -> { + if (!indices.contains(idx)) { + throw new NoMatchException(); + } + }); return true; } catch (NoMatchException e) { return false; @@ -151,8 +157,7 @@ public boolean contains(IndexProducer indexProducer) { @Override public boolean contains(BitMapProducer bitMapProducer) { - return contains( IndexProducer.fromBitMapProducer(bitMapProducer)); + return contains(IndexProducer.fromBitMapProducer(bitMapProducer)); } - } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java index ca3f809ecc..9491de9bca 100644 --- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java @@ -14,4 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * Exceptions specific to Bloom filter processing. + */ package org.apache.commons.collections4.bloomfilter.exceptions; \ No newline at end of file diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index 4363cc4026..4376e58641 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -27,7 +27,9 @@ /** * A collection of Hashers. Useful when the generation of a Bloom filter depends upon - * multiple items. Hashers for each item are added to the HasherCollection and then + * multiple items. + * + * Hashers for each item are added to the HasherCollection and then * the collection is used wherever a Hasher can be used in the API. * * @since 4.5 @@ -52,7 +54,7 @@ public HasherCollection() { * @param hashers A collections of Hashers to build the indices with. */ public HasherCollection(final Collection hashers) { - Objects.requireNonNull( hashers, "hashers"); + Objects.requireNonNull(hashers, "hashers"); this.hashers = new ArrayList<>(hashers); } @@ -63,7 +65,7 @@ public HasherCollection(final Collection hashers) { * @param buffers the byte buffers that will be hashed. */ public HasherCollection(Hasher... hashers) { - this( Arrays.asList(hashers)); + this(Arrays.asList(hashers)); } /** @@ -71,7 +73,7 @@ public HasherCollection(Hasher... hashers) { * @param hasher The hasher to add. 
*/ public void add(Hasher hasher) { - Objects.requireNonNull( hasher, "hasher"); + Objects.requireNonNull(hasher, "hasher"); hashers.add(hasher); } @@ -80,28 +82,35 @@ public void add(Hasher hasher) { * @param hashers The hashers to add. */ public void add(Collection hashers) { - Objects.requireNonNull( hashers, "hashers"); + Objects.requireNonNull(hashers, "hashers"); hashers.addAll(hashers); } @Override public IndexProducer indices(final Shape shape) { - Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull(shape, "shape"); return new IndexProducer() { @Override public void forEachIndex(IntConsumer consumer) { for (Hasher hasher : hashers) { - hasher.indices( shape ).forEachIndex(consumer); + hasher.indices(shape).forEachIndex(consumer); } } }; } + /** + * Allow child classes access to the hashers. + * @return hashers + */ + protected List getHashers() { + return hashers; + } + @Override public int size() { int i = 0; - for (Hasher h : hashers ) - { + for (Hasher h : hashers) { i += h.size(); } return i; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java index 630d7fb384..22795b94ae 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -17,15 +17,14 @@ package org.apache.commons.collections4.bloomfilter.hasher; import java.util.Objects; -import java.util.TreeSet; import java.util.function.IntConsumer; import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; - /** - * A Hasher that implements combinatorial hashing. + * A Hasher that implements combinatorial hashing as as described by + * Krisch amd Mitzenmacher. *

    * Common use for this hasher is to generate a byte array as the output of a hashing * or MessageDigest algorithm.

    @@ -44,7 +43,6 @@ public final class SimpleHasher implements Hasher { */ private final long increment; - /** * Convert bytes to long. * @param byteArray the byte array to extract the values from. @@ -52,33 +50,31 @@ public final class SimpleHasher implements Hasher { * @param len the length of the extraction, may be longer than 8. * @return */ - private static final long toLong (byte[] byteArray, int offset, int len) - { - long val = 0; - len = Math.min(len, Long.BYTES); - for (int i = 0;iThe byte array is split in 2 and each half is interpreted as a long value. + * Excess bytes are ignored. This simplifies the conversion from a Digest or hasher algorithm output + * to the two values used by the SimpleHasher.

    * @param buffer the buffer to extract the longs from. * @throws IllegalArgumentException is buffer length is zero. */ - public SimpleHasher( byte[] buffer ) { - if (buffer.length == 0) - { - throw new IllegalArgumentException( "buffer length must be greater than 0"); + public SimpleHasher(byte[] buffer) { + if (buffer.length == 0) { + throw new IllegalArgumentException("buffer length must be greater than 0"); } - int segment = buffer.length/2; - this.initial = toLong( buffer, 0, segment ); - this.increment = toLong( buffer, segment, buffer.length-segment); + int segment = buffer.length / 2; + this.initial = toLong(buffer, 0, segment); + this.increment = toLong(buffer, segment, buffer.length - segment); } /** @@ -91,7 +87,6 @@ public SimpleHasher(long initial, long increment) { this.increment = increment; } - /** * Gets an IndexProducer that produces indices based on the shape. * The iterator will not return the same value multiple @@ -103,29 +98,22 @@ public SimpleHasher(long initial, long increment) { */ @Override public IndexProducer indices(final Shape shape) { - Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull(shape, "shape"); return new IndexProducer() { - /** The number of hash functions per item. */ - private final int k = shape.getNumberOfHashFunctions(); - /** The number of bits in the shape. */ - private final long m = shape.getNumberOfBits(); /** The index of the next item. */ private long next = SimpleHasher.this.initial; - /** The count of hash functions for the current item. 
*/ - private int functionCount = 0; @Override public void forEachIndex(IntConsumer consumer) { - Objects.requireNonNull( consumer, "consumer"); - TreeSet seen = new TreeSet(); - while (functionCount < k) { - seen.add((int) Long.remainderUnsigned( next, m )); - functionCount++; + Objects.requireNonNull(consumer, "consumer"); + FilteredIntConsumer filtered = new FilteredIntConsumer(shape.getNumberOfBits(), consumer); + for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { + int value = (int) Long.remainderUnsigned(next, shape.getNumberOfBits()); + filtered.accept(value); next += SimpleHasher.this.increment; } - seen.stream().mapToInt( s -> s.intValue() ).forEach(consumer); } }; } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java new file mode 100644 index 0000000000..d1a15587e2 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.Collection; +import java.util.Objects; +import java.util.function.IntConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + +/** + * A collection of Hashers that are combined to be a single item. This differs from + * the HasherCollection in that the HasherCollection counts each Hasher in the collection as + * a different item, or in the case of an enclosed HasherCollection multiple items. This collection + * assumes that all hashers are combined to make a single item. + * + * @since 4.5 + */ +public class SingleItemHasherCollection extends HasherCollection { + + public SingleItemHasherCollection() { + super(); + } + + public SingleItemHasherCollection(Collection hashers) { + super(hashers); + } + + public SingleItemHasherCollection(Hasher... hashers) { + super(hashers); + } + + /** + * Produces unique indices. + * + *

    Specifically, this method creates an IndexProducer that will not return duplicate indices. The effect is + * to make the entire collection appear as one item. This is useful when working with complex Bloom filters like the + * CountingBloomFilter.

    + * + * @param shape The shape of the desired Bloom filter. + * @return an IndexProducer that only produces unique values. + */ + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + IndexProducer baseProducer = super.indices(shape); + + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull(consumer, "consumer"); + FilteredIntConsumer filtered = new FilteredIntConsumer(shape.getNumberOfBits() - 1, consumer); + baseProducer.forEachIndex(filtered); + } + }; + } + + @Override + public int size() { + for (Hasher hasher : getHashers()) { + if (hasher.size() > 0) { + return 1; + } + } + return 0; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index 8c2a46592c..b53e159063 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -16,6 +16,47 @@ */ /** + * Hasher implementations and tools. + * + * + *

    Hasher

    + * + *

    A Hasher converts one or more items into an {@code IndexProducer} series of integers based on a {@code Shape}. + * + * + * The base Hasher implementations + * are as follows:

    + * + *

    SimpleHasher

    + * + *

    The SimpleHasher represents one item being added to the Bloom filter. It utilizes the combinatorial strategy + * as described by Kirsch and Mitzenmacher[1]. Generally, a hash value is created by hashing + * together multiple properties of the item being added. The hash value is then used to create a SimpleHasher.

    + * + *

    This hasher represents a single item and thus does not return duplicate indices.

    + * + *

    HasherCollection

    + * + *

    The HasherCollection is a collection of Hashers that implements the Hasher interface. Each hasher within the collection + * represents a single item, or in the case of a HasherCollection, multiple items.

    + * + *

    This hasher represents multiple items and thus may return duplicate indices.

    + * + *

    SingleItemHasherCollection

    + * + *

    A collection of Hashers that are combined to represent a single item. Like the HasherCollection this Hasher is composed + * of multiple Hashers. Unlike the HasherCollection, this hasher reports that it is only one item.

    + * + * + *

    This hasher represents a single item and thus does not return duplicate indices.

    + * + *

    Other Implementations

    + * + *

    Other implementations of the Hasher are easy to implement. Hashers that represent single items should make use of the + * {@code Hasher.Filter} and/or {@code Hasher.FilteredIntConsumer} classes to filter out duplicate indices.

    + * + * + * * * With the exception of the HasherCollection, a Hasher represents an item of arbitrary * byte size as multiple byte representations of fixed size (multiple hashes). The hashers @@ -30,12 +71,17 @@ * *

    Note that the process of generating hashes and mapping them to a Bloom * filter shape may create duplicate indexes. The Hasher implementation is required to - * remove all duplicate values for a single item. Thus tge hasher may generate fewer + * remove all duplicate values for a single item. Thus the hasher may generate fewer * than the required number of hash values per item after duplicates have been * removed.

    * + *

    Footnotes * + * , + * Harvard Computer Science Group Technical Report TR-02-05. + * + * @see org.apache.commons.collections4.bloomfilter.IndexProducer * @since 4.5 */ package org.apache.commons.collections4.bloomfilter.hasher; - diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index 541af51169..45f0a81b99 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -18,15 +18,14 @@ /** * A collection of extensible Bloom filter classes and interfaces. * - *

    - * Background:

    - *

    - * The Bloom filter is a probabilistic data structure that indicates where things are not. + *

    Background:

    + * + *

    The Bloom filter is a probabilistic data structure that indicates where things are not. * Conceptually it is a a bit vector. You create a Bloom filter by creating hashes * and converting those to enabled bits in the vector. Multiple Bloom filters may be merged - * together into one Bloom filter. It is possible to test if a filter @{code B} as merged into - * another filter @{code A} by verifying that @{code (A & B) == B}. - *

    + * together into one Bloom filter. It is possible to test if a filter {@code B} was merged into + * another filter {@code A} by verifying that {@code (A & B) == B}.

    + * *

    Bloom filters are generally used where hash * tables would be too large, or as a filter front end for longer processes. For example * most browsers have a Bloom filter that is built from all known bad URLs (ones that @@ -35,61 +34,63 @@ * expensive lookup on a remote system is made to see if it actually is in the list. There * are lots of other uses, and in most cases the reason is to perform a fast check as a * gateway for a longer operation.

    - *

    - * BloomFilter

    - * *

    - * The bloom filter code is - * an interface that requires implementation of 6 methods:

      - *
    • - * @{code cardinality()} + * + *

      BloomFilter

      + * + *

      The Bloom filter architecture here is designed so that the implementation of the storage of bits is abstracted. + * Programs that utilize the Bloom filters may use the {@code BitMapProducer} or {@code IndexProducer} to retrieve a + * representation of the internal structure. Additional methods are available in the {@code BitMap} to assist in + * manipulation of the representations.

      + * + *

      The bloom filter code is an interface that requires implementation of 6 methods: + *

        + *
      • {@code cardinality()} * returns the number of bits enabled in the Bloom filter.
      • - *
      • - * @{code contains(BitMapProducer)} which + * + *
      • {@code contains(BitMapProducer)} which * returns true if the bits specified by the BitMaps generated by the BitMapProducer are enabled in the Bloom filter.
      • - *
      • - * @{code contains(IndexProducer)} which + * + *
      • {@code contains(IndexProducer)} which * returns true if the bits specified by the Indices generated by IndexProducer are enabled in the Bloom filter.
      • - *
      • - * @{code getShape()} which + * + *
      • {@code getShape()} which * returns shape the Bloom filter was created with.
      • - *
      • - * @{code isSparse()} which + + *
      • {@code isSparse()} which * returns true if an the implementation tracks indices natively, false if BitMaps are used. In cases where - * neither are used the @{code isSparse} return value should reflect which is faster to produce.
      • - *
      • - * @{code mergeInPlace(BloomFilter)} which - * utilizes either the @{code BitMapProducer} or @{code IndexProducer} from the argument to enable extra bits + * neither are used the {@code isSparse} return value should reflect which is faster to produce.
      • + * + *
      • {@code mergeInPlace(BloomFilter)} which + * utilizes either the {@code BitMapProducer} or {@code IndexProducer} from the argument to enable extra bits * in the internal representation of the Bloom filter..
      • *
      - *

      - * Other methods should be implemented where they can be done so more efficiently than the default implementations. + *

      + *

      Other methods should be implemented where they can be done so more efficiently than the default implementations. *

      * - * <3>CountingBloomFilter

    + *

    CountingBloomFilter

    + * *

    The counting bloom filter extends the Bloom filter by counting the number of times a specific bit has been * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional * overhead.

    *
  • - * HasherBloomFilter - implements bloom - * filter on a Hasher. A rather slow implementation but convenient in some - * situations.
  • - *
- * - *

- * Shape

- *

- * The Shape describes the Bloom filter using the number of bits and the number of hash functions

- * - *

- * Hasher

- *

- * A Hasher converts bytes into an series of integers based on a Shape. With the exception of the HasherCollecton, + * + *

Shape

+ * + *

The Shape describes the Bloom filter using the number of bits and the number of hash functions

+ * + *

Hasher

+ * + *

A Hasher converts bytes into a series of integers based on a Shape. With the exception of the HasherCollection, * Each hasher represents one item being added to the Bloom filter. The HasherCollection represents the - * number of items as the sum of the number of items represented by Hashers in the collection.

+ * number of items as the sum of the number of items represented by the Hashers in the collection.

+ * *

The SimpleHasher uses a combinatorial generation technique to create the integers. It is easily - * initialized by using a standard @{code MessageDigest} or other Hash function to hash the item to insert and - * then splitting the hash bytes in half and considering each as a long value. - * Other implementations of the Hasher are easy to implement.

+ * initialized by using a standard {@code MessageDigest} or other Hash function to hash the item to insert and + * then splitting the hash bytes in half and considering each as a long value.

+ * + *

Other implementations of the Hasher are easy to implement, and should make use of the {@code Hasher.Filter} + * and/or {@code Hasher.FilteredIntConsumer} classes to filter out duplicate indices.

* *

References

* diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java index beba34b979..d8e1459ad4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java @@ -19,6 +19,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import org.junit.Test; public class BitMapTest { @@ -35,24 +37,7 @@ public void checkPositiveTest() { } } - @Test - public void containsTest() { - long[] ary = new long[1]; - - assertFalse( BitMap.contains(ary, 0) ); - ary[0] = 0x01; - assertTrue( BitMap.contains(ary, 0) ); - assertFalse( BitMap.contains(ary, 63) ); - ary[0] = (1L << 63); - assertTrue( BitMap.contains(ary, 63) ); - - ary = new long[2]; - assertFalse( BitMap.contains(ary, 64) ); - ary[1] = 1; - assertTrue( BitMap.contains(ary, 64) ); - - } @Test public void getLongBitTest() { @@ -90,14 +75,64 @@ public void isSparseTest() { } @Test - public void numberOfBucketsTest() { - for (int i = 0;i<64;i++) { - assertEquals( 1, BitMap.numberOfBuckets(i)); + public void numberOfBitMapsTest() { + assertEquals( "Number of bits 0", 0, BitMap.numberOfBitMaps(0)); + for (int i = 1;i<65;i++) { + assertEquals( String.format( "Number of bits %d", i ), 1, BitMap.numberOfBitMaps(i)); + } + for (int i = 65;i<129;i++) { + assertEquals( String.format( "Number of bits %d", i ),2, BitMap.numberOfBitMaps(i)); + } + assertEquals( "Number of bits 129", 3, BitMap.numberOfBitMaps(129)); + + } + + @Test + public void setTest() { + long[] bitMaps = new long[ BitMap.numberOfBitMaps(129)]; + for (int i=0;i<129;i++) { + BitMap.set( bitMaps, i); + assertTrue( String.format("Failed at index: %d",i), BitMap.contains( bitMaps, i)); } - for (int i = 64;i<128;i++) { - assertEquals( 2, 
BitMap.numberOfBuckets(i)); + assertEquals( 0xFFFFFFFFFFFFFFFFL, bitMaps[0] ); + assertEquals( 0xFFFFFFFFFFFFFFFFL, bitMaps[1] ); + assertEquals( 1L, bitMaps[2] ); + } + + @Test + public void containsTest() { + long[] bitMaps = new long[ 1 ]; + + for (int i=0;i<64;i++) { + bitMaps[0] = 0l; + BitMap.set( bitMaps, i); + for (int j=0;j<64;j++) { + if (j==i) { + assertTrue( String.format("Failed at index: %d for %d",i,j), BitMap.contains( bitMaps, j)); + } else { + assertFalse( String.format("Failed at index %d for %d",i,j), BitMap.contains( bitMaps, j)); + } + } + } - assertEquals( 3, BitMap.numberOfBuckets(128)); + } + + @Test + public void contains_boundaryConditionTest() { + long[] ary = new long[1]; + + assertFalse( BitMap.contains(ary, 0) ); + ary[0] = 0x01; + assertTrue( BitMap.contains(ary, 0) ); + + assertFalse( BitMap.contains(ary, 63) ); + ary[0] = (1L << 63); + assertTrue( BitMap.contains(ary, 63) ); + + ary = new long[2]; + assertFalse( BitMap.contains(ary, 64) ); + ary[1] = 1; + assertTrue( BitMap.contains(ary, 64) ); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index c9d225ea6b..d3fea4788a 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -209,18 +209,18 @@ public final void orCardinalityTest() { Shape shape = new Shape( 3, 128); SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 5, SetOperations.orCardinality(filter1, filter2) ); - assertEquals( 5, SetOperations.orCardinality(filter2, filter1) ); + assertEquals( 5, SetOperations.orCardinality(shape, filter1, filter2) ); + assertEquals( 5, SetOperations.orCardinality(shape, filter2, filter1) ); filter1 = 
new SparseBloomFilter( shape, Arrays.asList(1, 63 )); filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 5, SetOperations.orCardinality(filter1, filter2) ); - assertEquals( 5, SetOperations.orCardinality(filter2, filter1) ); + assertEquals( 5, SetOperations.orCardinality(shape, filter1, filter2) ); + assertEquals( 5, SetOperations.orCardinality(shape, filter2, filter1) ); filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 4, SetOperations.orCardinality(filter1, filter2) ); - assertEquals( 4, SetOperations.orCardinality(filter2, filter1) ); + assertEquals( 4, SetOperations.orCardinality(shape, filter1, filter2) ); + assertEquals( 4, SetOperations.orCardinality(shape, filter2, filter1) ); } @Test @@ -228,18 +228,18 @@ public final void andCardinalityTest() { Shape shape = new Shape( 3, 128); SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 1, SetOperations.andCardinality(filter1, filter2) ); - assertEquals( 1, SetOperations.andCardinality(filter2, filter1) ); + assertEquals( 1, SetOperations.andCardinality(shape, filter1, filter2) ); + assertEquals( 1, SetOperations.andCardinality(shape, filter2, filter1) ); filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 0, SetOperations.andCardinality(filter1, filter2) ); - assertEquals( 0, SetOperations.andCardinality(filter2, filter1) ); + assertEquals( 0, SetOperations.andCardinality(shape, filter1, filter2) ); + assertEquals( 0, SetOperations.andCardinality(shape, filter2, filter1) ); filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 1, SetOperations.andCardinality(filter1, 
filter2) ); - assertEquals( 1, SetOperations.andCardinality(filter2, filter1) ); + assertEquals( 1, SetOperations.andCardinality(shape, filter1, filter2) ); + assertEquals( 1, SetOperations.andCardinality(shape, filter2, filter1) ); } @@ -248,18 +248,18 @@ public final void xorCardinalityTest() { Shape shape = new Shape( 3, 128); SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 4, SetOperations.xorCardinality(filter1, filter2) ); - assertEquals( 4, SetOperations.xorCardinality(filter2, filter1) ); + assertEquals( 4, SetOperations.xorCardinality(shape, filter1, filter2) ); + assertEquals( 4, SetOperations.xorCardinality(shape, filter2, filter1) ); filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 5, SetOperations.xorCardinality(filter1, filter2) ); - assertEquals( 5, SetOperations.xorCardinality(filter2, filter1) ); + assertEquals( 5, SetOperations.xorCardinality(shape, filter1, filter2) ); + assertEquals( 5, SetOperations.xorCardinality(shape, filter2, filter1) ); filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 3, SetOperations.xorCardinality(filter1, filter2) ); - assertEquals( 3, SetOperations.xorCardinality(filter2, filter1) ); + assertEquals( 3, SetOperations.xorCardinality(shape, filter1, filter2) ); + assertEquals( 3, SetOperations.xorCardinality(shape, filter2, filter1) ); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java index f505452c81..6b266f3e94 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java +++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -25,7 +25,7 @@ import org.junit.jupiter.api.Test; /** - * Tests the {@link SimpleHasher}. + * Tests the {@link HasherCollection}. */ public class HasherCollectionTest { @@ -45,13 +45,13 @@ public void sizeTest() { @Test public void testIndices() { Shape shape = new Shape( 5, 10 ); - Integer[] expected = { 1,2,3,4,5,0,2,4,6,8 }; + Integer[] expected = { 1,2,3,4,5,2,4,6,8,0 }; List lst = new ArrayList(); IndexProducer producer = hasher.indices(shape); producer.forEachIndex( lst::add ); assertEquals( expected.length, lst.size()); for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + assertEquals( String.format("error at position %d", i), expected[i], lst.get(i) ); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java new file mode 100644 index 0000000000..c0947557ad --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import static org.junit.Assert.assertEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.ArrayList; +import java.util.List; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link SingleItemHasherCollection}. + */ +public class SingleItemHasherCollectionTest { + + private SimpleHasher hasher1 = new SimpleHasher( 1,1 ); + private SimpleHasher hasher2 = new SimpleHasher( 2, 2 ); + private HasherCollection hasher = new SingleItemHasherCollection( hasher1, hasher2 ); + + @Test + public void sizeTest() { + assertEquals( 1, hasher.size() ); + HasherCollection hasher3 = new SingleItemHasherCollection( hasher, new SimpleHasher( 3, 3 )); + assertEquals( 1, hasher3.size() ); + + } + + + @Test + public void testIndices() { + Shape shape = new Shape( 5, 10 ); + Integer[] expected = { 1,2,3,4,5,6,8,0 }; + List lst = new ArrayList(); + IndexProducer producer = hasher.indices(shape); + producer.forEachIndex( lst::add ); + assertEquals( expected.length, lst.size()); + for (int i=0;i< expected.length;i++) { + assertEquals( expected[i], lst.get(i) ); + } + } + +} From c9d9d2ba22b1cca3f66bfe5b66c459c28de1d1c6 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 22 Nov 2021 07:59:11 +0000 Subject: [PATCH 17/27] Added license --- .../bloomfilter/hasher/HasherFilterTest.java | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java index b5aa62f8a4..aba5c4f538 100644 --- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java @@ -1,4 +1,19 @@ -package org.apache.commons.collections4.bloomfilter.hasher; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */package org.apache.commons.collections4.bloomfilter.hasher; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -6,6 +21,9 @@ import org.junit.Test; +/** + * Tests the {@link Hasher.Filter}. 
+ */ public class HasherFilterTest { @Test From dd119ac8e4aa0baf335c273d3d2fcc87cb417b70 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Mon, 22 Nov 2021 14:57:15 +0000 Subject: [PATCH 18/27] fixed checkstyle issues --- .../bloomfilter/BitCountProducer.java | 2 +- .../collections4/bloomfilter/BitMap.java | 3 +- .../bloomfilter/BitMapProducer.java | 6 +- .../collections4/bloomfilter/BloomFilter.java | 4 +- .../bloomfilter/IndexProducer.java | 2 +- .../bloomfilter/SetOperations.java | 2 +- .../bloomfilter/exceptions/package-info.java | 2 +- .../bloomfilter/hasher/Hasher.java | 4 +- .../bloomfilter/hasher/SimpleHasher.java | 2 +- .../bloomfilter/AbstractBloomFilterTest.java | 122 ++-- .../AbstractCountingBloomFilterTest.java | 135 ++-- .../ArrayCountingBloomFilterTest.java | 6 +- .../bloomfilter/BitCountProducerTest.java | 35 +- .../bloomfilter/BitMapProducerTest.java | 30 +- .../collections4/bloomfilter/BitMapTest.java | 98 ++- .../bloomfilter/IndexProducerTest.java | 36 +- .../bloomfilter/SetOperationsTest.java | 110 ++- .../bloomfilter/ShapeFactoryTest.java | 34 +- .../collections4/bloomfilter/ShapeTest.java | 648 +++++++++--------- .../bloomfilter/SimpleBloomFilterTest.java | 2 +- .../bloomfilter/SparseBloomFilterTest.java | 2 +- .../hasher/HasherCollectionTest.java | 25 +- .../bloomfilter/hasher/HasherFilterTest.java | 16 +- .../bloomfilter/hasher/NullHasherTest.java | 9 +- .../bloomfilter/hasher/SimpleHasherTest.java | 113 ++- .../SingleItemHasherCollectionTest.java | 25 +- 26 files changed, 698 insertions(+), 775 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java index e4f88ecc26..af444eeeb9 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java @@ -47,7 +47,7 @@ default void forEachIndex(IntConsumer 
consumer) { * @param idx An index producer. * @return A BitCountProducer with the same indices as the IndexProducer. */ - public static BitCountProducer from(IndexProducer idx) { + static BitCountProducer from(IndexProducer idx) { return new BitCountProducer() { @Override public void forEachCount(BitCountConsumer consumer) { diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 070c91b65f..11abbfde49 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -166,5 +166,4 @@ public static boolean isSparse(int cardinality, Shape shape) { Objects.requireNonNull(shape, "shape"); return cardinality <= (numberOfBitMaps(shape.getNumberOfBits()) * 2); } - -} \ No newline at end of file +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java index dded8351c2..478cce0928 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java @@ -52,7 +52,7 @@ public interface BitMapProducer { * @param bitMaps the bitMaps to return. * @return a BitMapProducer. */ - public static BitMapProducer fromLongArray(long[] bitMaps) { + static BitMapProducer fromLongArray(long[] bitMaps) { return new BitMapProducer() { @Override @@ -71,7 +71,7 @@ public void forEachBitMap(LongConsumer consumer) { * @param shape the desired shape. * @return A BitMapProducer that produces the BitMap equivalent of the Indices from the producer. 
*/ - public static BitMapProducer fromIndexProducer(IndexProducer producer, Shape shape) { + static BitMapProducer fromIndexProducer(IndexProducer producer, Shape shape) { Objects.requireNonNull(producer, "producer"); Objects.requireNonNull(shape, "shape"); @@ -108,7 +108,7 @@ public void accept(int i) { * A LongConsumer that builds an Array of BitMaps as produced by a BitMapProducer. * */ - public class ArrayBuilder implements LongConsumer { + class ArrayBuilder implements LongConsumer { private long[] result; private int idx = 0; private int bucketCount = 0; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index 65bc8b7e9f..424c115601 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -33,7 +33,7 @@ public interface BloomFilter extends IndexProducer, BitMapProducer { * @param filter the filter to get the data from. * @return An array of BitMap long. */ - public static long[] asBitMapArray(BloomFilter filter) { + static long[] asBitMapArray(BloomFilter filter) { BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(filter.getShape()); filter.forEachBitMap(builder); return builder.getArray(); @@ -44,7 +44,7 @@ public static long[] asBitMapArray(BloomFilter filter) { * @param filter the Filter to get the data from. * @return An array of indices for enabled bits in the Bloom filter. 
*/ - public static int[] asIndexArray(BloomFilter filter) { + static int[] asIndexArray(BloomFilter filter) { List lst = new ArrayList(); filter.forEachIndex(lst::add); return lst.stream().mapToInt(Integer::intValue).toArray(); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index 97681e7b88..a0caace3e4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -43,7 +43,7 @@ public interface IndexProducer { * @param producer the @{code BitMapProducer} * @return a new @{code IndexProducer}. */ - public static IndexProducer fromBitMapProducer(BitMapProducer producer) { + static IndexProducer fromBitMapProducer(BitMapProducer producer) { Objects.requireNonNull(producer, "producer"); return new IndexProducer() { @Override diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java index 0f7acce630..d82548cd99 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java @@ -65,7 +65,7 @@ private static class CardCounter implements LongConsumer { * @param op2 The operation to execute when there are two BitMaps to compare. * @param op1 The operation to execute when there is only one BitMap to cmpare. 
*/ - public CardCounter(BitMapProducer producer, Shape shape, LongBinaryOperator op2, LongUnaryOperator op1) { + CardCounter(BitMapProducer producer, Shape shape, LongBinaryOperator op2, LongUnaryOperator op1) { BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); producer.forEachBitMap(builder); this.bitMaps = builder.getArray(); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java index 9491de9bca..4c00ea13e4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java @@ -17,4 +17,4 @@ /** * Exceptions specific to Bloom filter processing. */ -package org.apache.commons.collections4.bloomfilter.exceptions; \ No newline at end of file +package org.apache.commons.collections4.bloomfilter.exceptions; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 3c4912138b..561736a80d 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -72,7 +72,7 @@ default boolean isEmpty() { *

This is conceptually a unique filter implemented as a {@code Predicate}.

* @since 4.5 */ - public class Filter { + class Filter { private long[] bits; private int size; @@ -116,7 +116,7 @@ public boolean test(int number) { * * @since 4.5 */ - public class FilteredIntConsumer implements IntConsumer { + class FilteredIntConsumer implements IntConsumer { private Hasher.Filter filter; private IntConsumer consumer; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java index 22795b94ae..64adb12c3b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java @@ -50,7 +50,7 @@ public final class SimpleHasher implements Hasher { * @param len the length of the extraction, may be longer than 8. * @return */ - private static final long toLong(byte[] byteArray, int offset, int len) { + private static long toLong(byte[] byteArray, int offset, int len) { long val = 0; len = Math.min(len, Long.BYTES); for (int i = 0; i < len; i++) { diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 076b098b17..b753c196d0 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -29,19 +29,16 @@ */ public abstract class AbstractBloomFilterTest { - protected final SimpleHasher from1 = new SimpleHasher( 1, 1 ); + protected final SimpleHasher from1 = new SimpleHasher(1, 1); protected final long from1Value = 0x3FFFEL; - protected final SimpleHasher from11 = new SimpleHasher( 11, 1 ); + protected final SimpleHasher from11 = new SimpleHasher(11, 1); protected final long from11Value = 0xFFFF800L; - protected final HasherCollection bigHasher = new 
HasherCollection( from1, from11 ); + protected final HasherCollection bigHasher = new HasherCollection(from1, from11); protected final long bigHashValue = 0xFFFFFFEL; - protected final HasherCollection fullHasher = new HasherCollection( - new SimpleHasher(0,1)/*0-16*/, - new SimpleHasher(17,1)/*17-33*/, - new SimpleHasher(33,1)/*33-49*/, - new SimpleHasher(50,1)/*50-66*/, - new SimpleHasher(67,1)/*67-83*/ - ); + protected final HasherCollection fullHasher = new HasherCollection(new SimpleHasher(0, 1)/* 0-16 */, + new SimpleHasher(17, 1)/* 17-33 */, new SimpleHasher(33, 1)/* 33-49 */, new SimpleHasher(50, 1)/* 50-66 */, + new SimpleHasher(67, 1)/* 67-83 */ + ); protected final long[] fullHashValue = { 0xFFFFFFFFFFFFFFFFL, 0xFFFFFL }; /** @@ -66,7 +63,6 @@ public abstract class AbstractBloomFilterTest { */ protected abstract T createFilter(Shape shape, Hasher hasher); - /** * Tests that the andCardinality calculations are correct. * @@ -74,21 +70,21 @@ public abstract class AbstractBloomFilterTest { */ @Test public void containsTest() { - final BloomFilter bf = createFilter( shape, from1 ); - final BloomFilter bf2 = createFilter( shape, bigHasher ); + final BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createFilter(shape, bigHasher); - assertTrue( "BF Should contain itself", bf.contains(bf)); - assertTrue( "BF2 Should contain itself", bf2.contains(bf2)); - assertFalse( "BF should not contain BF2",bf.contains(bf2)); - assertTrue( "BF2 should contain BF", bf2.contains(bf)); + assertTrue("BF Should contain itself", bf.contains(bf)); + assertTrue("BF2 Should contain itself", bf2.contains(bf2)); + assertFalse("BF should not contain BF2", bf.contains(bf2)); + assertTrue("BF2 should contain BF", bf2.contains(bf)); } @Test public void containsTest_Hasher() { - final BloomFilter bf = createFilter( shape, bigHasher ); + final BloomFilter bf = createFilter(shape, bigHasher); - assertTrue( "BF Should contain this hasher", bf.contains( new SimpleHasher( 
1, 1 ))); - assertFalse( "BF Should not contain this hasher", bf.contains( new SimpleHasher( 1, 3 ))); + assertTrue("BF Should contain this hasher", bf.contains(new SimpleHasher(1, 1))); + assertFalse("BF Should not contain this hasher", bf.contains(new SimpleHasher(1, 3))); } /** @@ -99,8 +95,8 @@ public void containsTest_Hasher() { @Test public void estimateIntersectionTest() { - final BloomFilter bf = createFilter( shape, from1 ); - final BloomFilter bf2 = createFilter( shape, bigHasher ); + final BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createFilter(shape, bigHasher); assertEquals(1.0, bf.estimateIntersection(bf2)); assertEquals(1.0, bf2.estimateIntersection(bf)); @@ -108,8 +104,8 @@ public void estimateIntersectionTest() { @Test public void estimateIntersectionTest_empty() { - final BloomFilter bf = createFilter( shape, from1 ); - final BloomFilter bf2 = createEmptyFilter( shape); + final BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createEmptyFilter(shape); assertEquals(0.0, bf.estimateIntersection(bf2)); assertEquals(0.0, bf2.estimateIntersection(bf)); @@ -122,9 +118,9 @@ public void estimateIntersectionTest_empty() { */ @Test public void estimateUnionTest() { - final BloomFilter bf = createFilter( shape, from1 ); + final BloomFilter bf = createFilter(shape, from1); - final BloomFilter bf2 = createFilter( shape, from11 ); + final BloomFilter bf2 = createFilter(shape, from11); assertEquals(2.0, bf.estimateUnion(bf2)); assertEquals(2.0, bf2.estimateUnion(bf)); @@ -132,15 +128,13 @@ public void estimateUnionTest() { @Test public void estimateUnionTest_empty() { - final BloomFilter bf = createFilter( shape, from1 ); - final BloomFilter bf2 = createEmptyFilter( shape); + final BloomFilter bf = createFilter(shape, from1); + final BloomFilter bf2 = createEmptyFilter(shape); assertEquals(1.0, bf.estimateUnion(bf2)); assertEquals(1.0, bf2.estimateUnion(bf)); } - - /** * Tests that the size estimate is correctly 
calculated. */ @@ -152,16 +146,15 @@ public void estimateNTest() { // the data provided above do not generate an estimate that is equivalent to the // actual. - filter1.mergeInPlace( new SimpleHasher( 4, 1 )); + filter1.mergeInPlace(new SimpleHasher(4, 1)); - assertEquals(1, filter1.estimateN() ); + assertEquals(1, filter1.estimateN()); - filter1.mergeInPlace( new SimpleHasher( 17, 1 )); + filter1.mergeInPlace(new SimpleHasher(17, 1)); - assertEquals(3, filter1.estimateN() ); + assertEquals(3, filter1.estimateN()); } - /** * Tests that creating an empty hasher works as expected. */ @@ -169,7 +162,7 @@ public void estimateNTest() { public final void constructorTest_Empty() { final BloomFilter bf = createEmptyFilter(shape); - final long[] lb = BloomFilter.asBitMapArray( bf ); + final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(0, lb.length); } @@ -178,7 +171,7 @@ public final void constructorTest_Empty() { */ @Test public final void constructorTest_Hasher() { - Hasher hasher = new SimpleHasher(0,1); + Hasher hasher = new SimpleHasher(0, 1); final BloomFilter bf = createFilter(shape, hasher); final long[] lb = BloomFilter.asBitMapArray(bf); @@ -186,15 +179,14 @@ public final void constructorTest_Hasher() { assertEquals(1, lb.length); } - /** * Tests that getBits() works correctly when multiple long values are returned. 
*/ @Test public final void getBitsTest_SpanLong() { - final SimpleHasher hasher = new SimpleHasher(63,1); - final BloomFilter bf = createFilter(new Shape(2, 72), hasher ); + final SimpleHasher hasher = new SimpleHasher(63, 1); + final BloomFilter bf = createFilter(new Shape(2, 72), hasher); final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(2, lb.length); assertEquals(0x8000000000000000L, lb[0]); @@ -211,10 +203,10 @@ public final void isFullTest() { BloomFilter filter = createEmptyFilter(shape); assertFalse("Should not be full", filter.isFull(shape)); - filter = createFilter( shape, fullHasher ); + filter = createFilter(shape, fullHasher); assertTrue("Should be full", filter.isFull(shape)); - filter = createFilter( shape, new SimpleHasher( 1, 3 )); + filter = createFilter(shape, new SimpleHasher(1, 3)); assertFalse("Should not be full", filter.isFull(shape)); } @@ -224,30 +216,30 @@ public final void isFullTest() { @Test public final void mergeTest_Bloomfilter() { - final BloomFilter bf1 = createFilter( shape, from1); + final BloomFilter bf1 = createFilter(shape, from1); - final BloomFilter bf2 = createFilter( shape, from11); + final BloomFilter bf2 = createFilter(shape, from11); final BloomFilter bf3 = bf1.merge(bf2); - assertTrue( "Should contain", bf3.contains( bf1 )); - assertTrue( "Should contain", bf3.contains( bf2 )); + assertTrue("Should contain", bf3.contains(bf1)); + assertTrue("Should contain", bf3.contains(bf2)); final BloomFilter bf4 = bf2.merge(bf1); - assertTrue( "Should contain", bf4.contains( bf1 )); - assertTrue( "Should contain", bf4.contains( bf2 )); - assertTrue( "Should contain", bf4.contains( bf3 )); - assertTrue( "Should contain", bf3.contains( bf4 )); + assertTrue("Should contain", bf4.contains(bf1)); + assertTrue("Should contain", bf4.contains(bf2)); + assertTrue("Should contain", bf4.contains(bf3)); + assertTrue("Should contain", bf3.contains(bf4)); } @Test public final void mergeTest_Hasher() { - final BloomFilter bf1 = 
createFilter( shape, from1); - final BloomFilter bf2 = createFilter( shape, from11); + final BloomFilter bf1 = createFilter(shape, from1); + final BloomFilter bf2 = createFilter(shape, from11); - final BloomFilter bf3 = bf1.merge( from11 ); - assertTrue( "Should contain", bf3.contains( bf1 )); - assertTrue( "Should contain", bf3.contains( bf2 )); + final BloomFilter bf3 = bf1.merge(from11); + assertTrue("Should contain", bf3.contains(bf1)); + assertTrue("Should contain", bf3.contains(bf2)); } /** @@ -256,32 +248,32 @@ public final void mergeTest_Hasher() { @Test public final void mergeInPlaceTest_Bloomfilter() { - final BloomFilter bf1 = createFilter( shape, from1); + final BloomFilter bf1 = createFilter(shape, from1); - final BloomFilter bf2 = createFilter( shape, from11); + final BloomFilter bf2 = createFilter(shape, from11); final BloomFilter bf3 = bf1.merge(bf2); - bf1.mergeInPlace( bf2 ); + bf1.mergeInPlace(bf2); - assertTrue( "Should contain", bf1.contains( bf2 )); - assertTrue( "Should contain", bf1.contains( bf3 )); + assertTrue("Should contain", bf1.contains(bf2)); + assertTrue("Should contain", bf1.contains(bf3)); } @Test public final void mergeInPlaceTest_Hasher() { - final BloomFilter bf1 = createFilter( shape, from1); + final BloomFilter bf1 = createFilter(shape, from1); - final BloomFilter bf2 = createFilter( shape, from11); + final BloomFilter bf2 = createFilter(shape, from11); final BloomFilter bf3 = bf1.merge(bf2); - bf1.mergeInPlace( from11 ); + bf1.mergeInPlace(from11); - assertTrue( "Should contain Bf2", bf1.contains( bf2 )); - assertTrue( "Should contain Bf3", bf1.contains( bf3 )); + assertTrue("Should contain Bf2", bf1.contains(bf2)); + assertTrue("Should contain Bf3", bf1.contains(bf3)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java index 8a75bc998d..53acc6678e 100644 --- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -26,19 +26,18 @@ /** * Tests for the {@link ArrayCountingBloomFilter}. */ -public abstract class AbstractCountingBloomFilterTest extends AbstractBloomFilterTest { - protected int[] from1Counts = { 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0 }; - protected int[] from11Counts = { 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0 }; - protected int[] bigHashCounts = { 0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,0 }; +public abstract class AbstractCountingBloomFilterTest + extends AbstractBloomFilterTest { + protected int[] from1Counts = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + protected int[] from11Counts = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + protected int[] bigHashCounts = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; protected final BitCountProducer maximumValueProducer = new BitCountProducer() { - @Override public void forEachCount(BitCountProducer.BitCountConsumer consumer) { - for (int i=1;i<18;i++) - { - consumer.accept( i, Integer.MAX_VALUE ); + for (int i = 1; i < 18; i++) { + consumer.accept(i, Integer.MAX_VALUE); } } }; @@ -72,27 +71,25 @@ private static void assertCounts(final CountingBloomFilter bf, final int[] expec @Test public void constructorTest_Hasher_Duplicates() { // bit hasher has duplicates for 11, 12,13,14,15,16, and 17 - final CountingBloomFilter bf = createFilter( shape, from1); - bf.add( BitCountProducer.from( from11.indices(shape)) ); + final CountingBloomFilter bf = createFilter(shape, from1); + bf.add(BitCountProducer.from(from11.indices(shape))); final long[] lb = BloomFilter.asBitMapArray(bf); assertEquals(1, lb.length); assertEquals(bigHashValue, lb[0]); - assertCounts(bf, bigHashCounts ); + 
assertCounts(bf, bigHashCounts); } - - @Test public void containsTest_Mixed() { - final BloomFilter bf = new SimpleBloomFilter( shape, from1 ); - final CountingBloomFilter bf2 = createFilter( shape, bigHasher ); + final BloomFilter bf = new SimpleBloomFilter(shape, from1); + final CountingBloomFilter bf2 = createFilter(shape, bigHasher); - assertTrue( "BF Should contain itself", bf.contains(bf)); - assertTrue( "BF2 Should contain itself", bf2.contains(bf2)); - assertFalse( "BF should not contain BF2",bf.contains(bf2)); - assertTrue( "BF2 should contain BF", bf2.contains(bf)); + assertTrue("BF Should contain itself", bf.contains(bf)); + assertTrue("BF2 Should contain itself", bf2.contains(bf2)); + assertFalse("BF should not contain BF2", bf.contains(bf2)); + assertTrue("BF2 should contain BF", bf2.contains(bf)); } /** @@ -100,19 +97,19 @@ public void containsTest_Mixed() { */ @Test public final void mergeTest_Mixed() { - final BloomFilter bf1 = createFilter( shape, from1); + final BloomFilter bf1 = createFilter(shape, from1); - final BloomFilter bf2 = new SimpleBloomFilter( shape, from11); + final BloomFilter bf2 = new SimpleBloomFilter(shape, from11); final BloomFilter bf3 = bf1.merge(bf2); - assertTrue( "Should contain", bf3.contains( bf1 )); - assertTrue( "Should contain", bf3.contains( bf2 )); + assertTrue("Should contain", bf3.contains(bf1)); + assertTrue("Should contain", bf3.contains(bf2)); final BloomFilter bf4 = bf2.merge(bf1); - assertTrue( "Should contain", bf4.contains( bf1 )); - assertTrue( "Should contain", bf4.contains( bf2 )); - assertTrue( "Should contain", bf4.contains( bf3 )); - assertTrue( "Should contain", bf3.contains( bf4 )); + assertTrue("Should contain", bf4.contains(bf1)); + assertTrue("Should contain", bf4.contains(bf2)); + assertTrue("Should contain", bf4.contains(bf3)); + assertTrue("Should contain", bf3.contains(bf4)); } /** @@ -121,23 +118,23 @@ public final void mergeTest_Mixed() { */ @Test public void addTest() { - final 
CountingBloomFilter bf1 = createFilter( shape, from1); - assertTrue( "Add should work", bf1.add(createFilter( shape, from11)) ); - assertTrue( "Should contain", bf1.contains( from1 )); - assertTrue( "Should contain", bf1.contains( from11 )); - assertCounts(bf1, bigHashCounts ); + final CountingBloomFilter bf1 = createFilter(shape, from1); + assertTrue("Add should work", bf1.add(createFilter(shape, from11))); + assertTrue("Should contain", bf1.contains(from1)); + assertTrue("Should contain", bf1.contains(from11)); + assertCounts(bf1, bigHashCounts); } @Test public void addTest_overflow() { - final CountingBloomFilter bf1 = createEmptyFilter( shape); - assertTrue( "Should add to empty", bf1.add( maximumValueProducer )); - assertTrue( "Should be valid", bf1.isValid() ); + final CountingBloomFilter bf1 = createEmptyFilter(shape); + assertTrue("Should add to empty", bf1.add(maximumValueProducer)); + assertTrue("Should be valid", bf1.isValid()); - assertFalse( "Should not add", bf1.add( createFilter( shape, from1) )); - assertFalse( "Should not be valid", bf1.isValid() ); + assertFalse("Should not add", bf1.add(createFilter(shape, from1))); + assertFalse("Should not be valid", bf1.isValid()); } /** @@ -146,14 +143,14 @@ public void addTest_overflow() { */ @Test public void subtractTest() { - final CountingBloomFilter bf1 = createFilter( shape, from1); - bf1.add( BitCountProducer.from( from11.indices(shape))); + final CountingBloomFilter bf1 = createFilter(shape, from1); + bf1.add(BitCountProducer.from(from11.indices(shape))); - final CountingBloomFilter bf2 = createFilter( shape, from11); + final CountingBloomFilter bf2 = createFilter(shape, from11); - assertTrue( "Subtract should work", bf1.subtract(bf2) ); - assertFalse( "Should not contain bitHasher", bf1.contains( bigHasher )); - assertTrue( "Should contain from1", bf1.contains( from1 )); + assertTrue("Subtract should work", bf1.subtract(bf2)); + assertFalse("Should not contain bitHasher", bf1.contains(bigHasher)); + 
assertTrue("Should contain from1", bf1.contains(from1)); assertCounts(bf1, from1Counts); @@ -165,34 +162,33 @@ public void subtractTest() { */ @Test public void subtractTest_underflow() { - final CountingBloomFilter bf1 = createFilter( shape, from1); + final CountingBloomFilter bf1 = createFilter(shape, from1); - final CountingBloomFilter bf2 = createFilter( shape, from11); + final CountingBloomFilter bf2 = createFilter(shape, from11); - assertFalse( "Subtract should not work", bf1.subtract(bf2) ); - assertFalse( "isValid should return false", bf1.isValid()); - assertFalse( "Should not contain", bf1.contains( from1 )); - assertFalse( "Should not contain", bf1.contains( bf2 )); + assertFalse("Subtract should not work", bf1.subtract(bf2)); + assertFalse("isValid should return false", bf1.isValid()); + assertFalse("Should not contain", bf1.contains(from1)); + assertFalse("Should not contain", bf1.contains(bf2)); - assertCounts(bf1, new int[] { 0,1,1,1,1,1,1,1,1,1,1,0}); + assertCounts(bf1, new int[] { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }); } - /** * Tests that merge correctly updates the counts when a CountingBloomFilter is * passed. 
*/ @Test public void removeTest() { - final CountingBloomFilter bf1 = createFilter( shape, from1); - bf1.add( BitCountProducer.from( from11.indices(shape))); + final CountingBloomFilter bf1 = createFilter(shape, from1); + bf1.add(BitCountProducer.from(from11.indices(shape))); - assertTrue( "Remove should work", bf1.remove(new SimpleBloomFilter( shape, from11)) ); - assertFalse( "Should not contain", bf1.contains( from11 )); - assertTrue( "Should contain", bf1.contains( from1 )); + assertTrue("Remove should work", bf1.remove(new SimpleBloomFilter(shape, from11))); + assertFalse("Should not contain", bf1.contains(from11)); + assertTrue("Should contain", bf1.contains(from1)); - assertCounts(bf1, from1Counts ); + assertCounts(bf1, from1Counts); } @@ -202,29 +198,28 @@ public void removeTest() { */ @Test public void removeTest_underflow() { - final CountingBloomFilter bf1 = createFilter( shape, from1); + final CountingBloomFilter bf1 = createFilter(shape, from1); - final BloomFilter bf2 = new SimpleBloomFilter( shape, from11); + final BloomFilter bf2 = new SimpleBloomFilter(shape, from11); - assertFalse( "Subtract should not work", bf1.remove(bf2) ); - assertFalse( "isValid should return false", bf1.isValid()); - assertFalse( "Should not contain", bf1.contains( from1 )); - assertFalse( "Should not contain", bf1.contains( bf2 )); + assertFalse("Subtract should not work", bf1.remove(bf2)); + assertFalse("isValid should return false", bf1.isValid()); + assertFalse("Should not contain", bf1.contains(from1)); + assertFalse("Should not contain", bf1.contains(bf2)); - assertCounts(bf1, new int[] { 0,1,1,1,1,1,1,1,1,1,1}); + assertCounts(bf1, new int[] { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); } @Test public void mergeTest_overflow() { - final CountingBloomFilter bf1 = createEmptyFilter( shape); - assertTrue( "Should add to empty", bf1.add( maximumValueProducer )); - assertTrue( "Should be valid", bf1.isValid() ); + final CountingBloomFilter bf1 = createEmptyFilter(shape); + 
assertTrue("Should add to empty", bf1.add(maximumValueProducer)); + assertTrue("Should be valid", bf1.isValid()); - CountingBloomFilter bf2 = bf1.merge(new SimpleBloomFilter( shape, from1)); - assertFalse( "Should not be valid", bf2.isValid() ); + CountingBloomFilter bf2 = bf1.merge(new SimpleBloomFilter(shape, from1)); + assertFalse("Should not be valid", bf2.isValid()); } - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index 0cc70c459c..117194b6a1 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -25,13 +25,13 @@ public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTes @Override protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { - return new ArrayCountingBloomFilter( shape ); + return new ArrayCountingBloomFilter(shape); } @Override protected ArrayCountingBloomFilter createFilter(Shape shape, Hasher hasher) { - ArrayCountingBloomFilter filter = createEmptyFilter( shape ); - filter.add( BitCountProducer.from( hasher.indices(shape))); + ArrayCountingBloomFilter filter = createEmptyFilter(shape); + filter.add(BitCountProducer.from(hasher.indices(shape))); return filter; } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java index 707f02c5db..760b7fd435 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerTest.java @@ -17,13 +17,8 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; 
-import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; -import java.util.List; import java.util.Map; -import java.util.Set; import java.util.function.IntConsumer; import org.junit.Test; @@ -36,26 +31,26 @@ public void fromIndexProducer() { @Override public void forEachIndex(IntConsumer consumer) { - consumer.accept( 0 ); - consumer.accept( 1 ); - consumer.accept( 63 ); - consumer.accept( 64 ); - consumer.accept( 127 ); - consumer.accept( 128 ); + consumer.accept(0); + consumer.accept(1); + consumer.accept(63); + consumer.accept(64); + consumer.accept(127); + consumer.accept(128); } }; BitCountProducer producer = BitCountProducer.from(iProducer); - Map m = new HashMap(); + Map m = new HashMap(); - producer.forEachCount( (i,v) -> m.put(i, v)); + producer.forEachCount((i, v) -> m.put(i, v)); - assertEquals( 6, m.size()); - assertEquals( Integer.valueOf(1), m.get(0)); - assertEquals( Integer.valueOf(1), m.get(1)); - assertEquals( Integer.valueOf(1), m.get(63)); - assertEquals( Integer.valueOf(1), m.get(64)); - assertEquals( Integer.valueOf(1), m.get(127)); - assertEquals( Integer.valueOf(1), m.get(128)); + assertEquals(6, m.size()); + assertEquals(Integer.valueOf(1), m.get(0)); + assertEquals(Integer.valueOf(1), m.get(1)); + assertEquals(Integer.valueOf(1), m.get(63)); + assertEquals(Integer.valueOf(1), m.get(64)); + assertEquals(Integer.valueOf(1), m.get(127)); + assertEquals(Integer.valueOf(1), m.get(128)); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java index 5d26a8fdc5..4e23147daf 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java @@ -31,24 +31,24 @@ public void fromIndexProducer() { @Override public void forEachIndex(IntConsumer consumer) { - consumer.accept( 0 ); - 
consumer.accept( 1 ); - consumer.accept( 63 ); - consumer.accept( 64 ); - consumer.accept( 127 ); - consumer.accept( 128 ); + consumer.accept(0); + consumer.accept(1); + consumer.accept(63); + consumer.accept(64); + consumer.accept(127); + consumer.accept(128); } }; - BitMapProducer producer = BitMapProducer.fromIndexProducer(iProducer, new Shape( 1, 200 )); + BitMapProducer producer = BitMapProducer.fromIndexProducer(iProducer, new Shape(1, 200)); List lst = new ArrayList(); - producer.forEachBitMap( lst::add ); - long[] buckets = lst.stream().mapToLong( l -> l.longValue()).toArray(); - assertTrue( BitMap.contains( buckets, 0)); - assertTrue( BitMap.contains( buckets, 1)); - assertTrue( BitMap.contains( buckets, 63)); - assertTrue( BitMap.contains( buckets, 64)); - assertTrue( BitMap.contains( buckets, 127)); - assertTrue( BitMap.contains( buckets, 128)); + producer.forEachBitMap(lst::add); + long[] buckets = lst.stream().mapToLong(l -> l.longValue()).toArray(); + assertTrue(BitMap.contains(buckets, 0)); + assertTrue(BitMap.contains(buckets, 1)); + assertTrue(BitMap.contains(buckets, 63)); + assertTrue(BitMap.contains(buckets, 64)); + assertTrue(BitMap.contains(buckets, 127)); + assertTrue(BitMap.contains(buckets, 128)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java index d8e1459ad4..0d6d575087 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java @@ -19,7 +19,6 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; import org.junit.Test; @@ -37,80 +36,77 @@ public void checkPositiveTest() { } } - - @Test public void getLongBitTest() { - assertEquals( 1, BitMap.getLongBit(0) ); - assertEquals( 0x8000000000000000L, 
BitMap.getLongBit( 63 ) ); - assertEquals( 1, BitMap.getLongBit( 64) ); - assertEquals( 0x8000000000000000L, BitMap.getLongBit( 127 ) ); - assertEquals( 1, BitMap.getLongBit( 128 ) ); + assertEquals(1, BitMap.getLongBit(0)); + assertEquals(0x8000000000000000L, BitMap.getLongBit(63)); + assertEquals(1, BitMap.getLongBit(64)); + assertEquals(0x8000000000000000L, BitMap.getLongBit(127)); + assertEquals(1, BitMap.getLongBit(128)); } @Test public void getLongIndexTest() { - assertEquals( 0, BitMap.getLongIndex(0) ); - assertEquals( 0, BitMap.getLongIndex( 63 ) ); - assertEquals( 1, BitMap.getLongIndex( 64) ); - assertEquals( 1, BitMap.getLongIndex( 127 ) ); - assertEquals( 2, BitMap.getLongIndex( 128 ) ); + assertEquals(0, BitMap.getLongIndex(0)); + assertEquals(0, BitMap.getLongIndex(63)); + assertEquals(1, BitMap.getLongIndex(64)); + assertEquals(1, BitMap.getLongIndex(127)); + assertEquals(2, BitMap.getLongIndex(128)); } - @Test public void isSparseTest() { - Shape shape = new Shape( 17, 64 ); - assertTrue( BitMap.isSparse(0, shape) ); - assertTrue( BitMap.isSparse(1, shape) ); - assertTrue( BitMap.isSparse(2, shape) ); - assertFalse( BitMap.isSparse(3, shape) ); + Shape shape = new Shape(17, 64); + assertTrue(BitMap.isSparse(0, shape)); + assertTrue(BitMap.isSparse(1, shape)); + assertTrue(BitMap.isSparse(2, shape)); + assertFalse(BitMap.isSparse(3, shape)); - shape = new Shape( 17, 64*3 ); + shape = new Shape(17, 64 * 3); - for (int i=0;i<7; i++) { - assertTrue( BitMap.isSparse(i, shape) ); + for (int i = 0; i < 7; i++) { + assertTrue(BitMap.isSparse(i, shape)); } - assertFalse( BitMap.isSparse(7, shape) ); + assertFalse(BitMap.isSparse(7, shape)); } @Test public void numberOfBitMapsTest() { - assertEquals( "Number of bits 0", 0, BitMap.numberOfBitMaps(0)); - for (int i = 1;i<65;i++) { - assertEquals( String.format( "Number of bits %d", i ), 1, BitMap.numberOfBitMaps(i)); + assertEquals("Number of bits 0", 0, BitMap.numberOfBitMaps(0)); + for (int i = 1; i < 65; 
i++) { + assertEquals(String.format("Number of bits %d", i), 1, BitMap.numberOfBitMaps(i)); } - for (int i = 65;i<129;i++) { - assertEquals( String.format( "Number of bits %d", i ),2, BitMap.numberOfBitMaps(i)); + for (int i = 65; i < 129; i++) { + assertEquals(String.format("Number of bits %d", i), 2, BitMap.numberOfBitMaps(i)); } - assertEquals( "Number of bits 129", 3, BitMap.numberOfBitMaps(129)); + assertEquals("Number of bits 129", 3, BitMap.numberOfBitMaps(129)); } @Test public void setTest() { - long[] bitMaps = new long[ BitMap.numberOfBitMaps(129)]; - for (int i=0;i<129;i++) { - BitMap.set( bitMaps, i); - assertTrue( String.format("Failed at index: %d",i), BitMap.contains( bitMaps, i)); + long[] bitMaps = new long[BitMap.numberOfBitMaps(129)]; + for (int i = 0; i < 129; i++) { + BitMap.set(bitMaps, i); + assertTrue(String.format("Failed at index: %d", i), BitMap.contains(bitMaps, i)); } - assertEquals( 0xFFFFFFFFFFFFFFFFL, bitMaps[0] ); - assertEquals( 0xFFFFFFFFFFFFFFFFL, bitMaps[1] ); - assertEquals( 1L, bitMaps[2] ); + assertEquals(0xFFFFFFFFFFFFFFFFL, bitMaps[0]); + assertEquals(0xFFFFFFFFFFFFFFFFL, bitMaps[1]); + assertEquals(1L, bitMaps[2]); } @Test public void containsTest() { - long[] bitMaps = new long[ 1 ]; - - for (int i=0;i<64;i++) { - bitMaps[0] = 0l; - BitMap.set( bitMaps, i); - for (int j=0;j<64;j++) { - if (j==i) { - assertTrue( String.format("Failed at index: %d for %d",i,j), BitMap.contains( bitMaps, j)); + long[] bitMaps = new long[1]; + + for (int i = 0; i < 64; i++) { + bitMaps[0] = 0L; + BitMap.set(bitMaps, i); + for (int j = 0; j < 64; j++) { + if (j == i) { + assertTrue(String.format("Failed at index: %d for %d", i, j), BitMap.contains(bitMaps, j)); } else { - assertFalse( String.format("Failed at index %d for %d",i,j), BitMap.contains( bitMaps, j)); + assertFalse(String.format("Failed at index %d for %d", i, j), BitMap.contains(bitMaps, j)); } } @@ -121,18 +117,18 @@ public void containsTest() { public void 
contains_boundaryConditionTest() { long[] ary = new long[1]; - assertFalse( BitMap.contains(ary, 0) ); + assertFalse(BitMap.contains(ary, 0)); ary[0] = 0x01; - assertTrue( BitMap.contains(ary, 0) ); + assertTrue(BitMap.contains(ary, 0)); - assertFalse( BitMap.contains(ary, 63) ); + assertFalse(BitMap.contains(ary, 63)); ary[0] = (1L << 63); - assertTrue( BitMap.contains(ary, 63) ); + assertTrue(BitMap.contains(ary, 63)); ary = new long[2]; - assertFalse( BitMap.contains(ary, 64) ); + assertFalse(BitMap.contains(ary, 64)); ary[1] = 1; - assertTrue( BitMap.contains(ary, 64) ); + assertTrue(BitMap.contains(ary, 64)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java index 79c31171e7..c97957424c 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java @@ -13,7 +13,8 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
- */package org.apache.commons.collections4.bloomfilter; + */ +package org.apache.commons.collections4.bloomfilter; import static org.junit.Assert.assertEquals; @@ -29,35 +30,32 @@ public IndexProducerTest() { } - - @Test public void fromBitMapProducerTest() { - TestingBitMapProducer producer = new TestingBitMapProducer( new long[] { 1L, 2L, 3L } ); - IndexProducer underTest = IndexProducer.fromBitMapProducer( producer ); + TestingBitMapProducer producer = new TestingBitMapProducer(new long[] { 1L, 2L, 3L }); + IndexProducer underTest = IndexProducer.fromBitMapProducer(producer); List lst = new ArrayList(); - underTest.forEachIndex( lst::add ); - assertEquals( 4, lst.size() ); - assertEquals( Integer.valueOf(0), lst.get(0) ); - assertEquals( Integer.valueOf(1+64), lst.get(1) ); - assertEquals( Integer.valueOf(0+128), lst.get(2) ); - assertEquals( Integer.valueOf(1+128), lst.get(3) ); + underTest.forEachIndex(lst::add); + assertEquals(4, lst.size()); + assertEquals(Integer.valueOf(0), lst.get(0)); + assertEquals(Integer.valueOf(1 + 64), lst.get(1)); + assertEquals(Integer.valueOf(0 + 128), lst.get(2)); + assertEquals(Integer.valueOf(1 + 128), lst.get(3)); - producer = new TestingBitMapProducer( new long[] { 0xFFFFFFFFFFFFFFFFL } ); - underTest = IndexProducer.fromBitMapProducer( producer ); + producer = new TestingBitMapProducer(new long[] { 0xFFFFFFFFFFFFFFFFL }); + underTest = IndexProducer.fromBitMapProducer(producer); lst = new ArrayList(); - underTest.forEachIndex( lst::add ); + underTest.forEachIndex(lst::add); - assertEquals( 64, lst.size() ); - for (int i=0;i<64;i++) { - assertEquals( Integer.valueOf(i), lst.get(i) ); + assertEquals(64, lst.size()); + for (int i = 0; i < 64; i++) { + assertEquals(Integer.valueOf(i), lst.get(i)); } } - private class TestingBitMapProducer implements BitMapProducer { long[] values; @@ -73,6 +71,4 @@ public void forEachBitMap(LongConsumer consumer) { } } - - } diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index d3fea4788a..5c9b7cd405 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -29,12 +29,11 @@ */ public class SetOperationsTest { - - protected final SimpleHasher from1 = new SimpleHasher( 1, 1 ); + protected final SimpleHasher from1 = new SimpleHasher(1, 1); protected final long from1Value = 0x3FFFEL; - protected final SimpleHasher from11 = new SimpleHasher( 11, 1 ); + protected final SimpleHasher from11 = new SimpleHasher(11, 1); protected final long from11Value = 0xFFFF800L; - protected final HasherCollection bigHasher = new HasherCollection( from1, from11 ); + protected final HasherCollection bigHasher = new HasherCollection(from1, from11); protected final long bigHashValue = 0xFFFFFFEL; private final Shape shape = new Shape(17, 72); @@ -47,18 +46,16 @@ public final void cosineDistanceTest() { BloomFilter filter1 = new SimpleBloomFilter(shape, from1); BloomFilter filter2 = new SimpleBloomFilter(shape, from1); - assertEquals(0.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(0.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); - Shape shape2 = new Shape( 2, 72 ); + Shape shape2 = new Shape(2, 72); filter1 = new SimpleBloomFilter(shape2, from1); - filter2 = new SimpleBloomFilter(shape2, new SimpleHasher( 2, 1 )); + filter2 = new SimpleBloomFilter(shape2, new SimpleHasher(2, 1)); assertEquals(0.5, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(0.5, SetOperations.cosineDistance(filter2, filter1), 0.0001); - filter1 = new SimpleBloomFilter(shape, from1); filter2 = new SimpleBloomFilter(shape, from11); @@ -76,7 +73,6 @@ public final void cosineDistanceTest_NoValues() { BloomFilter filter2 = new 
SimpleBloomFilter(shape); BloomFilter filter3 = new SimpleBloomFilter(shape); - assertEquals(1.0, SetOperations.cosineDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineDistance(filter2, filter1), 0.0001); assertEquals(1.0, SetOperations.cosineDistance(filter2, filter3), 0.0001); @@ -91,7 +87,6 @@ public final void cosineSimilarityTest() { BloomFilter filter1 = new SimpleBloomFilter(shape, from1); BloomFilter filter2 = new SimpleBloomFilter(shape, from1); - assertEquals(1.0, SetOperations.cosineSimilarity(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.cosineSimilarity(filter2, filter1), 0.0001); @@ -118,8 +113,6 @@ public final void cosineSimilarityTest_NoValues() { assertEquals(0.0, SetOperations.cosineSimilarity(filter3, filter1), 0.0001); } - - /** * Tests that the Hamming distance is correctly calculated. */ @@ -131,7 +124,7 @@ public final void hammingDistanceTest() { assertEquals(0, SetOperations.hammingDistance(filter1, filter2)); assertEquals(0, SetOperations.hammingDistance(filter2, filter1)); - filter2 = new SimpleBloomFilter( shape, from11); + filter2 = new SimpleBloomFilter(shape, from11); assertEquals(20, SetOperations.hammingDistance(filter1, filter2)); assertEquals(20, SetOperations.hammingDistance(filter2, filter1)); @@ -148,7 +141,6 @@ public final void jaccardDistanceTest() { assertEquals(1.0, SetOperations.jaccardDistance(filter1, filter2), 0.0001); assertEquals(1.0, SetOperations.jaccardDistance(filter2, filter1), 0.0001); - filter2 = new SimpleBloomFilter(shape, from11); assertEquals(0.26, SetOperations.jaccardDistance(filter1, filter2), 0.001); @@ -206,60 +198,60 @@ public final void jaccardSimilarityTest_NoValues() { @Test public final void orCardinalityTest() { - Shape shape = new Shape( 3, 128); - SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); - SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 5, 
SetOperations.orCardinality(shape, filter1, filter2) ); - assertEquals( 5, SetOperations.orCardinality(shape, filter2, filter1) ); - - filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); - filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 5, SetOperations.orCardinality(shape, filter1, filter2) ); - assertEquals( 5, SetOperations.orCardinality(shape, filter2, filter1) ); - - filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); - filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 4, SetOperations.orCardinality(shape, filter1, filter2) ); - assertEquals( 4, SetOperations.orCardinality(shape, filter2, filter1) ); + Shape shape = new Shape(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(5, SetOperations.orCardinality(shape, filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(5, SetOperations.orCardinality(shape, filter1, filter2)); + assertEquals(5, SetOperations.orCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(5, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(4, SetOperations.orCardinality(shape, filter1, filter2)); + assertEquals(4, SetOperations.orCardinality(shape, filter2, filter1)); } @Test public final void andCardinalityTest() { - Shape shape = new Shape( 3, 128); - SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); - SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 1, SetOperations.andCardinality(shape, filter1, filter2) ); - assertEquals( 1, 
SetOperations.andCardinality(shape, filter2, filter1) ); - - filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63 )); - filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 0, SetOperations.andCardinality(shape, filter1, filter2) ); - assertEquals( 0, SetOperations.andCardinality(shape, filter2, filter1) ); - - filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); - filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 1, SetOperations.andCardinality(shape, filter1, filter2) ); - assertEquals( 1, SetOperations.andCardinality(shape, filter2, filter1) ); + Shape shape = new Shape(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(1, SetOperations.andCardinality(shape, filter1, filter2)); + assertEquals(1, SetOperations.andCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(0, SetOperations.andCardinality(shape, filter1, filter2)); + assertEquals(0, SetOperations.andCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(5, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(1, SetOperations.andCardinality(shape, filter1, filter2)); + assertEquals(1, SetOperations.andCardinality(shape, filter2, filter1)); } @Test public final void xorCardinalityTest() { - Shape shape = new Shape( 3, 128); - SparseBloomFilter filter1 = new SparseBloomFilter( shape, Arrays.asList(1, 63, 64)); - SparseBloomFilter filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 4, SetOperations.xorCardinality(shape, filter1, filter2) ); - assertEquals( 4, SetOperations.xorCardinality(shape, filter2, filter1) ); - - filter1 = new 
SparseBloomFilter( shape, Arrays.asList(1, 63 )); - filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 5, SetOperations.xorCardinality(shape, filter1, filter2) ); - assertEquals( 5, SetOperations.xorCardinality(shape, filter2, filter1) ); - - filter1 = new SparseBloomFilter( shape, Arrays.asList(5, 63 )); - filter2 = new SparseBloomFilter( shape, Arrays.asList(5, 64, 69)); - assertEquals( 3, SetOperations.xorCardinality(shape, filter1, filter2) ); - assertEquals( 3, SetOperations.xorCardinality(shape, filter2, filter1) ); + Shape shape = new Shape(3, 128); + SparseBloomFilter filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63, 64)); + SparseBloomFilter filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(4, SetOperations.xorCardinality(shape, filter1, filter2)); + assertEquals(4, SetOperations.xorCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(1, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(5, SetOperations.xorCardinality(shape, filter1, filter2)); + assertEquals(5, SetOperations.xorCardinality(shape, filter2, filter1)); + + filter1 = new SparseBloomFilter(shape, Arrays.asList(5, 63)); + filter2 = new SparseBloomFilter(shape, Arrays.asList(5, 64, 69)); + assertEquals(3, SetOperations.xorCardinality(shape, filter1, filter2)); + assertEquals(3, SetOperations.xorCardinality(shape, filter2, filter1)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java index 5cab5cd6c3..32a9708b37 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java @@ -19,7 +19,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.fail; - 
import org.junit.jupiter.api.Test; /** @@ -27,7 +26,6 @@ */ public class ShapeFactoryTest { - /* * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= * @@ -40,7 +38,6 @@ public class ShapeFactoryTest { * k = 3 */ - /** * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. */ @@ -59,7 +56,7 @@ public void badNumberOfItemsTest() { // expected } try { - Shape.Factory.fromNP(0, 0.02 ); + Shape.Factory.fromNP(0, 0.02); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // expected @@ -72,19 +69,19 @@ public void badNumberOfItemsTest() { @Test public void badNumberOfBitsTest() { try { - Shape.Factory.fromNM( 5, 0 ); + Shape.Factory.fromNM(5, 0); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // expected } try { - Shape.Factory.fromNMK( 5, 0, 7 ); + Shape.Factory.fromNMK(5, 0, 7); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // expected } try { - Shape.Factory.fromPMK( 0.035, 0, 7 ); + Shape.Factory.fromPMK(0.035, 0, 7); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // expected @@ -116,7 +113,7 @@ public void badNumberOfHashFunctionsTest() { @Test public void badProbabilityTest() { try { - Shape.Factory.fromNMK( 4000, 8, 1); + Shape.Factory.fromNMK(4000, 8, 1); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // expected @@ -128,20 +125,19 @@ public void badProbabilityTest() { // do nothing. } try { - Shape.Factory.fromNP( 10, 1.0); + Shape.Factory.fromNP(10, 1.0); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // do nothing. 
} try { - Shape.Factory.fromNP( 10, Double.NaN); + Shape.Factory.fromNP(10, Double.NaN); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // do nothing. } } - /** * Tests that when the number of items, number of bits and number of hash functions is passed the values are * calculated correctly. @@ -151,11 +147,11 @@ public void fromNMK_test() { /* * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 */ - final Shape filterConfig = Shape.Factory.fromNMK( 5, 24, 4); + final Shape filterConfig = Shape.Factory.fromNMK(5, 24, 4); assertEquals(24, filterConfig.getNumberOfBits()); assertEquals(4, filterConfig.getNumberOfHashFunctions()); - assertEquals(0.102194782, filterConfig.getProbability(5 ), 0.000001); + assertEquals(0.102194782, filterConfig.getProbability(5), 0.000001); } /** @@ -173,9 +169,6 @@ public void fromNM_Test() { assertEquals(0.100375138, filterConfig.getProbability(5), 0.000001); } - - - /** * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. */ @@ -194,15 +187,12 @@ public void numberOfBitsOverflowTest() { */ @Test public void probabilityTest() { - Shape shape = Shape.Factory.fromNMK(5, 24, 3 ); + Shape shape = Shape.Factory.fromNMK(5, 24, 3); assertEquals(24, shape.getNumberOfBits()); assertEquals(3, shape.getNumberOfHashFunctions()); assertEquals(0.100375138, shape.getProbability(5), 0.000001); } - - - /** * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash * functions. 
@@ -212,13 +202,11 @@ public void fromPMK_test() { /* * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 */ - final Shape shape = Shape.Factory.fromPMK( 0.1, 24, 3); + final Shape shape = Shape.Factory.fromPMK(0.1, 24, 3); assertEquals(24, shape.getNumberOfBits()); assertEquals(3, shape.getNumberOfHashFunctions()); assertEquals(0.100375138, shape.getProbability(5), 0.000001); } - - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java index fddf9f6ece..32fbb63109 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -20,7 +20,6 @@ import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.fail; - import org.junit.jupiter.api.Test; /** @@ -28,7 +27,6 @@ */ public class ShapeTest { - /* * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= * @@ -41,7 +39,7 @@ public class ShapeTest { * k = 3 */ - private final Shape shape = new Shape(3, 24 ); + private final Shape shape = new Shape(3, 24); /** * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. @@ -49,52 +47,54 @@ public class ShapeTest { @Test public void constructor_items_bits_BadNumberOfBitsTest() { try { - new Shape( 5, 0); + new Shape(5, 0); fail("Should have thrown IllegalArgumentException"); } catch (final IllegalArgumentException expected) { // expected } } - // /** - // * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_bits_BadNumberOfHashFunctionsTest() { - // try { - // new Shape( 16, 8); + // /** + // * Tests that if the number of hash functions is less than 1 an + // IllegalArgumentException is thrown. 
+ // */ + // @Test + // public void constructor_items_bits_BadNumberOfHashFunctionsTest() { + // try { + // new Shape( 16, 8); // - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } - // /** - // * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_bits_BadNumberOfItemsTest() { - // try { - // new Shape(testFunction, 0, 24); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // /** + // * Tests that if the number of items less than 1 an IllegalArgumentException + // is thrown. + // */ + // @Test + // public void constructor_items_bits_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 24); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } - // /** - // * Tests that if the number of bits is less than 1 an exception is thrown - // */ - // @Test - // public void constructor_items_bits_hash_BadNumberOfBitsTest() { - // try { - // new Shape(testFunction, 5, 0, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // /** + // * Tests that if the number of bits is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_items_bits_hash_BadNumberOfBitsTest() { + // try { + // new Shape(testFunction, 5, 0, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } /** * Tests that if the number of hash functions is less than 1 an 
exception is thrown. @@ -109,268 +109,281 @@ public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { } } - // /** - // * Tests that if the number of items is less than 1 an exception is thrown. - // */ - // @Test - // public void constructor_items_bits_hash_BadNumberOfItemsTest() { - // try { - // new Shape(testFunction, 0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // /** + // * Tests that if the number of items is less than 1 an exception is thrown. + // */ + // @Test + // public void constructor_items_bits_hash_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } - // /** - // * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown - // */ - // @Test - // public void constructor_items_bits_hash_BadProbabilityTest() { - // try { - // new Shape(testFunction, 4000, 8, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // /** + // * Tests that if the calculated probability is greater than or equal to 1 an + // IllegalArgumentException is thrown + // */ + // @Test + // public void constructor_items_bits_hash_BadProbabilityTest() { + // try { + // new Shape(testFunction, 4000, 8, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } - // /** - // * Tests that when the number of items, number of bits and number of hash functions is passed the values are - // * calculated correctly. 
- // */ - // @Test - // public void constructor_items_bits_hashTest() { - // /* - // * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 - // */ - // final Shape filterConfig = new Shape(testFunction, 5, 24, 4); + // /** + // * Tests that when the number of items, number of bits and number of hash + // functions is passed the values are + // * calculated correctly. + // */ + // @Test + // public void constructor_items_bits_hashTest() { + // /* + // * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + // */ + // final Shape filterConfig = new Shape(testFunction, 5, 24, 4); // - // assertEquals(24, filterConfig.getNumberOfBits()); - // assertEquals(4, filterConfig.getNumberOfHashFunctions()); - // assertEquals(5, filterConfig.getNumberOfItems()); - // assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); - // } + // assertEquals(24, filterConfig.getNumberOfBits()); + // assertEquals(4, filterConfig.getNumberOfHashFunctions()); + // assertEquals(5, filterConfig.getNumberOfItems()); + // assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); + // } - // /** - // * Tests that the number of items and number of bits is passed the other values are calculated correctly. - // */ - // @Test - // public void constructor_items_bitsTest() { - // /* - // * values from https://hur.st/bloomfilter/?n=5&m=24 - // */ - // final Shape filterConfig = new Shape(testFunction, 5, 24); + // /** + // * Tests that the number of items and number of bits is passed the other + // values are calculated correctly. 
+ // */ + // @Test + // public void constructor_items_bitsTest() { + // /* + // * values from https://hur.st/bloomfilter/?n=5&m=24 + // */ + // final Shape filterConfig = new Shape(testFunction, 5, 24); // - // assertEquals(24, filterConfig.getNumberOfBits()); - // assertEquals(3, filterConfig.getNumberOfHashFunctions()); - // assertEquals(5, filterConfig.getNumberOfItems()); - // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - // } + // assertEquals(24, filterConfig.getNumberOfBits()); + // assertEquals(3, filterConfig.getNumberOfHashFunctions()); + // assertEquals(5, filterConfig.getNumberOfItems()); + // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); + // } // - // /** - // * Tests that if the number of items is less than 1 an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_probability_BadNumberOfItemsTest() { - // try { - // new Shape(testFunction, 0, 1.0 / 10); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // } + // /** + // * Tests that if the number of items is less than 1 an + // IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_probability_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 1.0 / 10); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // } // - // /** - // * Tests that if the probability is less than or equal to 0 or more than or equal to 1 an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_probability_BadProbabilityTest() { - // try { - // new Shape(testFunction, 10, 0.0); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. 
- // } - // try { - // new Shape(testFunction, 10, 1.0); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // try { - // new Shape(testFunction, 10, Double.NaN); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // } + // /** + // * Tests that if the probability is less than or equal to 0 or more than or + // equal to 1 an IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_probability_BadProbabilityTest() { + // try { + // new Shape(testFunction, 10, 0.0); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // try { + // new Shape(testFunction, 10, 1.0); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // try { + // new Shape(testFunction, 10, Double.NaN); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // } // - // /** - // * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_probability_NumberOfBitsOverflowTest() { - // try { - // new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // } + // /** + // * Tests that if calculated number of bits is greater than Integer.MAX_VALUE + // an IllegalArgumentException is thrown. 
+ // */ + // @Test + // public void constructor_items_probability_NumberOfBitsOverflowTest() { + // try { + // new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // do nothing. + // } + // } // - // /** - // * Tests the the probability is calculated correctly. - // */ - // @Test - // public void constructor_items_probability_Test() { + // /** + // * Tests the the probability is calculated correctly. + // */ + // @Test + // public void constructor_items_probability_Test() { // - // assertEquals(24, shape.getNumberOfBits()); - // assertEquals(3, shape.getNumberOfHashFunctions()); - // assertEquals(5, shape.getNumberOfItems()); - // assertEquals(0.100375138, shape.getProbability(), 0.000001); - // } + // assertEquals(24, shape.getNumberOfBits()); + // assertEquals(3, shape.getNumberOfHashFunctions()); + // assertEquals(5, shape.getNumberOfItems()); + // assertEquals(0.100375138, shape.getProbability(), 0.000001); + // } // - // /** - // * Tests that the constructor with a null name, number of items and size of filter fails. - // */ - // @Test - // public void constructor_nm_noName() { - // try { - // new Shape(null, 5, 72); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } + // /** + // * Tests that the constructor with a null name, number of items and size of + // filter fails. + // */ + // @Test + // public void constructor_nm_noName() { + // try { + // new Shape(null, 5, 72); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } // - // /** - // * Tests that the constructor with a null name, number of items, size of filter, and number of functions fails. 
- // */ - // @Test - // public void constructor_nmk_noName() { - // try { - // new Shape(null, 5, 72, 17); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } + // /** + // * Tests that the constructor with a null name, number of items, size of + // filter, and number of functions fails. + // */ + // @Test + // public void constructor_nmk_noName() { + // try { + // new Shape(null, 5, 72, 17); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } // - // /** - // * Tests that the constructor with a null name, number of items, and probability fails. - // */ - // @Test - // public void constructor_np_noName() { - // try { - // new Shape(null, 5, 0.1); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } + // /** + // * Tests that the constructor with a null name, number of items, and + // probability fails. + // */ + // @Test + // public void constructor_np_noName() { + // try { + // new Shape(null, 5, 0.1); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } // - // /** - // * Tests that the constructor with a null name, probability, size of filter, and number of functions fails. - // */ - // @Test - // public void constructor_pmk_noName() { - // try { - // new Shape(null, 0.1, 72, 17); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } + // /** + // * Tests that the constructor with a null name, probability, size of filter, + // and number of functions fails. 
+ // */ + // @Test + // public void constructor_pmk_noName() { + // try { + // new Shape(null, 0.1, 72, 17); + // fail("Should throw NullPointerException"); + // } catch (final NullPointerException expected) { + // // do nothing + // } + // } // - // /** - // * Tests that if the number of bits is less than 1 an exception is thrown - // */ - // @Test - // public void constructor_probability_bits_hash_BadNumberOfBitsTest() { - // try { - // new Shape(testFunction, 0.5, 0, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // /** + // * Tests that if the number of bits is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_probability_bits_hash_BadNumberOfBitsTest() { + // try { + // new Shape(testFunction, 0.5, 0, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } // - // /** - // * Tests that if the number of functions is less than 1 an exception is thrown - // */ - // @Test - // public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() { - // try { - // new Shape(testFunction, 0.5, 24, 0); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // /** + // * Tests that if the number of functions is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() + // { + // try { + // new Shape(testFunction, 0.5, 24, 0); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } // - // /** - // * Tests that invalid probability values cause and IllegalArgumentException to be thrown. 
- // */ - // @Test - // public void constructor_probability_bits_hash_BadProbabilityTest() { - // // probability should not be 0 - // try { - // new Shape(testFunction, 0.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } + // /** + // * Tests that invalid probability values cause and IllegalArgumentException to + // be thrown. + // */ + // @Test + // public void constructor_probability_bits_hash_BadProbabilityTest() { + // // probability should not be 0 + // try { + // new Shape(testFunction, 0.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } // - // // probability should not be = -1 - // try { - // new Shape(testFunction, -1.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } + // // probability should not be = -1 + // try { + // new Shape(testFunction, -1.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } // - // // probability should not be < -1 - // try { - // new Shape(testFunction, -1.5, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } + // // probability should not be < -1 + // try { + // new Shape(testFunction, -1.5, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } // - // // probability should not be = 1 - // try { - // new Shape(testFunction, 1.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } + // // probability should not be = 1 + // try { + // new Shape(testFunction, 1.0, 
24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } // - // // probability should not be > 1 - // try { - // new Shape(testFunction, 2.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } + // // probability should not be > 1 + // try { + // new Shape(testFunction, 2.0, 24, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } // - // /** - // * Tests the calculated values of calling the constructor with the probability, number of bits and number of hash - // * functions. - // */ - // @Test - // public void constructor_probability_bits_hashTest() { - // /* - // * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 - // */ - // final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); + // /** + // * Tests the calculated values of calling the constructor with the + // probability, number of bits and number of hash + // * functions. + // */ + // @Test + // public void constructor_probability_bits_hashTest() { + // /* + // * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 + // */ + // final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); // - // assertEquals(24, filterConfig.getNumberOfBits()); - // assertEquals(3, filterConfig.getNumberOfHashFunctions()); - // assertEquals(5, filterConfig.getNumberOfItems()); - // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - // } + // assertEquals(24, filterConfig.getNumberOfBits()); + // assertEquals(3, filterConfig.getNumberOfHashFunctions()); + // assertEquals(5, filterConfig.getNumberOfItems()); + // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); + // } // /** * Test equality of shape. 
@@ -379,75 +392,36 @@ public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { public void equalsTest() { assertEquals(shape, shape); - assertEquals( 3, shape.getNumberOfHashFunctions()); - assertEquals( 24, shape.getNumberOfBits() ); - assertEquals(shape.hashCode(), new Shape(3,24 ).hashCode()); + assertEquals(3, shape.getNumberOfHashFunctions()); + assertEquals(24, shape.getNumberOfBits()); + assertEquals(shape.hashCode(), new Shape(3, 24).hashCode()); assertNotEquals(shape, null); - assertNotEquals(shape, new Shape(3,25)); - assertNotEquals(shape, new Shape(4,24)); + assertNotEquals(shape, new Shape(3, 25)); + assertNotEquals(shape, new Shape(4, 24)); } @Test public void estimateNTest() { - double[] expected = {0.0, 0.3404769153503671, - 0.6960910159170385, - 1.068251140996181, - 1.4585724543516367, - 1.8689188094520417, - 2.301456579614247, - 2.758723890333837, - 3.243720864865314, - 3.7600290339658846, - 4.311972005861497, - 4.90483578309127, - 5.545177444479562, - 6.2412684603966, - 7.003749898831201, - 7.8466340240938095, - 8.788898309344876, - 9.85714945034106, - 11.090354888959125, - 12.54892734331076, - 14.334075753824441, - 16.635532333438686, - 19.879253198304, - 25.424430642783573}; - for (int i=0;i<24;i++) - { - assertEquals( expected[i], shape.estimateN(i), 0.00000000000000001); + double[] expected = { 0.0, 0.3404769153503671, 0.6960910159170385, 1.068251140996181, 1.4585724543516367, + 1.8689188094520417, 2.301456579614247, 2.758723890333837, 3.243720864865314, 3.7600290339658846, + 4.311972005861497, 4.90483578309127, 5.545177444479562, 6.2412684603966, 7.003749898831201, + 7.8466340240938095, 8.788898309344876, 9.85714945034106, 11.090354888959125, 12.54892734331076, + 14.334075753824441, 16.635532333438686, 19.879253198304, 25.424430642783573 }; + for (int i = 0; i < 24; i++) { + assertEquals(expected[i], shape.estimateN(i), 0.00000000000000001); } } @Test public void getProbabilityTest() { - double[] expected = { 0.0, - 
0.0016223626694561954, - 0.010823077182670957, - 0.030579354491777785, - 0.06091618422799686, - 0.1003751381786711, - 0.14689159766038104, - 0.19829601428155866, - 0.25258045782764715, - 0.3080221532988778, - 0.3632228594351169, - 0.4171013016177174, - 0.4688617281200601, - 0.5179525036637239, - 0.5640228015164387, - 0.6068817738972262, - 0.6464623147796981, - 0.6827901771310362, - 0.7159584363083427, - 0.7461068849672469, - 0.7734057607554121, - 0.7980431551369204, - 0.8202154721379679, - 0.8401203636727712}; - for (int i=0;i<24;i++) { - assertEquals( expected[i], shape.getProbability(i), 0.00000000000000001); + double[] expected = { 0.0, 0.0016223626694561954, 0.010823077182670957, 0.030579354491777785, + 0.06091618422799686, 0.1003751381786711, 0.14689159766038104, 0.19829601428155866, 0.25258045782764715, + 0.3080221532988778, 0.3632228594351169, 0.4171013016177174, 0.4688617281200601, 0.5179525036637239, + 0.5640228015164387, 0.6068817738972262, 0.6464623147796981, 0.6827901771310362, 0.7159584363083427, + 0.7461068849672469, 0.7734057607554121, 0.7980431551369204, 0.8202154721379679, 0.8401203636727712 }; + for (int i = 0; i < 24; i++) { + assertEquals(expected[i], shape.getProbability(i), 0.00000000000000001); } } - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java index 760c307e4c..3c8649d7da 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -29,6 +29,6 @@ protected SimpleBloomFilter createEmptyFilter(final Shape shape) { @Override protected SimpleBloomFilter createFilter(final Shape shape, final Hasher hasher) { - return new SimpleBloomFilter(shape, hasher); + return new SimpleBloomFilter(shape, hasher); } } diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java index 9d45a7c2e5..1a13860b4c 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -29,6 +29,6 @@ protected SparseBloomFilter createEmptyFilter(final Shape shape) { @Override protected SparseBloomFilter createFilter(final Shape shape, final Hasher hasher) { - return new SparseBloomFilter(shape, hasher); + return new SparseBloomFilter(shape, hasher); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java index 6b266f3e94..27dc2f2358 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -29,29 +29,28 @@ */ public class HasherCollectionTest { - private SimpleHasher hasher1 = new SimpleHasher( 1,1 ); - private SimpleHasher hasher2 = new SimpleHasher( 2, 2 ); - private HasherCollection hasher = new HasherCollection( hasher1, hasher2 ); + private SimpleHasher hasher1 = new SimpleHasher(1, 1); + private SimpleHasher hasher2 = new SimpleHasher(2, 2); + private HasherCollection hasher = new HasherCollection(hasher1, hasher2); @Test public void sizeTest() { - assertEquals( 2, hasher.size() ); - HasherCollection hasher3 = new HasherCollection( hasher, new SimpleHasher( 3, 3 )); - assertEquals( 3, hasher3.size() ); + assertEquals(2, hasher.size()); + HasherCollection hasher3 = new HasherCollection(hasher, new SimpleHasher(3, 3)); + assertEquals(3, hasher3.size()); } - @Test public void testIndices() { - Shape shape = new Shape( 5, 10 ); - Integer[] expected = { 1,2,3,4,5,2,4,6,8,0 
}; + Shape shape = new Shape(5, 10); + Integer[] expected = { 1, 2, 3, 4, 5, 2, 4, 6, 8, 0 }; List lst = new ArrayList(); IndexProducer producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( String.format("error at position %d", i), expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(String.format("error at position %d", i), expected[i], lst.get(i)); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java index aba5c4f538..ce6d1aa7da 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherFilterTest.java @@ -13,7 +13,8 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
- */package org.apache.commons.collections4.bloomfilter.hasher; + */ +package org.apache.commons.collections4.bloomfilter.hasher; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -30,19 +31,18 @@ public class HasherFilterTest { public void testBasicFiltering() { Hasher.Filter filter = new Hasher.Filter(10); - for (int i=0;i<10;i++) { - assertTrue( filter.test(i)); + for (int i = 0; i < 10; i++) { + assertTrue(filter.test(i)); } - for (int i=0;i<10;i++) { - assertFalse( filter.test(i)); + for (int i = 0; i < 10; i++) { + assertFalse(filter.test(i)); } try { filter.test(10); - fail( "Should have thrown IndexOutOfBounds exception"); - } - catch (IndexOutOfBoundsException expected) { + fail("Should have thrown IndexOutOfBounds exception"); + } catch (IndexOutOfBoundsException expected) { // do nothing. } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java index 3f5a1e1f93..e3b3764bd4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java @@ -33,17 +33,16 @@ public class NullHasherTest { @Test public void sizeTest() { - assertEquals( 0, hasher.size() ); + assertEquals(0, hasher.size()); } @Test public void testIterator() { - Shape shape = new Shape( 5, 10 ); + Shape shape = new Shape(5, 10); List lst = new ArrayList(); IndexProducer producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( 0, lst.size()); + producer.forEachIndex(lst::add); + assertEquals(0, lst.size()); } - } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java index 7c1f29edff..b0b9883f54 100644 --- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java @@ -29,106 +29,105 @@ */ public class SimpleHasherTest { - private SimpleHasher hasher = new SimpleHasher( 1,1 ); + private SimpleHasher hasher = new SimpleHasher(1, 1); @Test public void sizeTest() { - assertEquals( 1, hasher.size() ); + assertEquals(1, hasher.size()); } @Test public void testIterator() { - Shape shape = new Shape( 5, 10 ); - Integer[] expected = { 1,2,3,4,5}; + Shape shape = new Shape(5, 10); + Integer[] expected = { 1, 2, 3, 4, 5 }; List lst = new ArrayList(); IndexProducer producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } } @Test public void constructorBufferTest() { - Shape shape = new Shape( 5, 10 ); - byte[] buffer = { 1, 1}; - SimpleHasher hasher = new SimpleHasher( buffer ); - Integer[] expected = { 1,2,3,4,5}; + Shape shape = new Shape(5, 10); + byte[] buffer = { 1, 1 }; + SimpleHasher hasher = new SimpleHasher(buffer); + Integer[] expected = { 1, 2, 3, 4, 5 }; List lst = new ArrayList(); IndexProducer producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } buffer = new byte[] { 1 }; - hasher = new SimpleHasher( buffer ); - expected = new Integer[]{ 0,1,2,3,4 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 0, 1, 2, 3, 
4 }; lst = new ArrayList(); producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } - buffer = new byte[] { 1, 0, 1 }; - hasher = new SimpleHasher( buffer ); - expected = new Integer[]{ 1,2,3,4,5 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; lst = new ArrayList(); producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } buffer = new byte[] { 0, 1, 0, 1 }; - hasher = new SimpleHasher( buffer ); - expected = new Integer[]{ 1,2,3,4,5 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; lst = new ArrayList(); producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } - buffer = new byte[] { 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1}; - hasher = new SimpleHasher( buffer ); - expected = new Integer[]{ 1,2,3,4,5 }; + buffer = new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; lst = new ArrayList(); producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( 
expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } - buffer = new byte[] { 0,0,0,0,0,0,0,1,5,5,0,0,0,0,0,0,0,1,5,5}; - hasher = new SimpleHasher( buffer ); - expected = new Integer[]{ 1,2,3,4,5 }; + buffer = new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; lst = new ArrayList(); producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } - buffer = new byte[] { 0,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0,1,5,5}; - hasher = new SimpleHasher( buffer ); - expected = new Integer[]{ 1,2,3,4,5 }; + buffer = new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }; + hasher = new SimpleHasher(buffer); + expected = new Integer[] { 1, 2, 3, 4, 5 }; lst = new ArrayList(); producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } -} + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java index c0947557ad..78f0d26c7c 100644 --- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java @@ -29,29 +29,28 @@ */ public class SingleItemHasherCollectionTest { - private SimpleHasher hasher1 = new SimpleHasher( 1,1 ); - private SimpleHasher hasher2 = new SimpleHasher( 2, 2 ); - private HasherCollection hasher = new SingleItemHasherCollection( hasher1, hasher2 ); + private SimpleHasher hasher1 = new SimpleHasher(1, 1); + private SimpleHasher hasher2 = new SimpleHasher(2, 2); + private HasherCollection hasher = new SingleItemHasherCollection(hasher1, hasher2); @Test public void sizeTest() { - assertEquals( 1, hasher.size() ); - HasherCollection hasher3 = new SingleItemHasherCollection( hasher, new SimpleHasher( 3, 3 )); - assertEquals( 1, hasher3.size() ); + assertEquals(1, hasher.size()); + HasherCollection hasher3 = new SingleItemHasherCollection(hasher, new SimpleHasher(3, 3)); + assertEquals(1, hasher3.size()); } - @Test public void testIndices() { - Shape shape = new Shape( 5, 10 ); - Integer[] expected = { 1,2,3,4,5,6,8,0 }; + Shape shape = new Shape(5, 10); + Integer[] expected = { 1, 2, 3, 4, 5, 6, 8, 0 }; List lst = new ArrayList(); IndexProducer producer = hasher.indices(shape); - producer.forEachIndex( lst::add ); - assertEquals( expected.length, lst.size()); - for (int i=0;i< expected.length;i++) { - assertEquals( expected[i], lst.get(i) ); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); } } From db6eb1594f50625b5a8b5a806adba831145ee53a Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Tue, 23 Nov 2021 14:09:29 +0000 Subject: [PATCH 19/27] fixed javadoc issues --- .../collections4/bloomfilter/BitMap.java | 2 ++ .../collections4/bloomfilter/BloomFilter.java | 8 +++----- .../bloomfilter/CountingBloomFilter.java | 17 
+++++++++-------- .../commons/collections4/bloomfilter/Shape.java | 8 ++++---- .../bloomfilter/SimpleBloomFilter.java | 3 ++- .../bloomfilter/SparseBloomFilter.java | 1 + .../collections4/bloomfilter/hasher/Hasher.java | 11 ++++++----- .../bloomfilter/hasher/HasherCollection.java | 3 +-- .../bloomfilter/hasher/package-info.java | 7 ++----- .../collections4/bloomfilter/package-info.java | 5 ++--- 10 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 11abbfde49..7aa50086ac 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -88,7 +88,9 @@ public static void checkPositive(final int bitIndex) { * * @param limit the number of bitmaps in a collection. * @param bitIndex the bit index + * @returns the index for the bitmap in the array. * @throws IndexOutOfBoundsException if the index is not positive + * @see #getLongIndex(int) */ public static int checkRange(final int limit, final int bitIndex) { checkPositive(bitIndex); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index 424c115601..1e8680a239 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -203,14 +203,12 @@ default boolean mergeInPlace(Hasher hasher) { /** * Determines if the bloom filter is "full". * - *

Full is defined as having no unset - * bits.

+ *

Full is defined as having no unset bits.

* * @return {@code true} if the filter is full, {@code false} otherwise. */ - default boolean isFull(Shape shape) { - Objects.requireNonNull(shape, "shape"); - return cardinality() == shape.getNumberOfBits(); + default boolean isFull() { + return cardinality() == getShape().getNumberOfBits(); } // Counting Operations diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 6891ac2213..e8e8a7b242 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -82,27 +82,25 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { /** * Removes the specified Bloom filter from this Bloom filter. * - *

Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented by 1.

+ *

Specifically: all counts for the indexes identified by the {@code other} filter will be decremented by 1,

* - *

Note: If the other filter is a counting Bloom filter the index counts are ignored; only - * the enabled indexes are used.

+ *

Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + * IndexProducer.

* *

This method will return {@code true} if the filter is valid after the operation.

* * @param other the other Bloom filter * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(CountingBloomFilter) + * @see #subtract(BitCountProducer) */ boolean remove(BloomFilter other); /** * Removes the specified hasher from the Bloom filter from this Bloom filter. * - *

Specifically - * all counts for the distinct indexes identified by the {@code hasher} will be - * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored.

+ *

Specifically all counts for the indices produced by the {@code hasher} will be + * decremented by 1.

* *

For HasherCollections each enclosed Hasher will be considered a single item and decremented * from the counts separately.

@@ -112,6 +110,7 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @param hasher the hasher to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() + * @see #subtract(BitCountProducer) */ boolean remove(Hasher hasher); @@ -127,6 +126,7 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @param other the BitCountProducer to add. * @return {@code true} if the addition was successful and the state is valid * @see #isValid() + * @see #subtract(BitCountProducer) */ boolean add(BitCountProducer other); @@ -142,6 +142,7 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @param other the BitCountProducer to subtract. * @return {@code true} if the subtraction was successful and the state is valid * @see #isValid() + * @see #add(BitCountProducer) */ boolean subtract(BitCountProducer other); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index c68c79fe5f..8aa9bb2e88 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -151,7 +151,6 @@ public int getNumberOfHashFunctions() { * * @param numberOfItems the number of items hashed into the Bloom filter. * @return the probability of false positives. - * @see #getNumberOfItems() */ public double getProbability(int numberOfItems) { if (numberOfItems < 0) { @@ -223,6 +222,7 @@ public static class Factory { * @param probability The desired false-positive probability in the range {@code (0, 1)} * @param numberOfBits The number of bits in the filter * @param numberOfHashFunctions The number of hash functions in the filter + * @return a valid Shape. 
* @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual * probability is {@code >= 1.0} @@ -271,9 +271,9 @@ public static Shape fromPMK(final double probability, final int numberOfBits, fi * * @param numberOfItems Number of items to be placed in the filter * @param probability The desired false-positive probability in the range {@code (0, 1)} + * @returns a valid Shape * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. - * @see #getProbability() */ public static Shape fromNP(final int numberOfItems, final double probability) { checkNumberOfItems(numberOfItems); @@ -307,9 +307,9 @@ public static Shape fromNP(final int numberOfItems, final double probability) { * * @param numberOfItems Number of items to be placed in the filter * @param numberOfBits The number of bits in the filter + * @return a valid Shape. * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} - * @see #getProbability() */ public static Shape fromNM(final int numberOfItems, final int numberOfBits) { checkNumberOfItems(numberOfItems); @@ -332,9 +332,9 @@ public static Shape fromNM(final int numberOfItems, final int numberOfBits) { * @param numberOfItems Number of items to be placed in the filter * @param numberOfBits The number of bits in the filter. * @param numberOfHashFunctions The number of hash functions in the filter + * @return a valid Shape. * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. 
- * @see #getProbability() */ public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { checkNumberOfItems(numberOfItems); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index fdf79a7415..194c80c19c 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -46,8 +46,9 @@ public class SimpleBloomFilter implements BloomFilter { private int cardinality; /** - * Constructs an empty BitSetBloomFilter. + * Constructs an empty SimpleBloomFilter. * + * @param shape The shape for the filter. */ public SimpleBloomFilter(Shape shape) { Objects.requireNonNull(shape, "shape"); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 7a43ded059..24c88a0561 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -45,6 +45,7 @@ public class SparseBloomFilter implements BloomFilter { /** * Constructs an empty BitSetBloomFilter. * + * @param shape The shape of the filter. */ public SparseBloomFilter(Shape shape) { Objects.requireNonNull(shape, "shape"); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 561736a80d..cccd5187bf 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -92,11 +92,11 @@ public Filter(int size) { *

The first time a number is tested the method returns {@code true} and returns * {@code false} for every time after that.

* - *

If the input is negative the behavior is not defined.

+ *

If the input is not in the range [0,size) an IndexOutOfBoundsException exception is thrown.

* - *

Note: only positive number are * @param number the number to check. * @return {@code true} if the number has not been seen, {@code false} otherwise. + * @see Hasher.Filter#Filter(int) */ public boolean test(int number) { BitMap.checkPositive(number); @@ -122,11 +122,12 @@ class FilteredIntConsumer implements IntConsumer { /** * Constructor. - * @param shape The shape of the output. + *

integers ouside the range [0,size) will throw an IndexOutOfBoundsException. + * @param size The number of integers to track. Values in the range [0,size) will be tracked. * @param consumer to wrap. */ - public FilteredIntConsumer(int maxIntegerValue, IntConsumer consumer) { - this.filter = new Hasher.Filter(maxIntegerValue); + public FilteredIntConsumer(int size, IntConsumer consumer) { + this.filter = new Hasher.Filter(size); this.consumer = consumer; } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index 4376e58641..37e208f488 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -61,8 +61,7 @@ public HasherCollection(final Collection hashers) { /** * Constructor. * - * @param function the function to use. - * @param buffers the byte buffers that will be hashed. + * @param hashers A list of Hashers to initialize the collection with. */ public HasherCollection(Hasher... hashers) { this(Arrays.asList(hashers)); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index b53e159063..a363af352b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -55,9 +55,6 @@ *

Other implementations of the Hasher are easy to implement. Hashers that represent single items should make use of the * {@code Hahser.Filter} and/or {@code Hahser.FileredIntConsumer} classes to filter out duplicate indices.

* - * - * - * * With the exception of the HasherCollection, a Hasher represents an item of arbitrary * byte size as multiple byte representations of fixed size (multiple hashes). The hashers * are be used to create indices for a Bloom filter.

@@ -75,9 +72,9 @@ * than the required number of hash values per item after duplicates have been * removed.

* - *

Footnotes + *

Footnotes

* - * 1. Kirsch, Adam and Michael Mitzenmacher, * "Building a Better Bloom Filter", * Harvard Computer Science Group Technical Report TR-02-05. * diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index 45f0a81b99..50a8f723b7 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -42,7 +42,7 @@ * representation of the internal structure. Additional methods are available in the {@code BitMap} to assist in * manipulation of the representations.

* - *

The bloom filter code is an interface that requires implementation of 6 methods: + *

The bloom filter code is an interface that requires implementation of 6 methods:

*
    *
  • {@code cardinality()} * returns the number of bits enabled in the Bloom filter.
  • @@ -64,7 +64,7 @@ * utilizes either the {@code BitMapProducer} or {@code IndexProducer} from the argument to enable extra bits * in the internal representation of the Bloom filter.. *
- *

+ * *

Other methods should be implemented where they can be done so more efficiently than the default implementations. *

* @@ -73,7 +73,6 @@ *

The counting bloom filter extends the Bloom filter by counting the number of times a specific bit has been * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional * overhead.

- *
  • * *

    Shape

    * From 45eb883d3749d4d1a48940da5604e19683a68525 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Tue, 23 Nov 2021 14:12:16 +0000 Subject: [PATCH 20/27] fixed test issue --- .../collections4/bloomfilter/AbstractBloomFilterTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index b753c196d0..6c6c8761bd 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -201,13 +201,13 @@ public final void isFullTest() { // create empty filter BloomFilter filter = createEmptyFilter(shape); - assertFalse("Should not be full", filter.isFull(shape)); + assertFalse("Should not be full", filter.isFull()); filter = createFilter(shape, fullHasher); - assertTrue("Should be full", filter.isFull(shape)); + assertTrue("Should be full", filter.isFull()); filter = createFilter(shape, new SimpleHasher(1, 3)); - assertFalse("Should not be full", filter.isFull(shape)); + assertFalse("Should not be full", filter.isFull()); } /** From 736711f1a1b919c8c4c26ff88b231731355ca699 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Tue, 23 Nov 2021 21:20:16 +0000 Subject: [PATCH 21/27] fixed javadoc issues --- .../apache/commons/collections4/bloomfilter/BitMap.java | 2 +- .../org/apache/commons/collections4/bloomfilter/Shape.java | 2 +- .../collections4/bloomfilter/hasher/package-info.java | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java index 7aa50086ac..a16cb0c917 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java +++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java @@ -88,7 +88,7 @@ public static void checkPositive(final int bitIndex) { * * @param limit the number of bitmaps in a collection. * @param bitIndex the bit index - * @returns the index for the bitmap in the array. + * @return the index for the bitmap in the array. * @throws IndexOutOfBoundsException if the index is not positive * @see #getLongIndex(int) */ diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 8aa9bb2e88..6319efcd44 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -271,7 +271,7 @@ public static Shape fromPMK(final double probability, final int numberOfBits, fi * * @param numberOfItems Number of items to be placed in the filter * @param probability The desired false-positive probability in the range {@code (0, 1)} - * @returns a valid Shape + * @return a valid Shape * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. */ diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index a363af352b..1a18e37d97 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -30,8 +30,9 @@ *

    SimpleHasher

    * *

    The SimpleHasher represents one item being added to the Bloom filter. It utilizes the combinatorial strategy - * as described by Krisch and Mitzenmacher[1]. Generally, a hash value is created by hashing - * together multiple properties of the item being added. The hash value is then used to create a SimpleHasher.

    + * as described by Kirsch and Mitzenmacher. + * Generally, a hash value is created by hashing together multiple properties of the item being added. The hash value is + * then used to create a SimpleHasher.

    * *

    This hasher represents a single item and thus does not return duplicate indices.

    * @@ -55,7 +56,7 @@ *

    Other implementations of the Hasher are easy to implement. Hashers that represent single items should make use of the * {@code Hasher.Filter} and/or {@code Hasher.FilteredIntConsumer} classes to filter out duplicate indices.

    * - * With the exception of the HasherCollection, a Hasher represents an item of arbitrary + *

    With the exception of the HasherCollection, a Hasher represents an item of arbitrary * byte size as multiple byte representations of fixed size (multiple hashes). The hashers * are used to create indices for a Bloom filter.

    * From 1feaccc32d2250b6061c575e78c4856876609cb8 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Tue, 23 Nov 2021 21:28:24 +0000 Subject: [PATCH 22/27] Reduced the acceptable delta for p tests --- .../org/apache/commons/collections4/bloomfilter/ShapeTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java index 32fbb63109..0d9ece1cc4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -420,7 +420,7 @@ public void getProbabilityTest() { 0.5640228015164387, 0.6068817738972262, 0.6464623147796981, 0.6827901771310362, 0.7159584363083427, 0.7461068849672469, 0.7734057607554121, 0.7980431551369204, 0.8202154721379679, 0.8401203636727712 }; for (int i = 0; i < 24; i++) { - assertEquals(expected[i], shape.getProbability(i), 0.00000000000000001); + assertEquals(expected[i], shape.getProbability(i), 0.000000000000001); } } From 3002d080337ab0829e45beb65f920f5de25e372b Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Thu, 25 Nov 2021 07:47:06 +0000 Subject: [PATCH 23/27] Updated docs and test cases --- .../collections4/bloomfilter/Shape.java | 14 + .../bloomfilter/SimpleBloomFilter.java | 10 +- .../bloomfilter/SparseBloomFilter.java | 12 +- .../bloomfilter/hasher/Hasher.java | 2 - .../bloomfilter/hasher/NullHasher.java | 19 +- .../hasher/SingleItemHasherCollection.java | 13 + .../bloomfilter/hasher/package-info.java | 8 +- .../bloomfilter/AbstractBloomFilterTest.java | 28 +- .../AbstractCountingBloomFilterTest.java | 24 +- .../bloomfilter/BitCountProducerTest.java | 21 ++ .../bloomfilter/BitMapProducerTest.java | 27 ++ .../collections4/bloomfilter/BitMapTest.java | 11 + .../bloomfilter/IndexProducerTest.java | 4 - .../collections4/bloomfilter/ShapeTest.java | 326 +----------------- 
.../bloomfilter/SimpleBloomFilterTest.java | 25 ++ .../bloomfilter/SparseBloomFilterTest.java | 35 ++ .../hasher/HasherCollectionTest.java | 9 + .../bloomfilter/hasher/NullHasherTest.java | 6 + .../bloomfilter/hasher/SimpleHasherTest.java | 16 + .../SingleItemHasherCollectionTest.java | 17 +- 20 files changed, 274 insertions(+), 353 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 6319efcd44..11179f36bc 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -170,6 +170,13 @@ public String toString() { /** * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. + * + *

    Note:

    + *
      + *
    • if hammingValue == numberOfBits, then result is infinity.
    • + *
    • if hammingValue > numberOfBits, then result is NaN.
    • + *
    + * * @param hammingValue the number of enabled bits. * @return An estimate of the number of items in the Bloom filter. */ @@ -207,6 +214,13 @@ public static class Factory { */ private static final double DENOMINATOR = -LN_2 * LN_2; + /** + * Do not instantiate. + */ + private Factory() { + + } + /** * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the * specified number of bits ({@code m}) and hash functions ({@code k}). diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index 194c80c19c..0cb733e0bf 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -74,6 +74,7 @@ public SimpleBloomFilter(final Shape shape, Hasher hasher) { * Constructor. * @param shape The shape for the filter. * @param producer the BitMap Producer to initialize the filter with. + * @throws IllegalArgumentException if the producer returns too many bit maps. 
*/ public SimpleBloomFilter(final Shape shape, BitMapProducer producer) { Objects.requireNonNull(shape, "shape"); @@ -81,8 +82,13 @@ public SimpleBloomFilter(final Shape shape, BitMapProducer producer) { this.shape = shape; BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); - producer.forEachBitMap(builder); - this.bitMap = builder.getArray(); + try { + producer.forEachBitMap(builder); + this.bitMap = builder.getArray(); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( String.format("BitMapProducer should only send %s maps", + BitMap.numberOfBitMaps( shape.getNumberOfBits())), e); + } this.cardinality = -1; } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 24c88a0561..92ea2a2be2 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -75,9 +75,15 @@ public SparseBloomFilter(Shape shape, List indices) { this(shape); Objects.requireNonNull(indices, "indices"); this.indices.addAll(indices); - if (this.indices.last() >= shape.getNumberOfBits()) { - throw new IllegalArgumentException(String.format("Value in list {} is greater than maximum value ({})", - this.indices.last(), shape.getNumberOfBits())); + if (! 
this.indices.isEmpty()) { + if (this.indices.last() >= shape.getNumberOfBits()) { + throw new IllegalArgumentException(String.format("Value in list {} is greater than maximum value ({})", + this.indices.last(), shape.getNumberOfBits())); + } + if (this.indices.first() < 0 ) { + throw new IllegalArgumentException(String.format("Value in list {} is less than 0", + this.indices.first())); + } } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index cccd5187bf..58b78d61ae 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -135,9 +135,7 @@ public FilteredIntConsumer(int size, IntConsumer consumer) { public void accept(int value) { if (filter.test(value)) { consumer.accept(value); - ; } - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java index a27033f4c8..0349b22c6d 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasher.java @@ -29,21 +29,26 @@ */ public final class NullHasher implements Hasher { + /** + * The instance of the Null Hasher. 
+ */ public static final NullHasher INSTANCE = new NullHasher(); + + private static final IndexProducer PRODUCER = new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + // do nothing + } + }; + private NullHasher() { } @Override public IndexProducer indices(final Shape shape) { Objects.requireNonNull(shape, "shape"); - - return new IndexProducer() { - @Override - public void forEachIndex(IntConsumer consumer) { - // do nothing - } - }; + return PRODUCER; } @Override diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java index d1a15587e2..7de34276c4 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java @@ -32,14 +32,27 @@ */ public class SingleItemHasherCollection extends HasherCollection { + /** + * Constructs an empty SingleItemHasherCollection. + */ public SingleItemHasherCollection() { super(); } + /** + * Constructs a SingleItemHasherCollection from a collection of Hasher objects. + * + * @param hashers A collections of Hashers to build the indices with. + */ public SingleItemHasherCollection(Collection hashers) { super(hashers); } + /** + * Constructor. + * + * @param hashers A list of Hashers to initialize the collection with. + */ public SingleItemHasherCollection(Hasher... hashers) { super(hashers); } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java index 1a18e37d97..2922477edc 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java @@ -38,7 +38,7 @@ * *

    HasherCollection

    * - *

    The HasherCollection is a collection of Hashers that implemehts the Hasher interface. Each hasher within the collection + *

    The HasherCollection is a collection of Hashers that implements the Hasher interface. Each hasher within the collection * represents a single item, or in the case of a HasherCollection, multiple items.

    * *

    This hasher represents multiple items and thus may return duplicate indices.

    @@ -73,12 +73,6 @@ * than the required number of hash values per item after duplicates have been * removed.

    * - *

    Footnotes

    - * - * 1. Kirsch, Adam and Michael Mitzenmacher, - * "Building a Better Bloom Filter", - * Harvard Computer Science Group Technical Report TR-02-05. - * * @see org.apache.commons.collections4.bloomfilter.IndexProducer * @since 4.5 */ diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 6c6c8761bd..298b7953d2 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -16,9 +16,9 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import static org.junit.jupiter.api.Assertions.assertEquals; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.HasherCollection; import org.apache.commons.collections4.bloomfilter.hasher.SimpleHasher; @@ -63,6 +63,16 @@ public abstract class AbstractBloomFilterTest { */ protected abstract T createFilter(Shape shape, Hasher hasher); + @Test + public void asIndexArrayTest() { + final BloomFilter bf = createFilter( shape, from1 ); + int[] ary = BloomFilter.asIndexArray( bf ); + assertEquals( 17, ary.length ); + for (int i=0; i lst = new ArrayList(); + producer.forEachIndex( lst::add ); + assertEquals( 2, lst.size() ); + assertEquals( Integer.valueOf(1), lst.get(0) ); + assertEquals( Integer.valueOf(3), lst.get(1) ); + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java index 4e23147daf..2cbff7c8b6 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java +++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerTest.java @@ -16,7 +16,10 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import java.util.ArrayList; import java.util.List; import java.util.function.IntConsumer; @@ -51,4 +54,28 @@ public void forEachIndex(IntConsumer consumer) { assertTrue(BitMap.contains(buckets, 128)); } + @Test + public void fromLongArrayTest() { + long[] ary = new long[] {1L, 2L, 3L, 4L, 5L}; + BitMapProducer producer = BitMapProducer.fromLongArray( ary ); + List lst = new ArrayList(); + producer.forEachBitMap( lst::add ); + assertEquals( Long.valueOf(1), lst.get(0) ); + assertEquals( Long.valueOf(2), lst.get(1) ); + assertEquals( Long.valueOf(3), lst.get(2) ); + assertEquals( Long.valueOf(4), lst.get(3) ); + assertEquals( Long.valueOf(5), lst.get(4) ); + + } + + @Test + public void arrayBuilderTest() { + try { + new BitMapProducer.ArrayBuilder( new Shape( 1, 4 ), new long[] {1L, 2L, 3L, 4L, 5L }); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java index 0d6d575087..145a28aa7f 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import org.junit.Test; @@ -131,4 +132,14 @@ public void contains_boundaryConditionTest() { assertTrue(BitMap.contains(ary, 64)); } + + @Test + public void checkRangeTest() { + try { + BitMap.checkRange( 1, Long.SIZE + 1); 
+ fail( "Should have thrown IndexOutOfBoundsException" ); + } catch (IndexOutOfBoundsException expected) { + // + } + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java index c97957424c..7fd7b81512 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java @@ -26,10 +26,6 @@ public class IndexProducerTest { - public IndexProducerTest() { - - } - @Test public void fromBitMapProducerTest() { TestingBitMapProducer producer = new TestingBitMapProducer(new long[] { 1L, 2L, 3L }); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java index 0d9ece1cc4..67c7e53cfd 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java @@ -54,47 +54,6 @@ public void constructor_items_bits_BadNumberOfBitsTest() { } } - // /** - // * Tests that if the number of hash functions is less than 1 an - // IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_bits_BadNumberOfHashFunctionsTest() { - // try { - // new Shape( 16, 8); - // - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - - // /** - // * Tests that if the number of items less than 1 an IllegalArgumentException - // is thrown. 
- // */ - // @Test - // public void constructor_items_bits_BadNumberOfItemsTest() { - // try { - // new Shape(testFunction, 0, 24); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - - // /** - // * Tests that if the number of bits is less than 1 an exception is thrown - // */ - // @Test - // public void constructor_items_bits_hash_BadNumberOfBitsTest() { - // try { - // new Shape(testFunction, 5, 0, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } /** * Tests that if the number of hash functions is less than 1 an exception is thrown. @@ -109,282 +68,6 @@ public void constructor_items_bits_hash_BadNumberOfHashFunctionsTest() { } } - // /** - // * Tests that if the number of items is less than 1 an exception is thrown. - // */ - // @Test - // public void constructor_items_bits_hash_BadNumberOfItemsTest() { - // try { - // new Shape(testFunction, 0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - - // /** - // * Tests that if the calculated probability is greater than or equal to 1 an - // IllegalArgumentException is thrown - // */ - // @Test - // public void constructor_items_bits_hash_BadProbabilityTest() { - // try { - // new Shape(testFunction, 4000, 8, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - - // /** - // * Tests that when the number of items, number of bits and number of hash - // functions is passed the values are - // * calculated correctly. 
- // */ - // @Test - // public void constructor_items_bits_hashTest() { - // /* - // * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 - // */ - // final Shape filterConfig = new Shape(testFunction, 5, 24, 4); - // - // assertEquals(24, filterConfig.getNumberOfBits()); - // assertEquals(4, filterConfig.getNumberOfHashFunctions()); - // assertEquals(5, filterConfig.getNumberOfItems()); - // assertEquals(0.102194782, filterConfig.getProbability(), 0.000001); - // } - - // /** - // * Tests that the number of items and number of bits is passed the other - // values are calculated correctly. - // */ - // @Test - // public void constructor_items_bitsTest() { - // /* - // * values from https://hur.st/bloomfilter/?n=5&m=24 - // */ - // final Shape filterConfig = new Shape(testFunction, 5, 24); - // - // assertEquals(24, filterConfig.getNumberOfBits()); - // assertEquals(3, filterConfig.getNumberOfHashFunctions()); - // assertEquals(5, filterConfig.getNumberOfItems()); - // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - // } - // - // /** - // * Tests that if the number of items is less than 1 an - // IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_probability_BadNumberOfItemsTest() { - // try { - // new Shape(testFunction, 0, 1.0 / 10); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // } - // - // /** - // * Tests that if the probability is less than or equal to 0 or more than or - // equal to 1 an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_probability_BadProbabilityTest() { - // try { - // new Shape(testFunction, 10, 0.0); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. 
- // } - // try { - // new Shape(testFunction, 10, 1.0); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // try { - // new Shape(testFunction, 10, Double.NaN); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // } - // - // /** - // * Tests that if calculated number of bits is greater than Integer.MAX_VALUE - // an IllegalArgumentException is thrown. - // */ - // @Test - // public void constructor_items_probability_NumberOfBitsOverflowTest() { - // try { - // new Shape(testFunction, Integer.MAX_VALUE, 1.0 / 10); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // do nothing. - // } - // } - // - // /** - // * Tests the the probability is calculated correctly. - // */ - // @Test - // public void constructor_items_probability_Test() { - // - // assertEquals(24, shape.getNumberOfBits()); - // assertEquals(3, shape.getNumberOfHashFunctions()); - // assertEquals(5, shape.getNumberOfItems()); - // assertEquals(0.100375138, shape.getProbability(), 0.000001); - // } - // - // /** - // * Tests that the constructor with a null name, number of items and size of - // filter fails. - // */ - // @Test - // public void constructor_nm_noName() { - // try { - // new Shape(null, 5, 72); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } - // - // /** - // * Tests that the constructor with a null name, number of items, size of - // filter, and number of functions fails. 
- // */ - // @Test - // public void constructor_nmk_noName() { - // try { - // new Shape(null, 5, 72, 17); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } - // - // /** - // * Tests that the constructor with a null name, number of items, and - // probability fails. - // */ - // @Test - // public void constructor_np_noName() { - // try { - // new Shape(null, 5, 0.1); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } - // - // /** - // * Tests that the constructor with a null name, probability, size of filter, - // and number of functions fails. - // */ - // @Test - // public void constructor_pmk_noName() { - // try { - // new Shape(null, 0.1, 72, 17); - // fail("Should throw NullPointerException"); - // } catch (final NullPointerException expected) { - // // do nothing - // } - // } - // - // /** - // * Tests that if the number of bits is less than 1 an exception is thrown - // */ - // @Test - // public void constructor_probability_bits_hash_BadNumberOfBitsTest() { - // try { - // new Shape(testFunction, 0.5, 0, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - // - // /** - // * Tests that if the number of functions is less than 1 an exception is thrown - // */ - // @Test - // public void constructor_probability_bits_hash_BadNumberOfHashFunctionsTest() - // { - // try { - // new Shape(testFunction, 0.5, 24, 0); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - // - // /** - // * Tests that invalid probability values cause and IllegalArgumentException to - // be thrown. 
- // */ - // @Test - // public void constructor_probability_bits_hash_BadProbabilityTest() { - // // probability should not be 0 - // try { - // new Shape(testFunction, 0.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // - // // probability should not be = -1 - // try { - // new Shape(testFunction, -1.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // - // // probability should not be < -1 - // try { - // new Shape(testFunction, -1.5, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // - // // probability should not be = 1 - // try { - // new Shape(testFunction, 1.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // - // // probability should not be > 1 - // try { - // new Shape(testFunction, 2.0, 24, 1); - // fail("Should have thrown IllegalArgumentException"); - // } catch (final IllegalArgumentException expected) { - // // expected - // } - // } - // - // /** - // * Tests the calculated values of calling the constructor with the - // probability, number of bits and number of hash - // * functions. - // */ - // @Test - // public void constructor_probability_bits_hashTest() { - // /* - // * values from https://hur.st/bloomfilter/?n=5&p=.1&m=24&k=3 - // */ - // final Shape filterConfig = new Shape(testFunction, 0.1, 24, 3); - // - // assertEquals(24, filterConfig.getNumberOfBits()); - // assertEquals(3, filterConfig.getNumberOfHashFunctions()); - // assertEquals(5, filterConfig.getNumberOfItems()); - // assertEquals(0.100375138, filterConfig.getProbability(), 0.000001); - // } - // /** * Test equality of shape. 
*/ @@ -422,6 +105,15 @@ public void getProbabilityTest() { for (int i = 0; i < 24; i++) { assertEquals(expected[i], shape.getProbability(i), 0.000000000000001); } + + assertEquals( 0.0, shape.getProbability(0), 0.0 ); + + try { + shape.getProbability( -1 ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expect) { + // do nothing + } } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java index 3c8649d7da..5c0ef45082 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -16,7 +16,14 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.junit.Test; /** * Tests for the {@link SimpleBloomFilter}. 
@@ -31,4 +38,22 @@ protected SimpleBloomFilter createEmptyFilter(final Shape shape) { protected SimpleBloomFilter createFilter(final Shape shape, final Hasher hasher) { return new SimpleBloomFilter(shape, hasher); } + + @Test + public void constructorTest() { + + SimpleBloomFilter filter = new SimpleBloomFilter( shape, BitMapProducer.fromLongArray( new long[] { 500L }) ); + List lst = new ArrayList(); + filter.forEachBitMap( lst::add ); + assertEquals( 1, lst.size() ); + assertEquals( 500L, lst.get(0).intValue() ); + + try { + filter = new SimpleBloomFilter( shape, + BitMapProducer.fromLongArray( new long[] { 500L, 400L, 300L }) ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing + } + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java index 1a13860b4c..e8f1845322 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilterTest.java @@ -16,7 +16,13 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.collections4.bloomfilter.hasher.Hasher; +import org.junit.Test; /** * Tests for the {@link SparseBloomFilter}. 
@@ -31,4 +37,33 @@ protected SparseBloomFilter createEmptyFilter(final Shape shape) { protected SparseBloomFilter createFilter(final Shape shape, final Hasher hasher) { return new SparseBloomFilter(shape, hasher); } + + @Test + public void constructor_indexOutOfRange() { + Shape shape = new Shape( 1, 5 ); + List lst = new ArrayList(); + lst.add( 5 ); + try { + new SparseBloomFilter( shape, lst ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing; + } + lst.clear(); + lst.add( -1 ); + try { + new SparseBloomFilter( shape, lst ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing; + } + } + + @Test + public void constructor_noValues() { + Shape shape = new Shape( 1, 5 ); + List lst = new ArrayList(); + new SparseBloomFilter( shape, lst ); + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java index 27dc2f2358..845e5d9e6d 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -17,6 +17,8 @@ package org.apache.commons.collections4.bloomfilter.hasher; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; import java.util.List; @@ -38,7 +40,14 @@ public void sizeTest() { assertEquals(2, hasher.size()); HasherCollection hasher3 = new HasherCollection(hasher, new SimpleHasher(3, 3)); assertEquals(3, hasher3.size()); + } + @Test + public void isEmptyTest() { + HasherCollection hasher = new HasherCollection(); + assertTrue( hasher.isEmpty() ); + hasher.add( hasher1 ); + 
assertFalse( hasher.isEmpty() ); } @Test diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java index e3b3764bd4..d92b178883 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/NullHasherTest.java @@ -17,6 +17,7 @@ package org.apache.commons.collections4.bloomfilter.hasher; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; import java.util.List; @@ -45,4 +46,9 @@ public void testIterator() { assertEquals(0, lst.size()); } + @Test + public void isEmptyTest() { + assertTrue( hasher.isEmpty() ); + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java index b0b9883f54..aef6190cf4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java @@ -17,6 +17,8 @@ package org.apache.commons.collections4.bloomfilter.hasher; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.fail; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; import java.util.List; @@ -31,11 +33,25 @@ public class SimpleHasherTest { private SimpleHasher hasher = new SimpleHasher(1, 1); + @Test + public void constructor_byteTest() { + try { + hasher = new SimpleHasher( new byte[0] ); + fail( "Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // do nothing. 
+ } + } @Test public void sizeTest() { assertEquals(1, hasher.size()); } + @Test + public void isEmptyTest() { + assertFalse( hasher.isEmpty() ); + } + @Test public void testIterator() { Shape shape = new Shape(5, 10); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java index 78f0d26c7c..e354243c18 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java @@ -17,6 +17,8 @@ package org.apache.commons.collections4.bloomfilter.hasher; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; import java.util.List; @@ -31,18 +33,31 @@ public class SingleItemHasherCollectionTest { private SimpleHasher hasher1 = new SimpleHasher(1, 1); private SimpleHasher hasher2 = new SimpleHasher(2, 2); - private HasherCollection hasher = new SingleItemHasherCollection(hasher1, hasher2); + @Test public void sizeTest() { + SingleItemHasherCollection hasher = new SingleItemHasherCollection(); + assertEquals(0, hasher.size() ); + hasher.add( hasher1 ); + hasher.add( hasher2 ); assertEquals(1, hasher.size()); HasherCollection hasher3 = new SingleItemHasherCollection(hasher, new SimpleHasher(3, 3)); assertEquals(1, hasher3.size()); + } + @Test + public void isEmptyTest() { + SingleItemHasherCollection hasher = new SingleItemHasherCollection(); + assertTrue( hasher.isEmpty() ); + hasher.add( hasher1 ); + assertFalse( hasher.isEmpty() ); } + @Test public void testIndices() { + HasherCollection hasher = new SingleItemHasherCollection(hasher1, hasher2); Shape shape = new Shape(5, 10); Integer[] expected = { 1, 
2, 3, 4, 5, 6, 8, 0 }; List lst = new ArrayList(); From 285b3038a17c819b93c0870519f7466b5bdcac8e Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Thu, 25 Nov 2021 07:47:35 +0000 Subject: [PATCH 24/27] Updated docs and test cases --- .../bloomfilter/DefaultBloomFilterTest.java | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java new file mode 100644 index 0000000000..9d615eb6fa --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.TreeSet; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * Tests for the {@link BloomFilter}. 
+ */ +public class DefaultBloomFilterTest extends AbstractBloomFilterTest { + @Override + protected DefaultBloomFilter createEmptyFilter(final Shape shape) { + return new DefaultBloomFilter(shape); + } + + @Override + protected DefaultBloomFilter createFilter(final Shape shape, final Hasher hasher) { + return new DefaultBloomFilter(shape, hasher); + } + + public class DefaultBloomFilter implements BloomFilter { + private Shape shape; + private TreeSet indices; + + DefaultBloomFilter(Shape shape) { + this.shape = shape; + this.indices = new TreeSet(); + } + + DefaultBloomFilter(Shape shape, Hasher hasher) { + this( shape ); + hasher.indices(shape).forEachIndex( indices::add ); + } + + @Override + public void forEachIndex(IntConsumer consumer) { + indices.forEach( i -> consumer.accept( i.intValue() ) ); + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + BitMapProducer.fromIndexProducer(this, shape).forEachBitMap(consumer); + } + + @Override + public boolean isSparse() { + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex( i -> { + if (!indices.contains( i )) { + throw new NoMatchException(); + } + } ); + return true; + } catch (NoMatchException e) { + return false; + } + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains( IndexProducer.fromBitMapProducer(bitMapProducer) ); + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + other.forEachIndex( indices::add ); + return true; + } + + @Override + public int cardinality() { + return indices.size(); + } + + } +} From f2bc6987d8439e33f020ee927f72e9641f708c54 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Thu, 25 Nov 2021 15:37:28 +0000 Subject: [PATCH 25/27] fixed issue with Shape javadoc --- .../java/org/apache/commons/collections4/bloomfilter/Shape.java | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 11179f36bc..fcaa971da8 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -174,7 +174,7 @@ public String toString() { *

<p>Note:</p>
* <ul>
* <li>if hammingValue == numberOfBits, then result is infinity.</li>
- * <li>if hammingValue > numberOfBits, then result is NaN.</li>
+ * <li>if hammingValue &gt; numberOfBits, then result is NaN.</li>
* </ul>
    * * @param hammingValue the number of enabled bits. From 8854e5f729a9d7aa83af89d972e6f87028804075 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Fri, 26 Nov 2021 07:09:59 +0000 Subject: [PATCH 26/27] Added more test coverage. --- .../bloomfilter/hasher/HasherCollection.java | 2 +- .../bloomfilter/AbstractBloomFilterTest.java | 20 +++++++++++++++++++ .../bloomfilter/ShapeFactoryTest.java | 16 +++++++++++++++ .../hasher/HasherCollectionTest.java | 12 ++++++++++- .../SingleItemHasherCollectionTest.java | 20 +++++++++++++++++++ 5 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java index 37e208f488..bc3ab940bd 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java @@ -82,7 +82,7 @@ public void add(Hasher hasher) { */ public void add(Collection hashers) { Objects.requireNonNull(hashers, "hashers"); - hashers.addAll(hashers); + this.hashers.addAll(hashers); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 298b7953d2..0d15ad7232 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -87,6 +87,7 @@ public void containsTest() { assertTrue("BF2 Should contain itself", bf2.contains(bf2)); assertFalse("BF should not contain BF2", bf.contains(bf2)); assertTrue("BF2 should contain BF", bf2.contains(bf)); + } @Test @@ -97,6 +98,25 @@ public void containsTest_Hasher() { assertFalse("BF Should not contain this hasher", bf.contains(new SimpleHasher(1, 3))); } + 
@Test + public void containsTest_IndexProducer() { + final BloomFilter bf = createFilter(shape, bigHasher); + + IndexProducer indexProducer = new SimpleHasher(1,1).indices(shape); + assertTrue("BF Should contain this hasher", bf.contains(indexProducer)); + indexProducer = new SimpleHasher(1,3).indices(shape); + assertFalse("BF Should not contain this hasher", bf.contains(indexProducer)); + } + + @Test + public void containsTest_BitMapProducer() { + final BloomFilter bf = createFilter(shape, bigHasher); + + BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1,1).indices(shape), shape); + assertTrue("BF Should contain this hasher", bf.contains(bitMapProducer)); + bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1,3).indices(shape), shape); + assertFalse("BF Should not contain this hasher", bf.contains(bitMapProducer)); + } /** * Tests that the andCardinality calculations are correct. * diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java index 32a9708b37..5e8c6ed1d2 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java @@ -209,4 +209,20 @@ public void fromPMK_test() { assertEquals(0.100375138, shape.getProbability(5), 0.000001); } + /** + * Tests the calculated values of calling the factory method with the number of items and the + * probability.
+ */ + @Test + public void fromNP_test() { + /* + * values from https://hur.st/bloomfilter/?n=10&p=5.0E-7&m=&k= + */ + final double probability = 1.0/2000000; + final Shape shape = Shape.Factory.fromNP(10, probability ); + + assertEquals(302, shape.getNumberOfBits()); + assertEquals(21, shape.getNumberOfHashFunctions()); + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java index 845e5d9e6d..bbcc91a359 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollectionTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; @@ -33,10 +34,10 @@ public class HasherCollectionTest { private SimpleHasher hasher1 = new SimpleHasher(1, 1); private SimpleHasher hasher2 = new SimpleHasher(2, 2); - private HasherCollection hasher = new HasherCollection(hasher1, hasher2); @Test public void sizeTest() { + HasherCollection hasher = new HasherCollection(hasher1, hasher2); assertEquals(2, hasher.size()); HasherCollection hasher3 = new HasherCollection(hasher, new SimpleHasher(3, 3)); assertEquals(3, hasher3.size()); @@ -52,6 +53,8 @@ public void isEmptyTest() { @Test public void testIndices() { + HasherCollection hasher = new HasherCollection(hasher1, hasher2); + assertEquals(2, hasher.size()); Shape shape = new Shape(5, 10); Integer[] expected = { 1, 2, 3, 4, 5, 2, 4, 6, 8, 0 }; List lst = new ArrayList(); @@ -63,4 +66,11 @@ public void testIndices() { } } + @Test + public void testAdd_collection() { + HasherCollection hasher = new
HasherCollection(); + hasher.add( Arrays.asList( hasher1, hasher2)); + assertEquals(2, hasher.size()); + } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java index e354243c18..81c19f8d60 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollectionTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.commons.collections4.bloomfilter.IndexProducer; import org.apache.commons.collections4.bloomfilter.Shape; @@ -39,17 +40,22 @@ public class SingleItemHasherCollectionTest { public void sizeTest() { SingleItemHasherCollection hasher = new SingleItemHasherCollection(); assertEquals(0, hasher.size() ); + hasher.add( NullHasher.INSTANCE ); + assertEquals(0, hasher.size()); hasher.add( hasher1 ); hasher.add( hasher2 ); assertEquals(1, hasher.size()); HasherCollection hasher3 = new SingleItemHasherCollection(hasher, new SimpleHasher(3, 3)); assertEquals(1, hasher3.size()); + } @Test public void isEmptyTest() { SingleItemHasherCollection hasher = new SingleItemHasherCollection(); assertTrue( hasher.isEmpty() ); + hasher.add( NullHasher.INSTANCE ); + assertTrue( hasher.isEmpty() ); hasher.add( hasher1 ); assertFalse( hasher.isEmpty() ); } @@ -69,4 +75,18 @@ public void testIndices() { } } + @Test + public void testAdd_collection() { + HasherCollection hasher = new SingleItemHasherCollection(); + hasher.add( Arrays.asList( hasher1, hasher2)); + assertEquals(1, hasher.size()); + Integer[] expected = { 1, 2, 3, 4, 5, 6, 8, 0 }; + List lst = new ArrayList(); + IndexProducer producer = 
hasher.indices(new Shape(5, 10)); + producer.forEachIndex(lst::add); + assertEquals(expected.length, lst.size()); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], lst.get(i)); + } + } } From b3b6a5058a39be2e5942a574e15f7f3ba038d5cc Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Sat, 27 Nov 2021 15:06:08 +0000 Subject: [PATCH 27/27] fixed formatting issues --- .../collections4/bloomfilter/AbstractBloomFilterTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 0d15ad7232..b5d26c6e15 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -102,9 +102,9 @@ public void containsTest_Hasher() { public void containsTest_IndexProducer() { final BloomFilter bf = createFilter(shape, bigHasher); - IndexProducer indexProducer = new SimpleHasher(1,1).indices(shape); + IndexProducer indexProducer = new SimpleHasher(1, 1).indices(shape); assertTrue("BF Should contain this hasher", bf.contains(indexProducer)); - indexProducer = new SimpleHasher(1,3).indices(shape); + indexProducer = new SimpleHasher(1, 3).indices(shape); assertFalse("BF Should not contain this hasher", bf.contains(indexProducer)); } @@ -112,9 +112,9 @@ public void containsTest_IndexProducer() { public void containsTest_BitMapProducer() { final BloomFilter bf = createFilter(shape, bigHasher); - BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1,1).indices(shape), shape); + BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 1).indices(shape), shape); assertTrue("BF Should contain this hasher", bf.contains(bitMapProducer)); - bitMapProducer = 
BitMapProducer.fromIndexProducer(new SimpleHasher(1,3).indices(shape), shape); + bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 3).indices(shape), shape); assertFalse("BF Should not contain this hasher", bf.contains(bitMapProducer)); } /**