Skip to content

Commit

Permalink
Add simple bloom filter implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
kuujo committed Feb 22, 2015
1 parent 4dbe35b commit 8f7476d
Show file tree
Hide file tree
Showing 4 changed files with 235 additions and 2 deletions.
175 changes: 175 additions & 0 deletions util/src/main/java/net/kuujo/copycat/util/internal/BloomFilter.java
@@ -0,0 +1,175 @@
/*
* Copyright 2015 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.kuujo.copycat.util.internal;

import java.nio.charset.StandardCharsets;
import java.util.BitSet;
import java.util.Objects;

/**
* Simple Murmur based Bloom Filter implementation.
*
* @author <a href="http://github.com/kuujo">Jordan Halterman</a>
*/
public class BloomFilter<T> {
  private final int numHashes;
  private final int numBits;
  private final BitSet bits;

  /**
   * Creates a bloom filter sized for the given false positive probability and expected element count.
   *
   * Calculation of number of bits and hashes taken from Guava.
   *
   * https://github.com/google/guava/blob/master/guava/src/com/google/common/hash/BloomFilter.java
   *
   * @param falsePositiveProbability The desired probability of false positives, in the range [0, 1]. A value of
   *          zero is treated as the smallest positive double.
   * @param expectedSize The expected number of elements to be added to the filter. This will be used in conjunction
   *          with the desired false positive probability to calculate the number of bits and hashes to use.
   * @throws IllegalArgumentException If the expected size is not positive or the probability is outside [0, 1].
   */
  public BloomFilter(double falsePositiveProbability, int expectedSize) {
    if (expectedSize <= 0) {
      throw new IllegalArgumentException("expectedSize must be positive: " + expectedSize);
    }
    if (Double.isNaN(falsePositiveProbability) || falsePositiveProbability < 0 || falsePositiveProbability > 1) {
      throw new IllegalArgumentException("falsePositiveProbability must be between 0 and 1: " + falsePositiveProbability);
    }

    // log(0) is -Infinity, so substitute the smallest positive double for a requested probability of zero.
    double probability = falsePositiveProbability == 0 ? Double.MIN_VALUE : falsePositiveProbability;
    double ln2 = Math.log(2);

    // Always keep at least one bit so that index calculation never performs a modulo by zero.
    numBits = Math.max(1, (int) (-expectedSize * Math.log(probability) / (ln2 * ln2)));
    numHashes = Math.max(1, (int) Math.round((double) numBits / expectedSize * ln2));
    bits = new BitSet(numBits);
  }

  /**
   * Returns an array of indexes of bits for the given byte array.
   *
   * Two Murmur hashes (seeds 0 and 1) are computed and any further indexes are derived by combining them,
   * so at most two real hash computations are performed regardless of the number of hashes requested.
   * Every returned index lies in the range [0, bits).
   *
   * @param bytes The byte array to index.
   * @param numHashes The number of hash functions to run and indexes to return. Must be at least one.
   * @param bits The total number of available bits. Must be positive.
   * @return An array of bit indexes.
   */
  private static int[] indexes(byte[] bytes, int numHashes, int bits) {
    int[] hashes = new int[numHashes];

    int h1 = Hash.hash32(bytes, 0);
    hashes[0] = toIndex(h1, bits);
    if (numHashes == 1) {
      return hashes;
    }

    int h2 = Hash.hash32(bytes, 1);
    hashes[1] = toIndex(h2, bits);

    for (int i = 2; i < numHashes; i++) {
      int h = h1 + i * h2;
      // Flip the bits of negative combined hashes to make them non-negative.
      if (h < 0) {
        h = ~h;
      }
      hashes[i] = toIndex(h, bits);
    }
    return hashes;
  }

  /**
   * Maps an arbitrary (possibly negative) hash to a valid bit index in the range [0, bits).
   *
   * The remainder's magnitude is always smaller than {@code bits}, so taking its absolute value cannot overflow.
   */
  private static int toIndex(int hash, int bits) {
    return Math.abs(hash % bits);
  }

  /**
   * Adds a byte array value to the bloom filter.
   *
   * @param bytes The byte array to add.
   * @return The bloom filter.
   */
  public BloomFilter<T> add(byte[] bytes) {
    for (int index : indexes(bytes, numHashes, numBits)) {
      bits.set(index, true);
    }
    return this;
  }

  /**
   * Adds a value to the bloom filter.
   *
   * The value is hashed via the UTF-8 encoding of its {@code toString()} representation.
   *
   * @param value The value to add.
   * @return The bloom filter.
   */
  public BloomFilter<T> add(T value) {
    return add(value.toString().getBytes(StandardCharsets.UTF_8));
  }

  /**
   * Returns a boolean value indicating whether the bloom filter *might* contain the given bytes.
   *
   * @param bytes The bytes to check.
   * @return Indicates whether the bloom filter *might* contain the given bytes. A {@code false} result means
   *         the bytes were definitely never added.
   */
  public boolean contains(byte[] bytes) {
    for (int index : indexes(bytes, numHashes, numBits)) {
      if (!bits.get(index)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns a boolean value indicating whether the bloom filter *might* contain the given value.
   *
   * The value is hashed via the UTF-8 encoding of its {@code toString()} representation.
   *
   * @param value The value to check.
   * @return Indicates whether the bloom filter *might* contain the given value.
   */
  public boolean contains(T value) {
    return contains(value.toString().getBytes(StandardCharsets.UTF_8));
  }

  /**
   * Returns the number of bits in the bloom filter.
   *
   * @return The number of bits in the bloom filter.
   */
  public int size() {
    return numBits;
  }

  /**
   * Combines the given bloom filter with this bloom filter.
   *
   * The bits of the given filter are OR-ed into this filter; the given filter is not modified.
   *
   * @param filter The filter to combine. Must be a different instance with the same number of bits and hashes.
   * @return The combined filter.
   */
  public BloomFilter<T> combine(BloomFilter<T> filter) {
    Assert.arg(filter, filter != this, "cannot combine a bloom filter with itself");
    Assert.arg(filter, filter.numBits == numBits, "cannot combine a bloom filter with a different number of bits");
    Assert.arg(filter, filter.numHashes == numHashes, "cannot combine a bloom filter with a different number of hashes");
    bits.or(filter.bits);
    return this;
  }

  /**
   * Calculates the probability that a false positive will occur given the current contents of the filter.
   *
   * A lookup reports a false positive when all of its hash indexes land on bits that are already set, so the
   * estimate is the fraction of set bits raised to the number of hashes.
   */
  private double calculateFalsePositiveProbability() {
    // cardinality() counts the bits that are set; size() would only report the allocated capacity.
    return Math.pow((double) bits.cardinality() / numBits, numHashes);
  }

  @Override
  public boolean equals(Object object) {
    if (object instanceof BloomFilter) {
      BloomFilter<?> filter = (BloomFilter<?>) object;
      return filter.numBits == numBits && filter.numHashes == numHashes && filter.bits.equals(bits);
    }
    return false;
  }

  @Override
  public int hashCode() {
    return Objects.hash(numHashes, numBits, bits);
  }

  @Override
  public String toString() {
    return String.format("BloomFilter[probability=%f]", calculateFalsePositiveProbability());
  }

}
19 changes: 17 additions & 2 deletions util/src/main/java/net/kuujo/copycat/util/internal/Hash.java
Expand Up @@ -28,7 +28,7 @@ public final class Hash {
private static final int C2 = 0x1b873593;

/**
* Hashes the given string value.
* Hashes the given value.
*
* Taken from Guava:
* https://github.com/google/guava/blob/master/guava/src/com/google/common/hash/Murmur3_32HashFunction.java
Expand All @@ -39,7 +39,22 @@ public final class Hash {
* - Kurt Alfred Kluever
*/
public static int hash32(byte[] bytes) {
int h1 = 0;
return hash32(bytes, 0);
}

/**
* Hashes the given value.
*
* Taken from Guava:
* https://github.com/google/guava/blob/master/guava/src/com/google/common/hash/Murmur3_32HashFunction.java
*
* Authors:
* - Austin Appleby
* - Dimitris Andreou
* - Kurt Alfred Kluever
*/
public static int hash32(byte[] bytes, int seed) {
int h1 = seed;
int length = 0;

ByteBuffer buffer = ByteBuffer.allocate(11).order(ByteOrder.LITTLE_ENDIAN);
Expand Down
41 changes: 41 additions & 0 deletions util/src/test/java/net/kuujo/copycat/util/BloomFilterTest.java
@@ -0,0 +1,41 @@
/*
* Copyright 2015 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.kuujo.copycat.util;

import net.kuujo.copycat.util.internal.BloomFilter;
import org.testng.Assert;
import org.testng.annotations.Test;

/**
* Bloom filter test.
*
* @author <a href="http://github.com/kuujo">Jordan Halterman</a>
*/
@Test
public class BloomFilterTest {

  /**
   * Tests adding an element to a bloom filter and then checking that the filter contains that element.
   *
   * Also checks that a value which was never added is reported as absent, and prints the filter's string
   * representation.
   */
  public void testAddContains() {
    BloomFilter<String> filter = new BloomFilter<>(.1, 100);
    filter.add("Hello world!");
    Assert.assertTrue(filter.contains("Hello world!"));
    Assert.assertFalse(filter.contains("Hello world again!"));
    System.out.println(filter.toString());
  }

}
2 changes: 2 additions & 0 deletions util/src/test/java/net/kuujo/copycat/util/HashTest.java
Expand Up @@ -33,6 +33,8 @@ public class HashTest {
public void testHash() {
  String alphabet = "abcdefghijklmnopqrstuvwxyz";

  // Hashing the same input twice without an explicit seed must give the same result.
  int unseededFirst = Hash.hash32(alphabet.getBytes());
  int unseededSecond = Hash.hash32(alphabet.getBytes());
  Assert.assertEquals(unseededFirst, unseededSecond);

  // Hashing the same input twice with the same explicit seed must give the same result.
  int seededFirst = Hash.hash32(alphabet.getBytes(), 2);
  int seededSecond = Hash.hash32(alphabet.getBytes(), 2);
  Assert.assertEquals(seededFirst, seededSecond);

  // The unseeded hash and the hash with a seed of 2 are expected to differ for this input.
  int unseeded = Hash.hash32(alphabet.getBytes());
  int seeded = Hash.hash32(alphabet.getBytes(), 2);
  Assert.assertNotEquals(unseeded, seeded);
}

}

0 comments on commit 8f7476d

Please sign in to comment.