diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java index 409ca7656e3..b121cb050c6 100644 --- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/CacheChangesTracker.java @@ -16,15 +16,13 @@ */ package org.apache.jackrabbit.oak.plugins.document.cache; -import org.apache.jackrabbit.guava.common.base.Predicate; -import org.apache.jackrabbit.guava.common.hash.BloomFilter; -import org.apache.jackrabbit.guava.common.hash.Funnel; -import org.apache.jackrabbit.guava.common.hash.PrimitiveSink; +import org.apache.jackrabbit.oak.plugins.document.cache.tmp.BloomFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Closeable; import java.util.List; +import java.util.function.Predicate; public class CacheChangesTracker implements Closeable { @@ -48,13 +46,13 @@ public class CacheChangesTracker implements Closeable { } public void invalidateDocument(String key) { - if (keyFilter.apply(key)) { + if (keyFilter.test(key)) { lazyBloomFilter.put(key); } } public boolean mightBeenAffected(String key) { - return keyFilter.apply(key) && lazyBloomFilter.mightContain(key); + return keyFilter.test(key) && lazyBloomFilter.mightContain(key); } @Override @@ -65,7 +63,8 @@ public void close() { if (lazyBloomFilter.filter == null) { LOG.debug("Disposing CacheChangesTracker for {}, no filter was needed", keyFilter); } else { - LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp()); + LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, + lazyBloomFilter.filter.expectedFpp()); } } } @@ -76,14 +75,14 @@ public static class LazyBloomFilter { private final int 
entries; - private volatile BloomFilter filter; + private volatile BloomFilter filter; public LazyBloomFilter(int entries) { this.entries = entries; } public synchronized void put(String entry) { - getFilter().put(entry); + getFilter().add(entry); } public boolean mightContain(String entry) { @@ -91,21 +90,14 @@ public boolean mightContain(String entry) { return false; } else { synchronized (this) { - return filter.mightContain(entry); + return filter.mayContain(entry); } } } - private BloomFilter getFilter() { + private BloomFilter getFilter() { if (filter == null) { - filter = BloomFilter.create(new Funnel() { - private static final long serialVersionUID = -7114267990225941161L; - - @Override - public void funnel(String from, PrimitiveSink into) { - into.putUnencodedChars(from); - } - }, entries, FPP); + filter = BloomFilter.construct(entries, FPP); } return filter; } diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/tmp/BloomFilter.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/tmp/BloomFilter.java new file mode 100644 index 00000000000..e2e9dfa1e2f --- /dev/null +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/tmp/BloomFilter.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.plugins.document.cache.tmp; + +import org.jetbrains.annotations.NotNull; + +/** + * A Bloom filter implementation. + *

+ * Copied for the purpose of testing in oak-store-document; see OAK-10674. + */ +public class BloomFilter { + + private final int k; + private final int arraySize; + private final long[] data; + + private BloomFilter(long[] data, int k) { + this.data = data; + this.k = k; + this.arraySize = data.length; + } + + /** + * Construct a Bloom filter. With a fpp of 0.01, the memory usage is roughly 1 + * byte per entry. + * + * @param bytes the size in number of bytes (eg. 64_000_000 for 64 MB memory + * usage) + * @param fpp the false-positive probability (eg. 0.01 for a 1% false-positive + * probability) + * @return the Bloom filter + */ + public static BloomFilter construct(long n, double fpp) { + long m = calculateBits(n, fpp); + int k = calculateK((double) m / n); + return new BloomFilter(new long[(int) ((m + 63) / 64)], k); + } + + // See also https://hur.st/bloomfilter + + /** + * Calculate the best k parameter for a Bloom filter. + * + * @param bitsPerKey the number of bits per key (eg. 10) + * @return the k parameter + */ + public static int calculateK(double bitsPerKey) { + return Math.max(1, (int) Math.round(bitsPerKey * Math.log(2))); + } + + /** + * Calculate the number of bits needed for a Bloom filter, given a number of entries and the k parameter. + * + * @param n the number of entries (eg. 1_000_000) + * @param fpp the false positive probability (eg. 0.01) + * @return the bits needed + */ + public static long calculateBits(long n, double fpp) { + return (long) Math.ceil((n * Math.log(fpp)) / Math.log(1 / Math.pow(2, Math.log(2)))); + } + + /** + * Calculate the maximum number of entries in the set, given the the memory size + * in bits, and a target false positive probability. + * + * @param bits the number of bits (eg. 10_000_000) + * @param fpp the false positive probability (eg. 
0.01) + * @return the maximum number of entries to be added + */ + public static long calculateN(long bits, double fpp) { + return (long) Math.ceil((bits * Math.log(Math.pow(0.5, Math.log(2))) / Math.log(fpp))); + } + + /** + * Calculate the false positive probability. + * + * @param bits the number of bits (eg. 10_000_000) + * @param fpp the false positive probability (eg. 0.01) + * @return the maximum number of entries to be added + */ + public static double calculateFpp(long n, long bits, int k) { + // p = pow(1 - exp(-k / (m / n)), k) + return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k); + } + + /** + * Get the expected false positive rate for the current entries in the + * filter. This will first calculate the estimated entry count, and then + * calculate the false positive probability from there. + */ + public double expectedFpp() { + return calculateFpp(getEstimatedEntryCount(), getBitCount(), getK()); + } + + /** + * Add an entry. + * + * @param hash the hash value (need to be a high quality hash code, with all + * bits having high entropy) + */ + public void add(long hash) { + long a = (hash >>> 32) | (hash << 32); + long b = hash; + for (int i = 0; i < k; i++) { + data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a; + a += b; + } + } + + /** + * Whether the entry may be in the set. + * + * @param hash the hash value (need to be a high quality hash code, with all + * bits having high entropy) + * @return true if the entry was added, or, with a certain false positive + * probability, even if it was not added + */ + public boolean mayContain(long hash) { + long a = (hash >>> 32) | (hash << 32); + long b = hash; + for (int i = 0; i < k; i++) { + if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) { + return false; + } + a += b; + } + return true; + } + + /** + * Get the number of bits needed for the array. 
+ * + * @return the number of bits + */ + public long getBitCount() { + return data.length * 64L; + } + + public int getK() { + return k; + } + + /** + * Get the estimated entry count (number of distinct items added). This + * operation is relatively slow, as it loops over all the entries. + * + * @return the estimated entry count, or Long.MAX_VALUE if the number can not be estimated. + */ + public long getEstimatedEntryCount() { + long x = 0; + for (long d : data) { + x += Long.bitCount(d); + } + double m = getBitCount(); + return (long) (-(m / k) * Math.log(1 - (x / m))); + } + + /** + * Add an entry. This internally uses the hashCode() method to derive a + * high-quality hash code. + * + * @param obj the object (must not be null) + */ + public void add(@NotNull Object obj) { + add(Hash.hash64(obj.hashCode())); + } + + /** + * Whether the entry may be in the set. This internally uses the hashCode() + * method to derive a high-quality hash code. + * + * @param obj the object (must not be null) + * @return true if the entry was added, or, with a certain false positive + * probability, even if it was not added + */ + public boolean mayContain(@NotNull Object obj) { + return mayContain(Hash.hash64(obj.hashCode())); + } +} \ No newline at end of file diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/tmp/Hash.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/tmp/Hash.java new file mode 100644 index 00000000000..35593604e6e --- /dev/null +++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/cache/tmp/Hash.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.plugins.document.cache.tmp; + +/** + * A hash function utility class. + *

/**
 * A hash function utility class.
 * <p>
 * Copied for the purpose of testing in oak-store-document; see OAK-10674.
 */
public final class Hash {

    /** Seed used by {@link #hash64(long)}, which does not take one from the caller. */
    private static final long DEFAULT_SEED = 100;

    private Hash() {
        // utility class
    }

    /**
     * Calculate a 64-bit hash value from a value, using a seed.
     *
     * The current algorithm uses the finalizer of the MurmurHash3 hash function,
     * but callers shouldn't rely on that.
     *
     * @param x the value
     * @param seed the seed (added to the value before mixing)
     * @return the hash value
     */
    public static long hash64(long x, long seed) {
        x += seed;
        x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL;
        x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L;
        x = x ^ (x >>> 33);
        return x;
    }

    /**
     * Calculate a 64-bit hash value from a value, using a fixed default seed.
     * The input is a 64-bit value and the output is a 64-bit value. Two
     * different inputs are never mapped to the same output. The operation is
     * reversible.
     *
     * @param x the value
     * @return the hash value
     */
    public static long hash64(long x) {
        return hash64(x, DEFAULT_SEED);
    }

    /**
     * Shrink the hash to a value from 0 (inclusive) to n (exclusive). Kind of
     * like modulo, but using multiplication and shift, which are faster to
     * compute. Both arguments are interpreted as unsigned 32-bit values; for
     * n = 0 the result is 0.
     *
     * @param hash the hash
     * @param n the exclusive upper bound of the result
     * @return the reduced value
     */
    public static int reduce(int hash, int n) {
        // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
        return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
    }

}