OAK-10674: DocumentStore: verify that we could use Oak's Bloom filter #1345

Status: Draft. Wants to merge 4 commits into base branch trunk.
CacheChangesTracker.java
@@ -16,15 +16,13 @@
*/
package org.apache.jackrabbit.oak.plugins.document.cache;

- import org.apache.jackrabbit.guava.common.base.Predicate;
- import org.apache.jackrabbit.guava.common.hash.BloomFilter;
- import org.apache.jackrabbit.guava.common.hash.Funnel;
- import org.apache.jackrabbit.guava.common.hash.PrimitiveSink;
+ import org.apache.jackrabbit.oak.plugins.document.cache.tmp.BloomFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.util.List;
+ import java.util.function.Predicate;

public class CacheChangesTracker implements Closeable {

@@ -48,13 +46,13 @@ public class CacheChangesTracker {
}

public void invalidateDocument(String key) {
- if (keyFilter.apply(key)) {
+ if (keyFilter.test(key)) {
lazyBloomFilter.put(key);
}
}

public boolean mightBeenAffected(String key) {
- return keyFilter.apply(key) && lazyBloomFilter.mightContain(key);
+ return keyFilter.test(key) && lazyBloomFilter.mightContain(key);
}

@Override
@@ -65,7 +63,8 @@ public void close() {
if (lazyBloomFilter.filter == null) {
LOG.debug("Disposing CacheChangesTracker for {}, no filter was needed", keyFilter);
} else {
LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter, lazyBloomFilter.filter.expectedFpp());
LOG.debug("Disposing CacheChangesTracker for {}, filter fpp was: {}", keyFilter,
lazyBloomFilter.filter.expectedFpp());
}
}
}
@@ -76,36 +75,29 @@ public static class LazyBloomFilter {

private final int entries;

- private volatile BloomFilter<String> filter;
+ private volatile BloomFilter filter;

public LazyBloomFilter(int entries) {
this.entries = entries;
}

public synchronized void put(String entry) {
- getFilter().put(entry);
+ getFilter().add(entry);
}

public boolean mightContain(String entry) {
if (filter == null) {
return false;
} else {
synchronized (this) {
- return filter.mightContain(entry);
+ return filter.mayContain(entry);
}
}
}

- private BloomFilter<String> getFilter() {
+ private BloomFilter getFilter() {
if (filter == null) {
- filter = BloomFilter.create(new Funnel<String>() {
- private static final long serialVersionUID = -7114267990225941161L;
-
- @Override
- public void funnel(String from, PrimitiveSink into) {
- into.putUnencodedChars(from);
- }
- }, entries, FPP);
+ filter = BloomFilter.construct(entries, FPP);
}
return filter;
}
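For reference, a minimal usage sketch (hypothetical, not part of this patch) of LazyBloomFilter after the switch: the Guava BloomFilter.put/mightContain calls now delegate to add/mayContain on the copied filter, and the key predicate moves from Guava's Predicate.apply to java.util.function.Predicate.test.

// Hypothetical usage sketch, not part of the patch: exercising LazyBloomFilter
// after the switch from the Guava BloomFilter to the copied tmp.BloomFilter.
CacheChangesTracker.LazyBloomFilter lazy = new CacheChangesTracker.LazyBloomFilter(1000);
lazy.put("1:/some/cached/key");                        // delegates to BloomFilter.add(Object)
boolean hit = lazy.mightContain("1:/some/cached/key"); // delegates to BloomFilter.mayContain(Object)
boolean miss = lazy.mightContain("1:/other/key");      // normally false; may be a rare false positive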
BloomFilter.java (new file)
@@ -0,0 +1,196 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.document.cache.tmp;

import org.jetbrains.annotations.NotNull;

/**
* A Bloom filter implementation.
* <p>
* Copied for the purpose of testing in oak-store-document; see OAK-10674.
*/
public class BloomFilter {

private final int k;
private final int arraySize;
private final long[] data;

private BloomFilter(long[] data, int k) {
this.data = data;
this.k = k;
this.arraySize = data.length;
}

/**
* Construct a Bloom filter. With a fpp of 0.01, the memory usage is roughly 1
* byte per entry.
*
* @param n the expected number of entries (eg. 1_000_000)
* @param fpp the false-positive probability (eg. 0.01 for a 1% false-positive
* probability)
* @return the Bloom filter
*/
public static BloomFilter construct(long n, double fpp) {
long m = calculateBits(n, fpp);
int k = calculateK((double) m / n);
return new BloomFilter(new long[(int) ((m + 63) / 64)], k);
}

// See also https://hur.st/bloomfilter

/**
* Calculate the best k parameter for a Bloom filter.
*
* @param bitsPerKey the number of bits per key (eg. 10)
* @return the k parameter
*/
public static int calculateK(double bitsPerKey) {
return Math.max(1, (int) Math.round(bitsPerKey * Math.log(2)));
}

/**
* Calculate the number of bits needed for a Bloom filter, given the number of entries and the target false positive probability.
*
* @param n the number of entries (eg. 1_000_000)
* @param fpp the false positive probability (eg. 0.01)
* @return the bits needed
*/
public static long calculateBits(long n, double fpp) {
return (long) Math.ceil((n * Math.log(fpp)) / Math.log(1 / Math.pow(2, Math.log(2))));
}

/**
* Calculate the maximum number of entries in the set, given the memory size
* in bits, and a target false positive probability.
*
* @param bits the number of bits (eg. 10_000_000)
* @param fpp the false positive probability (eg. 0.01)
* @return the maximum number of entries to be added
*/
public static long calculateN(long bits, double fpp) {
return (long) Math.ceil((bits * Math.log(Math.pow(0.5, Math.log(2))) / Math.log(fpp)));
}

/**
* Calculate the false positive probability.
*
* @param n the number of entries (eg. 1_000_000)
* @param bits the number of bits (eg. 10_000_000)
* @param k the number of hash functions per entry
* @return the false positive probability
*/
public static double calculateFpp(long n, long bits, int k) {
// p = pow(1 - exp(-k / (m / n)), k)
return Math.pow(1 - Math.exp(-k / ((double) bits / n)), k);
}

/**
* Get the expected false positive rate for the current entries in the
* filter. This will first calculate the estimated entry count, and then
* calculate the false positive probability from there.
*/
public double expectedFpp() {
return calculateFpp(getEstimatedEntryCount(), getBitCount(), getK());
}

/**
* Add an entry.
*
* @param hash the hash value (needs to be a high-quality hash code, with all
* bits having high entropy)
*/
public void add(long hash) {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
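// Double hashing (in the style of Kirsch-Mitzenmacher): the k probe positions
// are all derived from the single 64-bit hash. The upper 32 bits of 'a' select
// the word via Hash.reduce, the low 6 bits select the bit within that word
// (via 1L << a), and 'a' advances by 'b' for each of the k probes.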
for (int i = 0; i < k; i++) {
data[Hash.reduce((int) (a >>> 32), arraySize)] |= 1L << a;
a += b;
}
}

/**
* Whether the entry may be in the set.
*
* @param hash the hash value (needs to be a high-quality hash code, with all
* bits having high entropy)
* @return true if the entry was added, or, with a certain false positive
* probability, even if it was not added
*/
public boolean mayContain(long hash) {
long a = (hash >>> 32) | (hash << 32);
long b = hash;
for (int i = 0; i < k; i++) {
if ((data[Hash.reduce((int) (a >>> 32), arraySize)] & 1L << a) == 0) {
return false;
}
a += b;
}
return true;
}

/**
* Get the size of the bit array, in bits.
*
* @return the number of bits
*/
public long getBitCount() {
return data.length * 64L;
}

public int getK() {
return k;
}

/**
* Get the estimated entry count (number of distinct items added). This
* operation is relatively slow, as it loops over all the entries.
*
* @return the estimated entry count, or Long.MAX_VALUE if the number cannot be estimated.
*/
public long getEstimatedEntryCount() {
long x = 0;
for (long d : data) {
x += Long.bitCount(d);
}
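// Standard fill-ratio estimate: n ≈ -(m / k) * ln(1 - X / m),
// where X is the number of set bits and m is the total number of bits.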
double m = getBitCount();
return (long) (-(m / k) * Math.log(1 - (x / m)));
}

/**
* Add an entry. This internally uses the hashCode() method to derive a
* high-quality hash code.
*
* @param obj the object (must not be null)
*/
public void add(@NotNull Object obj) {
add(Hash.hash64(obj.hashCode()));
}

/**
* Whether the entry may be in the set. This internally uses the hashCode()
* method to derive a high-quality hash code.
*
* @param obj the object (must not be null)
* @return true if the entry was added, or, with a certain false positive
* probability, even if it was not added
*/
public boolean mayContain(@NotNull Object obj) {
return mayContain(Hash.hash64(obj.hashCode()));
}
}
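As a rough check of the sizing math above (a hypothetical snippet, not part of the patch): for 1,000,000 entries at fpp = 0.01, calculateBits returns about 9.59 million bits, i.e. roughly 1.2 bytes per entry, and calculateK rounds to 7 probes, which matches the "roughly 1 byte per entry" note on construct.

// Hypothetical sizing sketch, not part of the patch.
long n = 1_000_000;
double fpp = 0.01;
long bits = BloomFilter.calculateBits(n, fpp);      // ~9_585_059 bits, ~1.2 bytes per entry
int k = BloomFilter.calculateK((double) bits / n);  // 7 hash probes per entry

BloomFilter filter = BloomFilter.construct(n, fpp);
for (long i = 0; i < n; i++) {
    filter.add("key-" + i);
}
// For a filter sized this way, expectedFpp() should come out near the requested
// 0.01 and getEstimatedEntryCount() near n.
System.out.println(filter.expectedFpp());
System.out.println(filter.getEstimatedEntryCount());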
Hash.java (new file)
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.document.cache.tmp;

/**
* A hash function utility class.
* <p>
* Copied for the purpose of testing in oak-store-document; see OAK-10674.
*/
public class Hash {

private Hash() {
// utility class
}

/**
* Calculate a 64-bit hash value from a value, using a seed.
*
* The current algorithm uses the finalizer of the MurmurHash3 hash function,
* but callers shouldn't rely on that.
*
* @param x the value
* @param seed the seed
* @return the hash value
*/
public static long hash64(long x, long seed) {
x += seed;
x = (x ^ (x >>> 33)) * 0xff51afd7ed558ccdL;
x = (x ^ (x >>> 33)) * 0xc4ceb9fe1a85ec53L;
x = x ^ (x >>> 33);
return x;
}

/**
* Calculate a 64-bit hash value from a value. The input is a 64-bit value and
* the output is a 64-bit value. Two different inputs are never mapped to the
* same output. The operation is reversible.
*
* @param x the value
* @return the hash value
*/
public static long hash64(long x) {
return hash64(x, 100);
}

/**
* Shrink the hash to a value in the range 0..n-1. Similar to modulo, but using
* multiplication and shift, which are faster to compute.
*
* @param hash the hash
* @param n the exclusive upper bound of the result
* @return the reduced value
*/
public static int reduce(int hash, int n) {
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
return (int) (((hash & 0xffffffffL) * (n & 0xffffffffL)) >>> 32);
}

}
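A small sketch (hypothetical, not part of the patch) of how the two helpers combine: hash64 spreads a possibly low-entropy input over 64 well-mixed bits, and reduce then maps 32 of those bits onto an array index using a multiply-and-shift instead of a modulo.

// Hypothetical sketch, not part of the patch.
long h = Hash.hash64(42L);                        // mix the input into 64 well-distributed bits
int index = Hash.reduce((int) (h >>> 32), 1024);  // index in the range 0..1023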