From 56f4d96f0726b4608786b16d0058c4128f636f55 Mon Sep 17 00:00:00 2001 From: David Cromberge Date: Wed, 10 Feb 2021 17:28:37 +0000 Subject: [PATCH 1/2] Add jaccard similarity capability to Tuple sketches This provides an analog to the current Theta sketch Jaccard Similarity measure, but for Tuple sketches. The new similarity measure only compares the hash table entries, but not the summary values themselves. --- .../BoundsOnRatiosInTupleSketchedSets.java | 184 +++++++ .../datasketches/tuple/CompactSketch.java | 7 + .../datasketches/tuple/JaccardSimilarity.java | 347 +++++++++++++ .../datasketches/tuple/QuickSelectSketch.java | 6 + .../org/apache/datasketches/tuple/Sketch.java | 8 + ...BoundsOnRatiosInTupleSketchedSetsTest.java | 159 ++++++ .../tuple/JaccardSimilarityTest.java | 458 ++++++++++++++++++ 7 files changed, 1169 insertions(+) create mode 100644 src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java create mode 100644 src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java create mode 100644 src/test/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSetsTest.java create mode 100644 src/test/java/org/apache/datasketches/tuple/JaccardSimilarityTest.java diff --git a/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java b/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java new file mode 100644 index 000000000..7bcc72772 --- /dev/null +++ b/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches; + +import org.apache.datasketches.tuple.Sketch; +import org.apache.datasketches.tuple.Summary; + +import static org.apache.datasketches.Util.LONG_MAX_VALUE_AS_DOUBLE; + +/** + * This class is used to compute the bounds on the estimate of the ratio B / A, where A is a Tuple sketch and B is either a Tuple sketch or a Theta sketch. + * + * Note: The theta of B cannot be greater than the theta of A. + * If B is formed as an intersection of A and some other set C, + * then the theta of B is guaranteed to be less than or equal to the theta of A. + * + * @author Kevin Lang + * @author Lee Rhodes + * @author David Cromberge + */ +public final class BoundsOnRatiosInTupleSketchedSets { + + private BoundsOnRatiosInTupleSketchedSets() {} + + /** + * Gets the approximate lower bound for B over A based on a 95% confidence interval + * @param sketchA the Tuple sketch A with summary type S + * @param sketchB the Tuple sketch B with summary type S + * @return the approximate lower bound for B over A + */ + public static double getLowerBoundForBoverA(final Sketch sketchA, final Sketch sketchB) { + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + checkThetas(thetaLongA, thetaLongB); + + final int countB = sketchB.getRetainedEntries(); + final int countA = (thetaLongB == thetaLongA) + ? 
sketchA.getRetainedEntries() + : sketchA.getCountLessThanThetaLong(thetaLongB); + + if (countA <= 0) { return 0; } + final double f = thetaLongB / LONG_MAX_VALUE_AS_DOUBLE; + return BoundsOnRatiosInSampledSets.getLowerBoundForBoverA(countA, countB, f); + } + + /** + * Gets the approximate lower bound for B over A based on a 95% confidence interval + * @param sketchA the Tuple sketch A with summary type S + * @param sketchB the Theta sketch B + * @return the approximate lower bound for B over A + */ + public static double getLowerBoundForBoverA(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB) { + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + checkThetas(thetaLongA, thetaLongB); + + final int countB = sketchB.getRetainedEntries(); + final int countA = (thetaLongB == thetaLongA) + ? sketchA.getRetainedEntries() + : sketchA.getCountLessThanThetaLong(thetaLongB); + + if (countA <= 0) { return 0; } + final double f = thetaLongB / LONG_MAX_VALUE_AS_DOUBLE; + return BoundsOnRatiosInSampledSets.getLowerBoundForBoverA(countA, countB, f); + } + + /** + * Gets the approximate upper bound for B over A based on a 95% confidence interval + * @param sketchA the Tuple sketch A with summary type S + * @param sketchB the Tuple sketch B with summary type S + * @return the approximate upper bound for B over A + */ + public static double getUpperBoundForBoverA(final Sketch sketchA, final Sketch sketchB) { + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + checkThetas(thetaLongA, thetaLongB); + + final int countB = sketchB.getRetainedEntries(); + final int countA = (thetaLongB == thetaLongA) + ? 
sketchA.getRetainedEntries() + : sketchA.getCountLessThanThetaLong(thetaLongB); + + if (countA <= 0) { return 1.0; } + final double f = thetaLongB / LONG_MAX_VALUE_AS_DOUBLE; + return BoundsOnRatiosInSampledSets.getUpperBoundForBoverA(countA, countB, f); + } + + /** + * Gets the approximate upper bound for B over A based on a 95% confidence interval + * @param sketchA the Tuple sketch A with summary type S + * @param sketchB the Theta sketch B + * @return the approximate upper bound for B over A + */ + public static double getUpperBoundForBoverA(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB) { + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + checkThetas(thetaLongA, thetaLongB); + + final int countB = sketchB.getRetainedEntries(true); + final int countA = (thetaLongB == thetaLongA) + ? sketchA.getRetainedEntries() + : sketchA.getCountLessThanThetaLong(thetaLongB); + + if (countA <= 0) { return 1.0; } + final double f = thetaLongB / LONG_MAX_VALUE_AS_DOUBLE; + return BoundsOnRatiosInSampledSets.getUpperBoundForBoverA(countA, countB, f); + } + + /** + * Gets the estimate for B over A + * @param sketchA the Tuple sketch A with summary type S + * @param sketchB the Tuple sketch B with summary type S + * @return the estimate for B over A + */ + public static double getEstimateOfBoverA(final Sketch sketchA, final Sketch sketchB) { + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + checkThetas(thetaLongA, thetaLongB); + + final int countB = sketchB.getRetainedEntries(); + final int countA = (thetaLongB == thetaLongA) + ? 
sketchA.getRetainedEntries() + : sketchA.getCountLessThanThetaLong(thetaLongB); + + if (countA <= 0) { return 0.5; } + + return (double) countB / (double) countA; + } + + /** + * Gets the estimate for B over A + * @param sketchA the Tuple sketch A with summary type S + * @param sketchB the Theta sketch B + * @return the estimate for B over A + */ + public static double getEstimateOfBoverA(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB) { + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + checkThetas(thetaLongA, thetaLongB); + + final int countB = sketchB.getRetainedEntries(true); + final int countA = (thetaLongB == thetaLongA) + ? sketchA.getRetainedEntries() + : sketchA.getCountLessThanThetaLong(thetaLongB); + + if (countA <= 0) { return 0.5; } + + return (double) countB / (double) countA; + } + + static void checkThetas(final long thetaLongA, final long thetaLongB) { + if (thetaLongB > thetaLongA) { + throw new SketchesArgumentException("ThetaLongB cannot be > ThetaLongA."); + } + } +} diff --git a/src/main/java/org/apache/datasketches/tuple/CompactSketch.java b/src/main/java/org/apache/datasketches/tuple/CompactSketch.java index 07d350f0c..2de4c167a 100644 --- a/src/main/java/org/apache/datasketches/tuple/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/CompactSketch.java @@ -27,6 +27,8 @@ import org.apache.datasketches.SketchesArgumentException; import org.apache.datasketches.memory.Memory; +import static org.apache.datasketches.HashOperations.count; + /** * CompactSketches are never created directly. They are created as a result of * the compact() method of an UpdatableSketch or as a result of the getResult() @@ -139,6 +141,11 @@ public int getRetainedEntries() { return hashArr_ == null ? 
0 : hashArr_.length; } + @Override + public int getCountLessThanThetaLong(final long thetaLong) { + return hashArr_ == null ? 0 : count(hashArr_, thetaLong); // hashArr_ may be null; getRetainedEntries() treats that as zero entries + } + // Layout of first 8 bytes: // Long || Start Byte Adr: // Adr: diff --git a/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java b/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java new file mode 100644 index 000000000..cf0df1498 --- /dev/null +++ b/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.tuple; + +import org.apache.datasketches.SketchesArgumentException; + +import static java.lang.Math.max; +import static java.lang.Math.min; +import static org.apache.datasketches.BoundsOnRatiosInTupleSketchedSets.*; +import static org.apache.datasketches.Util.*; + +/** + * Jaccard similarity of two Tuple Sketches, or alternatively, of a Tuple and Theta Sketch. + * + * Note: only retained hash values are compared, and the Tuple summary values are not accounted for in the + * similarity measure. 
+ * + * @author Lee Rhodes + * @author David Cromberge + */ +public final class JaccardSimilarity { + private static final double[] ZEROS = {0.0, 0.0, 0.0}; // LB, Estimate, UB + private static final double[] ONES = {1.0, 1.0, 1.0}; + + /** + * Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index + * J(A,B) = (A ^ B)/(A U B) is used to measure how similar the two sketches are to each + * other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are + * distinct from each other. A Jaccard of .95 means the overlap between the two + * populations is 95% of the union of the two populations. + * + *

Note: For very large pairs of sketches, where the configured nominal entries of the sketches + * are 2^25 or 2^26, this method may produce unpredictable results. + * + * @param sketchA The first argument, a Tuple sketch with summary type S + * @param sketchB The second argument, a Tuple sketch with summary type S + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index. + * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations. + */ + public static double[] jaccard(final Sketch sketchA, final Sketch sketchB, + final SummarySetOperations summarySetOps) { + //Corner case checks + if (sketchA == null || sketchB == null) { return ZEROS.clone(); } + if (sketchA.isEmpty() && sketchB.isEmpty()) { return ONES.clone(); } + if (sketchA.isEmpty() || sketchB.isEmpty()) { return ZEROS.clone(); } + + final int countA = sketchA.getRetainedEntries(); + final int countB = sketchB.getRetainedEntries(); + + //Create the Union + final int minK = 1 << MIN_LG_NOM_LONGS; + final int maxK = 1 << MAX_LG_NOM_LONGS; + final int newK = max(min(ceilingPowerOf2(countA + countB), maxK), minK); + final Union union = new Union<>(newK, summarySetOps); + union.union(sketchA); + union.union(sketchB); + + final Sketch unionAB = union.getResult(); + final long thetaLongUAB = unionAB.getThetaLong(); + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + final int countUAB = unionAB.getRetainedEntries(); + + //Check for identical data + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { + return ONES.clone(); + } + + //Create the Intersection + final Intersection inter = new Intersection<>(summarySetOps); + inter.intersect(sketchA); + inter.intersect(sketchB); + inter.intersect(unionAB); //ensures that intersection is a subset of the union 
+ final Sketch interABU = inter.getResult(); + + final double lb = getLowerBoundForBoverA(unionAB, interABU); + final double est = getEstimateOfBoverA(unionAB, interABU); + final double ub = getUpperBoundForBoverA(unionAB, interABU); + return new double[] {lb, est, ub}; + } + + /** + * Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index + * J(A,B) = (A ^ B)/(A U B) is used to measure how similar the two sketches are to each + * other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are + * distinct from each other. A Jaccard of .95 means the overlap between the two + * populations is 95% of the union of the two populations. + * + *

Note: For very large pairs of sketches, where the configured nominal entries of the sketches + * are 2^25 or 2^26, this method may produce unpredictable results. + * + * @param sketchA The first argument, a Tuple sketch with summary type S + * @param sketchB The second argument, a Theta sketch + * @param summary the given proxy summary for the theta sketch, which doesn't have one. + * This may not be null. + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index. + * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations. + */ + public static double[] jaccard(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB, + final S summary, final SummarySetOperations summarySetOps) { + // Null case checks + if (summary == null) { + throw new SketchesArgumentException("Summary cannot be null."); } + + //Corner case checks + if (sketchA == null || sketchB == null) { return ZEROS.clone(); } + if (sketchA.isEmpty() && sketchB.isEmpty()) { return ONES.clone(); } + if (sketchA.isEmpty() || sketchB.isEmpty()) { return ZEROS.clone(); } + + final int countA = sketchA.getRetainedEntries(); + final int countB = sketchB.getRetainedEntries(true); + + //Create the Union + final int minK = 1 << MIN_LG_NOM_LONGS; + final int maxK = 1 << MAX_LG_NOM_LONGS; + final int newK = max(min(ceilingPowerOf2(countA + countB), maxK), minK); + final Union union = new Union<>(newK, summarySetOps); + union.union(sketchA); + union.union(sketchB, summary); + + final Sketch unionAB = union.getResult(); + final long thetaLongUAB = unionAB.getThetaLong(); + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + final int countUAB = unionAB.getRetainedEntries(); + + //Check for identical data + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == 
thetaLongB) { + return ONES.clone(); + } + + //Create the Intersection + final Intersection inter = new Intersection<>(summarySetOps); + inter.intersect(sketchA); + inter.intersect(sketchB, summary); + inter.intersect(unionAB); //ensures that intersection is a subset of the union + final Sketch interABU = inter.getResult(); + + final double lb = getLowerBoundForBoverA(unionAB, interABU); + final double est = getEstimateOfBoverA(unionAB, interABU); + final double ub = getUpperBoundForBoverA(unionAB, interABU); + return new double[] {lb, est, ub}; + } + + /** + * Returns true if the two given sketches have exactly the same hash values and the same + * theta values. Thus, they are equivalent. + * @param sketchA The first argument, a Tuple sketch with summary type S + * @param sketchB The second argument, a Tuple sketch with summary type S + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @return true if the two given sketches have exactly the same hash values and the same + * theta values. 
+ */ + public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB, + final SummarySetOperations summarySetOps) { + //Corner case checks + if (sketchA == null || sketchB == null) { return false; } + if (sketchA == sketchB) { return true; } + if (sketchA.isEmpty() && sketchB.isEmpty()) { return true; } + if (sketchA.isEmpty() || sketchB.isEmpty()) { return false; } + + final int countA = sketchA.getRetainedEntries(); + final int countB = sketchB.getRetainedEntries(); + + //Create the Union + final Union union = new Union<>(ceilingPowerOf2(countA + countB), summarySetOps); + union.union(sketchA); + union.union(sketchB); + final Sketch unionAB = union.getResult(); + final long thetaLongUAB = unionAB.getThetaLong(); + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + final int countUAB = unionAB.getRetainedEntries(); + + //Check for identical counts and thetas + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { + return true; + } + return false; + } + + /** + * Returns true if the two given sketches have exactly the same hash values and the same + * theta values. Thus, they are equivalent. + * @param sketchA The first argument, a Tuple sketch with summary type S + * @param sketchB The second argument, a Theta sketch + * @param summary the given proxy summary for the theta sketch, which doesn't have one. + * This may not be null. + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @return true if the two given sketches have exactly the same hash values and the same + * theta values. 
+ */ + public static boolean exactlyEqual(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB, + final S summary, final SummarySetOperations summarySetOps) { + // Null case checks + if (summary == null) { + throw new SketchesArgumentException("Summary cannot be null."); } + + //Corner case checks + if (sketchA == null || sketchB == null) { return false; } + if (sketchA.isEmpty() && sketchB.isEmpty()) { return true; } + if (sketchA.isEmpty() || sketchB.isEmpty()) { return false; } + + final int countA = sketchA.getRetainedEntries(); + final int countB = sketchB.getRetainedEntries(true); + + //Create the Union + final Union union = new Union<>(ceilingPowerOf2(countA + countB), summarySetOps); + union.union(sketchA); + union.union(sketchB, summary); + final Sketch unionAB = union.getResult(); + final long thetaLongUAB = unionAB.getThetaLong(); + final long thetaLongA = sketchA.getThetaLong(); + final long thetaLongB = sketchB.getThetaLong(); + final int countUAB = unionAB.getRetainedEntries(); + + //Check for identical counts and thetas + if (countUAB == countA && countUAB == countB + && thetaLongUAB == thetaLongA && thetaLongUAB == thetaLongB) { + return true; + } + return false; + } + + /** + * Tests similarity of a measured Sketch against an expected Sketch. + * Computes the lower bound of the Jaccard index JLB of the measured and + * expected sketches. + * if JLB ≥ threshold, then the sketches are considered to be + * similar with a confidence of 97.7%. + * + * @param measured a Tuple sketch with summary type S to be tested + * @param expected the reference Tuple sketch with summary type S that is considered to be correct. + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * + * @param threshold a real value between zero and one. + * @return if true, the similarity of the two sketches is greater than the given threshold + * with at least 97.7% confidence. 
+ */ + public static boolean similarityTest(final Sketch measured, final Sketch expected, + final SummarySetOperations summarySetOps, + final double threshold) { + //index 0: the lower bound + //index 1: the mean estimate + //index 2: the upper bound + final double jRatioLB = jaccard(measured, expected, summarySetOps)[0]; //choosing the lower bound + return jRatioLB >= threshold; + } + + /** + * Tests similarity of a measured Sketch against an expected Sketch. + * Computes the lower bound of the Jaccard index JLB of the measured and + * expected sketches. + * if JLB ≥ threshold, then the sketches are considered to be + * similar with a confidence of 97.7%. + * + * @param measured a Tuple sketch with summary type S to be tested + * @param expected the reference Theta sketch that is considered to be correct. + * @param summary the given proxy summary for the theta sketch, which doesn't have one. + * This may not be null. + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * + * @param threshold a real value between zero and one. + * @return if true, the similarity of the two sketches is greater than the given threshold + * with at least 97.7% confidence. + */ + public static boolean similarityTest(final Sketch measured, final org.apache.datasketches.theta.Sketch expected, + final S summary, final SummarySetOperations summarySetOps, + final double threshold) { + //index 0: the lower bound + //index 1: the mean estimate + //index 2: the upper bound + final double jRatioLB = jaccard(measured, expected, summary, summarySetOps)[0]; //choosing the lower bound + return jRatioLB >= threshold; + } + + /** + * Tests dissimilarity of a measured Sketch against an expected Sketch. + * Computes the upper bound of the Jaccard index JUB of the measured and + * expected sketches. + * if JUB ≤ threshold, then the sketches are considered to be + * dissimilar with a confidence of 97.7%. 
+ * + * @param measured a Tuple sketch with summary type S to be tested + * @param expected the reference Tuple sketch with summary type S that is considered to be correct. + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @param threshold a real value between zero and one. + * @return true if the upper bound of the Jaccard index is at or below the given threshold, + * i.e. the two sketches are dissimilar with at least 97.7% confidence. + */ + public static boolean dissimilarityTest(final Sketch measured, final Sketch expected, + final SummarySetOperations summarySetOps, + final double threshold) { + //index 0: the lower bound + //index 1: the mean estimate + //index 2: the upper bound + final double jRatioUB = jaccard(measured, expected, summarySetOps)[2]; //choosing the upper bound + return jRatioUB <= threshold; + } + + /** + * Tests dissimilarity of a measured Sketch against an expected Sketch. + * Computes the upper bound of the Jaccard index JUB of the measured and + * expected sketches. + * if JUB ≤ threshold, then the sketches are considered to be + * dissimilar with a confidence of 97.7%. + * + * @param measured a Tuple sketch with summary type S to be tested + * @param expected the reference Theta sketch that is considered to be correct. + * @param summary the given proxy summary for the theta sketch, which doesn't have one. + * This may not be null. + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @param threshold a real value between zero and one. + * @return true if the upper bound of the Jaccard index is at or below the given threshold, + * i.e. the two sketches are dissimilar with at least 97.7% confidence. 
+ */ + public static boolean dissimilarityTest(final Sketch measured, final org.apache.datasketches.theta.Sketch expected, + final S summary, final SummarySetOperations summarySetOps, + final double threshold) { + //index 0: the lower bound + //index 1: the mean estimate + //index 2: the upper bound + final double jRatioUB = jaccard(measured, expected, summary, summarySetOps)[2]; //choosing the upper bound + return jRatioUB <= threshold; + } + +} diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index 99166e289..a7cb60202 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -19,6 +19,7 @@ package org.apache.datasketches.tuple; +import static org.apache.datasketches.HashOperations.count; import static org.apache.datasketches.Util.REBUILD_THRESHOLD; import static org.apache.datasketches.Util.RESIZE_THRESHOLD; import static org.apache.datasketches.Util.ceilingPowerOf2; @@ -210,6 +211,11 @@ public int getRetainedEntries() { return count_; } + @Override + public int getCountLessThanThetaLong(long thetaLong) { + return count(hashTable_, thetaLong); + } + S[] getSummaryTable() { return summaryTable_; } diff --git a/src/main/java/org/apache/datasketches/tuple/Sketch.java b/src/main/java/org/apache/datasketches/tuple/Sketch.java index 02b7bf5c6..9f4059178 100644 --- a/src/main/java/org/apache/datasketches/tuple/Sketch.java +++ b/src/main/java/org/apache/datasketches/tuple/Sketch.java @@ -19,6 +19,7 @@ package org.apache.datasketches.tuple; +import static org.apache.datasketches.HashOperations.count; import static org.apache.datasketches.Util.LS; import org.apache.datasketches.BinomialBoundsN; @@ -145,6 +146,13 @@ public boolean isEstimationMode() { */ public abstract int getRetainedEntries(); + /** + * Gets the number of hash values less than the given theta expressed as a 
long. + * @param thetaLong the given theta as a long between zero and Long.MAX_VALUE. + * @return the number of hash values less than the given thetaLong. + */ + public abstract int getCountLessThanThetaLong(final long thetaLong); + /** * Gets the value of theta as a double between zero and one * @return the value of theta as a double diff --git a/src/test/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSetsTest.java b/src/test/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSetsTest.java new file mode 100644 index 000000000..15311f327 --- /dev/null +++ b/src/test/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSetsTest.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches; + +import org.apache.datasketches.theta.UpdateSketch; +import org.apache.datasketches.theta.UpdateSketchBuilder; +import org.apache.datasketches.tuple.Sketch; +import org.apache.datasketches.tuple.UpdatableSketch; +import org.apache.datasketches.tuple.UpdatableSketchBuilder; +import org.apache.datasketches.tuple.adouble.DoubleSummary; +import org.apache.datasketches.tuple.adouble.DoubleSummaryFactory; +import org.apache.datasketches.tuple.adouble.DoubleSummarySetOperations; +import org.apache.datasketches.tuple.Intersection; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +/** + * @author Lee Rhodes + * @author David Cromberge + */ +@SuppressWarnings("javadoc") +public class BoundsOnRatiosInTupleSketchedSetsTest { + + private final DoubleSummary.Mode umode = DoubleSummary.Mode.Sum; + private final DoubleSummarySetOperations dsso = new DoubleSummarySetOperations(); + private final DoubleSummaryFactory factory = new DoubleSummaryFactory(umode); + private final UpdateSketchBuilder thetaBldr = UpdateSketch.builder(); + private final UpdatableSketchBuilder tupleBldr = new UpdatableSketchBuilder<>(factory); + private final Double constSummary = 1.0; + + @Test + public void checkNormalReturns1() { // tuple, tuple + final UpdatableSketch skA = tupleBldr.build(); //4K + final UpdatableSketch skC = tupleBldr.build(); + final int uA = 10000; + final int uC = 100000; + for (int i = 0; i < uA; i++) { skA.update(i, constSummary); } + for (int i = 0; i < uC; i++) { skC.update(i + (uA / 2), constSummary); } + final Intersection inter = new Intersection<>(dsso); + inter.intersect(skA); + inter.intersect(skC); + final Sketch skB = inter.getResult(); + + double est = BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skB); + double lb = BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA(skA, skB); + double ub = 
BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA(skA, skB); + assertTrue(ub > est); + assertTrue(est > lb); + assertEquals(est, 0.5, .03); + println("ub : " + ub); + println("est: " + est); + println("lb : " + lb); + skA.reset(); //skA is now empty + est = BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skB); + lb = BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA(skA, skB); + ub = BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA(skA, skB); + println("ub : " + ub); + println("est: " + est); + println("lb : " + lb); + skC.reset(); //Now both are empty + est = BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skC); + lb = BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA(skA, skC); + ub = BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA(skA, skC); + println("ub : " + ub); + println("est: " + est); + println("lb : " + lb); + } + + @Test + public void checkNormalReturns2() { // tuple, theta + final UpdatableSketch skA = tupleBldr.build(); //4K + final UpdateSketch skC = thetaBldr.build(); + final int uA = 10000; + final int uC = 100000; + for (int i = 0; i < uA; i++) { skA.update(i, constSummary); } + for (int i = 0; i < uC; i++) { skC.update(i + (uA / 2)); } + final Intersection inter = new Intersection<>(dsso); + inter.intersect(skA); + inter.intersect(skC, factory.newSummary()); + final Sketch skB = inter.getResult(); + + double est = BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skB); + double lb = BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA(skA, skB); + double ub = BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA(skA, skB); + assertTrue(ub > est); + assertTrue(est > lb); + assertEquals(est, 0.5, .03); + println("ub : " + ub); + println("est: " + est); + println("lb : " + lb); + skA.reset(); //skA is now empty + est = BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skB); + lb = BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA(skA, skB); + ub = 
BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA(skA, skB); + println("ub : " + ub); + println("est: " + est); + println("lb : " + lb); + skC.reset(); //Now both are empty + est = BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skC); + lb = BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA(skA, skC); + ub = BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA(skA, skC); + println("ub : " + ub); + println("est: " + est); + println("lb : " + lb); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkAbnormalReturns1() { // tuple, tuple + final UpdatableSketch skA = tupleBldr.build(); //4K + final UpdatableSketch skC = tupleBldr.build(); + final int uA = 100000; + final int uC = 10000; + for (int i = 0; i < uA; i++) { skA.update(i, constSummary); } + for (int i = 0; i < uC; i++) { skC.update(i + (uA / 2), constSummary); } + BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skC); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkAbnormalReturns2() { // tuple, theta + final UpdatableSketch skA = tupleBldr.build(); //4K + final UpdateSketch skC = thetaBldr.build(); + final int uA = 100000; + final int uC = 10000; + for (int i = 0; i < uA; i++) { skA.update(i, constSummary); } + for (int i = 0; i < uC; i++) { skC.update(i + (uA / 2)); } + BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA(skA, skC); + } + + @Test + public void printlnTest() { + println("PRINTING: " + this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(final String s) { + //System.out.println(s); //disable here + } +} diff --git a/src/test/java/org/apache/datasketches/tuple/JaccardSimilarityTest.java b/src/test/java/org/apache/datasketches/tuple/JaccardSimilarityTest.java new file mode 100644 index 000000000..49a0fa15e --- /dev/null +++ b/src/test/java/org/apache/datasketches/tuple/JaccardSimilarityTest.java @@ -0,0 +1,458 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.tuple; + +import org.apache.datasketches.tuple.adouble.DoubleSummary; +import org.apache.datasketches.tuple.adouble.DoubleSummaryFactory; +import org.apache.datasketches.tuple.adouble.DoubleSummarySetOperations; +import org.testng.annotations.Test; + +import org.apache.datasketches.theta.UpdateSketch; +import org.apache.datasketches.theta.UpdateSketchBuilder; +import static org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest; +import static org.apache.datasketches.tuple.JaccardSimilarity.exactlyEqual; +import static org.apache.datasketches.tuple.JaccardSimilarity.jaccard; +import static org.apache.datasketches.tuple.JaccardSimilarity.similarityTest; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +/** + * @author Lee Rhodes + * @author David Cromberge + */ +@SuppressWarnings("javadoc") +public class JaccardSimilarityTest { + private final DoubleSummary.Mode umode = DoubleSummary.Mode.Sum; + private final DoubleSummarySetOperations dsso = new DoubleSummarySetOperations(); + private final DoubleSummaryFactory factory = new DoubleSummaryFactory(umode); + private final UpdateSketchBuilder 
thetaBldr = UpdateSketch.builder(); + private final UpdatableSketchBuilder tupleBldr = new UpdatableSketchBuilder<>(factory); + private final Double constSummary = 1.0; + + @Test + public void checkNullsEmpties1() { // tuple, tuple + int minK = 1 << 12; + double threshold = 0.95; + println("Check nulls & empties, minK: " + minK + "\t Th: " + threshold); + //check both null + double[] jResults = jaccard(null, null, dsso); + boolean state = jResults[1] > threshold; + println("null \t null:\t" + state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(null, null, dsso); + assertFalse(state); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(minK).build(); + final UpdatableSketch expected = tupleBldr.setNominalEntries(minK).build(); + + //check both empty + jResults = jaccard(measured, expected, dsso); + state = jResults[1] > threshold; + println("empty\tempty:\t" + state + "\t" + jaccardString(jResults)); + assertTrue(state); + + state = exactlyEqual(measured, expected, dsso); + assertTrue(state); + + state = exactlyEqual(measured, measured, dsso); + assertTrue(state); + + //adjust one + expected.update(1, constSummary); + jResults = jaccard(measured, expected, dsso); + state = jResults[1] > threshold; + println("empty\t 1:\t" + state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(measured, expected, dsso); + assertFalse(state); + + println(""); + } + + @Test + public void checkNullsEmpties2() { // tuple, theta + int minK = 1 << 12; + double threshold = 0.95; + println("Check nulls & empties, minK: " + minK + "\t Th: " + threshold); + //check both null + double[] jResults = jaccard(null, null, factory.newSummary(), dsso); + boolean state = jResults[1] > threshold; + println("null \t null:\t" + state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(null, null, factory.newSummary(), dsso); + assertFalse(state); + + final UpdatableSketch measured = 
tupleBldr.setNominalEntries(minK).build(); + final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build(); + + //check both empty + jResults = jaccard(measured, expected, factory.newSummary(), dsso); + state = jResults[1] > threshold; + println("empty\tempty:\t" + state + "\t" + jaccardString(jResults)); + assertTrue(state); + + state = exactlyEqual(measured, expected, factory.newSummary(), dsso); + assertTrue(state); + + state = exactlyEqual(measured, measured, dsso); + assertTrue(state); + + //adjust one + expected.update(1); + jResults = jaccard(measured, expected, factory.newSummary(), dsso); + state = jResults[1] > threshold; + println("empty\t 1:\t" + state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(measured, expected, factory.newSummary(), dsso); + assertFalse(state); + + println(""); + } + + @Test + public void checkExactMode1() { // tuple, tuple + int k = 1 << 12; + int u = k; + double threshold = 0.9999; + println("Exact Mode, minK: " + k + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(k).build(); + final UpdatableSketch expected = tupleBldr.setNominalEntries(k).build(); + + for (int i = 0; i < (u-1); i++) { //one short + measured.update(i, constSummary); + expected.update(i, constSummary); + } + + double[] jResults = jaccard(measured, expected, dsso); + boolean state = jResults[1] > threshold; + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + + state = exactlyEqual(measured, expected, dsso); + assertTrue(state); + + measured.update(u-1, constSummary); //now exactly k entries + expected.update(u, constSummary); //now exactly k entries but differs by one + jResults = jaccard(measured, expected, dsso); + state = jResults[1] > threshold; + println(state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(measured, expected, dsso); + assertFalse(state); + + println(""); + } + + @Test + public void checkExactMode2() 
{ // tuple, theta + int k = 1 << 12; + int u = k; + double threshold = 0.9999; + println("Exact Mode, minK: " + k + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(k).build(); + final UpdateSketch expected = thetaBldr.setNominalEntries(k).build(); + + for (int i = 0; i < (u-1); i++) { //one short + measured.update(i, constSummary); + expected.update(i); + } + + double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso); + boolean state = jResults[1] > threshold; + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + + state = exactlyEqual(measured, expected, factory.newSummary(), dsso); + assertTrue(state); + + measured.update(u-1, constSummary); //now exactly k entries + expected.update(u); //now exactly k entries but differs by one + jResults = jaccard(measured, expected, factory.newSummary(), dsso); + state = jResults[1] > threshold; + println(state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(measured, expected, factory.newSummary(), dsso); + assertFalse(state); + + println(""); + } + + @Test + public void checkEstMode1() { // tuple, tuple + int k = 1 << 12; + int u = 1 << 20; + double threshold = 0.9999; + println("Estimation Mode, minK: " + k + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(k).build(); + final UpdatableSketch expected = tupleBldr.setNominalEntries(k).build(); + + for (int i = 0; i < u; i++) { + measured.update(i, constSummary); + expected.update(i, constSummary); + } + + double[] jResults = jaccard(measured, expected, dsso); + boolean state = jResults[1] > threshold; + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + + state = exactlyEqual(measured, expected, dsso); + assertTrue(state); + + for (int i = u; i < (u + 50); i++) { //empirically determined + measured.update(i, constSummary); + } + + jResults = jaccard(measured, expected, dsso); + state = jResults[1] >= 
threshold; + println(state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(measured, expected, dsso); + assertFalse(state); + + println(""); + } + + @Test + public void checkEstMode2() { // tuple, theta + int k = 1 << 12; + int u = 1 << 20; + double threshold = 0.9999; + println("Estimation Mode, minK: " + k + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(k).build(); + final UpdateSketch expected = thetaBldr.setNominalEntries(k).build(); + + for (int i = 0; i < u; i++) { + measured.update(i, constSummary); + expected.update(i); + } + + double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso); + boolean state = jResults[1] > threshold; + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + + state = exactlyEqual(measured, expected, factory.newSummary(), dsso); + assertTrue(state); + + for (int i = u; i < (u + 50); i++) { //empirically determined + measured.update(i, constSummary); + } + + jResults = jaccard(measured, expected, factory.newSummary(), dsso); + state = jResults[1] >= threshold; + println(state + "\t" + jaccardString(jResults)); + assertFalse(state); + + state = exactlyEqual(measured, expected, factory.newSummary(), dsso); + assertFalse(state); + + println(""); + } + + /** + * Enable printing on this test and you will see that the distribution is pretty tight, + * about +/- 0.7%, which is pretty good since the accuracy of the underlying sketch is about + * +/- 1.56%. 
+ */ + @Test + public void checkSimilarity1() { // tuple, tuple + int minK = 1 << 12; + int u1 = 1 << 20; + int u2 = (int) (u1 * 0.95); + double threshold = 0.943; + println("Estimation Mode, minK: " + minK + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(minK).build(); + final UpdatableSketch expected = tupleBldr.setNominalEntries(minK).build(); + + for (int i = 0; i < u1; i++) { + expected.update(i, constSummary); + } + + for (int i = 0; i < u2; i++) { + measured.update(i, constSummary); + } + + double[] jResults = jaccard(measured, expected, dsso); + boolean state = similarityTest(measured, expected, dsso, threshold); + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + //check identity case + state = similarityTest(measured, measured, dsso, threshold); + assertTrue(state); + } + + /** + * Enable printing on this test and you will see that the distribution is pretty tight, + * about +/- 0.7%, which is pretty good since the accuracy of the underlying sketch is about + * +/- 1.56%. 
+ */ + @Test + public void checkSimilarity2() { // tuple, theta + int minK = 1 << 12; + int u1 = 1 << 20; + int u2 = (int) (u1 * 0.95); + double threshold = 0.943; + println("Estimation Mode, minK: " + minK + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(minK).build(); + final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build(); + + for (int i = 0; i < u1; i++) { + expected.update(i); + } + + for (int i = 0; i < u2; i++) { + measured.update(i, constSummary); + } + + double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso); + boolean state = similarityTest(measured, expected, factory.newSummary(), dsso, threshold); + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + //check identity case + state = similarityTest(measured, measured, dsso, threshold); + assertTrue(state); + } + + /** + * Enable printing on this test and you will see that the distribution is much looser, + * about +/- 14%. This is due to the fact that intersections lose accuracy as the ratio of + * intersection to the union becomes a small number. 
+ */ + @Test + public void checkDissimilarity1() { // tuple, tuple + int minK = 1 << 12; + int u1 = 1 << 20; + int u2 = (int) (u1 * 0.05); + double threshold = 0.061; + println("Estimation Mode, minK: " + minK + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(minK).setNominalEntries(minK).build(); + final UpdatableSketch expected = tupleBldr.setNominalEntries(minK).setNominalEntries(minK).build(); + + for (int i = 0; i < u1; i++) { + expected.update(i, constSummary); + } + + for (int i = 0; i < u2; i++) { + measured.update(i, constSummary); + } + + double[] jResults = jaccard(measured, expected, dsso); + boolean state = dissimilarityTest(measured, expected, dsso, threshold); + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + } + + /** + * Enable printing on this test and you will see that the distribution is much looser, + * about +/- 14%. This is due to the fact that intersections lose accuracy as the ratio of + * intersection to the union becomes a small number. 
+ */ + @Test + public void checkDissimilarity2() { // tuple, theta + int minK = 1 << 12; + int u1 = 1 << 20; + int u2 = (int) (u1 * 0.05); + double threshold = 0.061; + println("Estimation Mode, minK: " + minK + "\t Th: " + threshold); + + final UpdatableSketch measured = tupleBldr.setNominalEntries(minK).setNominalEntries(minK).build(); + final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build(); + + for (int i = 0; i < u1; i++) { + expected.update(i); + } + + for (int i = 0; i < u2; i++) { + measured.update(i, constSummary); + } + + double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso); + boolean state = dissimilarityTest(measured, expected, factory.newSummary(), dsso, threshold); + println(state + "\t" + jaccardString(jResults)); + assertTrue(state); + } + + private static String jaccardString(double[] jResults) { + double lb = jResults[0]; + double est = jResults[1]; + double ub = jResults[2]; + return lb + "\t" + est + "\t" + ub + "\t" + ((lb/est) - 1.0) + "\t" + ((ub/est) - 1.0); + } + + @Test + public void checkMinK1() { // tuple, tuple + final UpdatableSketch skA = tupleBldr.build(); //4096 + final UpdatableSketch skB = tupleBldr.build(); //4096 + skA.update(1, constSummary); + skB.update(1, constSummary); + double[] result = jaccard(skA, skB, dsso); + println(result[0] + ", " + result[1] + ", " + result[2]); + for (int i = 1; i < 4096; i++) { + skA.update(i, constSummary); + skB.update(i, constSummary); + } + result = jaccard(skA, skB, dsso); + println(result[0] + ", " + result[1] + ", " + result[2]); + } + + @Test + public void checkMinK2() { // tuple, theta + final UpdatableSketch skA = tupleBldr.build(); //4096 + final UpdateSketch skB = UpdateSketch.builder().build(); //4096 + skA.update(1, constSummary); + skB.update(1); + double[] result = jaccard(skA, skB, factory.newSummary(), dsso); + println(result[0] + ", " + result[1] + ", " + result[2]); + for (int i = 1; i < 4096; i++) { + skA.update(i, constSummary); + 
skB.update(i); + } + result = jaccard(skA, skB, factory.newSummary(), dsso); + println(result[0] + ", " + result[1] + ", " + result[2]); + } + + @Test + public void printlnTest() { + println("PRINTING: "+this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(String s) { + //System.out.println(s); //disable here + } + +} From 89ec198e1b38e46421a9ea92517b1a8383a9bcde Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 22 Feb 2021 10:37:51 -0800 Subject: [PATCH 2/2] Mostly style changes. Added missing finals. Added missing javadoc @param. Reformatted lines that were too long for better readability. Generic descriptors <...> needed a trailing space. Many unnecessary parentheses were removed automatically (likely historical). --- .../BoundsOnRatiosInTupleSketchedSets.java | 46 +++++--- .../datasketches/tuple/CompactSketch.java | 16 +-- .../datasketches/tuple/JaccardSimilarity.java | 100 +++++++++++------- .../datasketches/tuple/QuickSelectSketch.java | 18 ++-- .../org/apache/datasketches/tuple/Sketch.java | 3 +- 5 files changed, 112 insertions(+), 71 deletions(-) diff --git a/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java b/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java index 7bcc72772..d74170ddb 100644 --- a/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java +++ b/src/main/java/org/apache/datasketches/BoundsOnRatiosInTupleSketchedSets.java @@ -19,11 +19,11 @@ package org.apache.datasketches; +import static org.apache.datasketches.Util.LONG_MAX_VALUE_AS_DOUBLE; + import org.apache.datasketches.tuple.Sketch; import org.apache.datasketches.tuple.Summary; -import static org.apache.datasketches.Util.LONG_MAX_VALUE_AS_DOUBLE; - /** * This class is used to compute the bounds on the estimate of the ratio B / A, where: *

    @@ -54,15 +54,18 @@ private BoundsOnRatiosInTupleSketchedSets() {} * Gets the approximate lower bound for B over A based on a 95% confidence interval * @param sketchA the Tuple sketch A with summary type S * @param sketchB the Tuple sketch B with summary type S + * @param Summary * @return the approximate lower bound for B over A */ - public static double getLowerBoundForBoverA(final Sketch sketchA, final Sketch sketchB) { + public static double getLowerBoundForBoverA( + final Sketch sketchA, + final Sketch sketchB) { final long thetaLongA = sketchA.getThetaLong(); final long thetaLongB = sketchB.getThetaLong(); checkThetas(thetaLongA, thetaLongB); final int countB = sketchB.getRetainedEntries(); - final int countA = (thetaLongB == thetaLongA) + final int countA = thetaLongB == thetaLongA ? sketchA.getRetainedEntries() : sketchA.getCountLessThanThetaLong(thetaLongB); @@ -75,15 +78,18 @@ public static double getLowerBoundForBoverA(final Sketch s * Gets the approximate lower bound for B over A based on a 95% confidence interval * @param sketchA the Tuple sketch A with summary type S * @param sketchB the Theta sketch B + * @param Summary * @return the approximate lower bound for B over A */ - public static double getLowerBoundForBoverA(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB) { + public static double getLowerBoundForBoverA( + final Sketch sketchA, + final org.apache.datasketches.theta.Sketch sketchB) { final long thetaLongA = sketchA.getThetaLong(); final long thetaLongB = sketchB.getThetaLong(); checkThetas(thetaLongA, thetaLongB); final int countB = sketchB.getRetainedEntries(); - final int countA = (thetaLongB == thetaLongA) + final int countA = thetaLongB == thetaLongA ? 
sketchA.getRetainedEntries() : sketchA.getCountLessThanThetaLong(thetaLongB); @@ -96,15 +102,18 @@ public static double getLowerBoundForBoverA(final Sketch s * Gets the approximate upper bound for B over A based on a 95% confidence interval * @param sketchA the Tuple sketch A with summary type S * @param sketchB the Tuple sketch B with summary type S + * @param Summary * @return the approximate upper bound for B over A */ - public static double getUpperBoundForBoverA(final Sketch sketchA, final Sketch sketchB) { + public static double getUpperBoundForBoverA( + final Sketch sketchA, + final Sketch sketchB) { final long thetaLongA = sketchA.getThetaLong(); final long thetaLongB = sketchB.getThetaLong(); checkThetas(thetaLongA, thetaLongB); final int countB = sketchB.getRetainedEntries(); - final int countA = (thetaLongB == thetaLongA) + final int countA = thetaLongB == thetaLongA ? sketchA.getRetainedEntries() : sketchA.getCountLessThanThetaLong(thetaLongB); @@ -117,15 +126,18 @@ public static double getUpperBoundForBoverA(final Sketch s * Gets the approximate upper bound for B over A based on a 95% confidence interval * @param sketchA the Tuple sketch A with summary type S * @param sketchB the Theta sketch B + * @param Summary * @return the approximate upper bound for B over A */ - public static double getUpperBoundForBoverA(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB) { + public static double getUpperBoundForBoverA( + final Sketch sketchA, + final org.apache.datasketches.theta.Sketch sketchB) { final long thetaLongA = sketchA.getThetaLong(); final long thetaLongB = sketchB.getThetaLong(); checkThetas(thetaLongA, thetaLongB); final int countB = sketchB.getRetainedEntries(true); - final int countA = (thetaLongB == thetaLongA) + final int countA = thetaLongB == thetaLongA ? 
sketchA.getRetainedEntries() : sketchA.getCountLessThanThetaLong(thetaLongB); @@ -138,15 +150,18 @@ public static double getUpperBoundForBoverA(final Sketch s * Gets the estimate for B over A * @param sketchA the Tuple sketch A with summary type S * @param sketchB the Tuple sketch B with summary type S + * @param Summary * @return the estimate for B over A */ - public static double getEstimateOfBoverA(final Sketch sketchA, final Sketch sketchB) { + public static double getEstimateOfBoverA( + final Sketch sketchA, + final Sketch sketchB) { final long thetaLongA = sketchA.getThetaLong(); final long thetaLongB = sketchB.getThetaLong(); checkThetas(thetaLongA, thetaLongB); final int countB = sketchB.getRetainedEntries(); - final int countA = (thetaLongB == thetaLongA) + final int countA = thetaLongB == thetaLongA ? sketchA.getRetainedEntries() : sketchA.getCountLessThanThetaLong(thetaLongB); @@ -159,15 +174,18 @@ public static double getEstimateOfBoverA(final Sketch sket * Gets the estimate for B over A * @param sketchA the Tuple sketch A with summary type S * @param sketchB the Theta sketch B + * @param Summary * @return the estimate for B over A */ - public static double getEstimateOfBoverA(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB) { + public static double getEstimateOfBoverA( + final Sketch sketchA, + final org.apache.datasketches.theta.Sketch sketchB) { final long thetaLongA = sketchA.getThetaLong(); final long thetaLongB = sketchB.getThetaLong(); checkThetas(thetaLongA, thetaLongB); final int countB = sketchB.getRetainedEntries(true); - final int countA = (thetaLongB == thetaLongA) + final int countA = thetaLongB == thetaLongA ? 
sketchA.getRetainedEntries() : sketchA.getCountLessThanThetaLong(thetaLongB); diff --git a/src/main/java/org/apache/datasketches/tuple/CompactSketch.java b/src/main/java/org/apache/datasketches/tuple/CompactSketch.java index 2de4c167a..9a76587d5 100644 --- a/src/main/java/org/apache/datasketches/tuple/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/CompactSketch.java @@ -19,6 +19,8 @@ package org.apache.datasketches.tuple; +import static org.apache.datasketches.HashOperations.count; + import java.lang.reflect.Array; import java.nio.ByteOrder; @@ -27,8 +29,6 @@ import org.apache.datasketches.SketchesArgumentException; import org.apache.datasketches.memory.Memory; -import static org.apache.datasketches.HashOperations.count; - /** * CompactSketches are never created directly. They are created as a result of * the compact() method of an UpdatableSketch or as a result of the getResult() @@ -81,19 +81,19 @@ private enum Flags { IS_BIG_ENDIAN, IS_EMPTY, HAS_ENTRIES, IS_THETA_INCLUDED } SerializerDeserializer .validateType(mem.getByte(offset++), SerializerDeserializer.SketchType.CompactSketch); final byte flags = mem.getByte(offset++); - final boolean isBigEndian = (flags & (1 << Flags.IS_BIG_ENDIAN.ordinal())) > 0; + final boolean isBigEndian = (flags & 1 << Flags.IS_BIG_ENDIAN.ordinal()) > 0; if (isBigEndian ^ ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { throw new SketchesArgumentException("Byte order mismatch"); } - empty_ = (flags & (1 << Flags.IS_EMPTY.ordinal())) > 0; - final boolean isThetaIncluded = (flags & (1 << Flags.IS_THETA_INCLUDED.ordinal())) > 0; + empty_ = (flags & 1 << Flags.IS_EMPTY.ordinal()) > 0; + final boolean isThetaIncluded = (flags & 1 << Flags.IS_THETA_INCLUDED.ordinal()) > 0; if (isThetaIncluded) { thetaLong_ = mem.getLong(offset); offset += Long.BYTES; } else { thetaLong_ = Long.MAX_VALUE; } - final boolean hasEntries = (flags & (1 << Flags.HAS_ENTRIES.ordinal())) > 0; + final boolean hasEntries = (flags & 1 << 
Flags.HAS_ENTRIES.ordinal()) > 0; if (hasEntries) { int classNameLength = 0; if (version == serialVersionWithSummaryClassNameUID) { @@ -142,7 +142,7 @@ public int getRetainedEntries() { } @Override - public int getCountLessThanThetaLong(long thetaLong) { + public int getCountLessThanThetaLong(final long thetaLong) { return count(hashArr_, thetaLong); } @@ -178,7 +178,7 @@ public byte[] toByteArray() { if (count > 0) { sizeBytes += + Integer.BYTES // count - + (Long.BYTES * count) + summariesBytesLength; + + Long.BYTES * count + summariesBytesLength; } final byte[] bytes = new byte[sizeBytes]; int offset = 0; diff --git a/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java b/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java index cf0df1498..1567071a6 100644 --- a/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java +++ b/src/main/java/org/apache/datasketches/tuple/JaccardSimilarity.java @@ -19,18 +19,22 @@ package org.apache.datasketches.tuple; -import org.apache.datasketches.SketchesArgumentException; - import static java.lang.Math.max; import static java.lang.Math.min; -import static org.apache.datasketches.BoundsOnRatiosInTupleSketchedSets.*; -import static org.apache.datasketches.Util.*; +import static org.apache.datasketches.BoundsOnRatiosInTupleSketchedSets.getEstimateOfBoverA; +import static org.apache.datasketches.BoundsOnRatiosInTupleSketchedSets.getLowerBoundForBoverA; +import static org.apache.datasketches.BoundsOnRatiosInTupleSketchedSets.getUpperBoundForBoverA; +import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS; +import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS; +import static org.apache.datasketches.Util.ceilingPowerOf2; + +import org.apache.datasketches.SketchesArgumentException; /** * Jaccard similarity of two Tuple Sketches, or alternatively, of a Tuple and Theta Sketch. 
* - * Note: only retained hash values are compared, and the Tuple summary values are not accounted for in the - * similarity measure. + * <p>Note: only retained hash values are compared, and the Tuple summary values are not accounted for in the + * similarity measure.</p>
    * * @author Lee Rhodes * @author David Cromberge @@ -52,11 +56,14 @@ public final class JaccardSimilarity { * @param sketchA The first argument, a Tuple sketch with summary type S * @param sketchB The second argument, a Tuple sketch with summary type S * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @param Summary * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index. * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations. */ - public static double[] jaccard(final Sketch sketchA, final Sketch sketchB, - final SummarySetOperations summarySetOps) { + public static double[] jaccard( + final Sketch sketchA, + final Sketch sketchB, + final SummarySetOperations summarySetOps) { //Corner case checks if (sketchA == null || sketchB == null) { return ZEROS.clone(); } if (sketchA.isEmpty() && sketchB.isEmpty()) { return ONES.clone(); } @@ -113,11 +120,14 @@ public static double[] jaccard(final Sketch sketchA, final * @param summary the given proxy summary for the theta sketch, which doesn't have one. * This may not be null. * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @param Summary * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index. * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations. 
*/ - public static double[] jaccard(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB, - final S summary, final SummarySetOperations summarySetOps) { + public static double[] jaccard( + final Sketch sketchA, + final org.apache.datasketches.theta.Sketch sketchB, + final S summary, final SummarySetOperations summarySetOps) { // Null case checks if (summary == null) { throw new SketchesArgumentException("Summary cannot be null."); } @@ -169,11 +179,14 @@ public static double[] jaccard(final Sketch sketchA, final * @param sketchA The first argument, a Tuple sketch with summary type S * @param sketchB The second argument, a Tuple sketch with summary type S * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @param Summary * @return true if the two given sketches have exactly the same hash values and the same * theta values. */ - public static boolean exactlyEqual(final Sketch sketchA, final Sketch sketchB, - final SummarySetOperations summarySetOps) { + public static boolean exactlyEqual( + final Sketch sketchA, + final Sketch sketchB, + final SummarySetOperations summarySetOps) { //Corner case checks if (sketchA == null || sketchB == null) { return false; } if (sketchA == sketchB) { return true; } @@ -209,11 +222,14 @@ public static boolean exactlyEqual(final Sketch sketchA, f * @param summary the given proxy summary for the theta sketch, which doesn't have one. * This may not be null. * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. + * @param Summary * @return true if the two given sketches have exactly the same hash values and the same * theta values. 
*/ - public static boolean exactlyEqual(final Sketch sketchA, final org.apache.datasketches.theta.Sketch sketchB, - final S summary, final SummarySetOperations summarySetOps) { + public static boolean exactlyEqual( + final Sketch sketchA, + final org.apache.datasketches.theta.Sketch sketchB, + final S summary, final SummarySetOperations summarySetOps) { // Null case checks if (summary == null) { throw new SketchesArgumentException("Summary cannot be null."); } @@ -253,18 +269,20 @@ public static boolean exactlyEqual(final Sketch sketchA, f * * @param measured a Tuple sketch with summary type S to be tested * @param expected the reference Tuple sketch with summary type S that is considered to be correct. - * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * @param threshold a real value between zero and one. + * @param Summary * @return if true, the similarity of the two sketches is greater than the given threshold * with at least 97.7% confidence. */ - public static boolean similarityTest(final Sketch measured, final Sketch expected, - final SummarySetOperations summarySetOps, - final double threshold) { - //index 0: the lower bound - //index 1: the mean estimate - //index 2: the upper bound - final double jRatioLB = jaccard(measured, expected, summarySetOps)[0]; //choosing the lower bound + public static boolean similarityTest( + final Sketch measured, final Sketch expected, + final SummarySetOperations summarySetOps, + final double threshold) { + //index 0: the lower bound + //index 1: the mean estimate + //index 2: the upper bound + final double jRatioLB = jaccard(measured, expected, summarySetOps)[0]; //choosing the lower bound return jRatioLB >= threshold; } @@ -279,14 +297,16 @@ public static boolean similarityTest(final Sketch measured * @param expected the reference Theta sketch that is considered to be correct. 
* @param summary the given proxy summary for the theta sketch, which doesn't have one. * This may not be null. - * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * @param threshold a real value between zero and one. + * @param Summary * @return if true, the similarity of the two sketches is greater than the given threshold * with at least 97.7% confidence. */ - public static boolean similarityTest(final Sketch measured, final org.apache.datasketches.theta.Sketch expected, - final S summary, final SummarySetOperations summarySetOps, - final double threshold) { + public static boolean similarityTest( + final Sketch measured, final org.apache.datasketches.theta.Sketch expected, + final S summary, final SummarySetOperations summarySetOps, + final double threshold) { //index 0: the lower bound //index 1: the mean estimate //index 2: the upper bound @@ -303,18 +323,20 @@ public static boolean similarityTest(final Sketch measured * * @param measured a Tuple sketch with summary type S to be tested * @param expected the reference Theta sketch that is considered to be correct. - * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * @param threshold a real value between zero and one. + * @param Summary * @return if true, the dissimilarity of the two sketches is greater than the given threshold * with at least 97.7% confidence. 
*/ - public static boolean dissimilarityTest(final Sketch measured, final Sketch expected, - final SummarySetOperations summarySetOps, - final double threshold) { - //index 0: the lower bound - //index 1: the mean estimate - //index 2: the upper bound - final double jRatioUB = jaccard(measured, expected, summarySetOps)[2]; //choosing the upper bound + public static boolean dissimilarityTest( + final Sketch measured, final Sketch expected, + final SummarySetOperations summarySetOps, + final double threshold) { + //index 0: the lower bound + //index 1: the mean estimate + //index 2: the upper bound + final double jRatioUB = jaccard(measured, expected, summarySetOps)[2]; //choosing the upper bound return jRatioUB <= threshold; } @@ -329,14 +351,16 @@ public static boolean dissimilarityTest(final Sketch measu * @param expected the reference Theta sketch that is considered to be correct. * @param summary the given proxy summary for the theta sketch, which doesn't have one. * This may not be null. - * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * + * @param summarySetOps instance of SummarySetOperations used to unify or intersect summaries. * @param threshold a real value between zero and one. + * @param Summary * @return if true, the dissimilarity of the two sketches is greater than the given threshold * with at least 97.7% confidence. 
*/ - public static boolean dissimilarityTest(final Sketch measured, final org.apache.datasketches.theta.Sketch expected, - final S summary, final SummarySetOperations summarySetOps, - final double threshold) { + public static boolean dissimilarityTest( + final Sketch measured, final org.apache.datasketches.theta.Sketch expected, + final S summary, final SummarySetOperations summarySetOps, + final double threshold) { //index 0: the lower bound //index 1: the mean estimate //index 2: the upper bound diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index a7cb60202..fd56b0601 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -159,7 +159,7 @@ private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, SerializerDeserializer.validateType(mem.getByte(offset++), SerializerDeserializer.SketchType.QuickSelectSketch); final byte flags = mem.getByte(offset++); - final boolean isBigEndian = (flags & (1 << Flags.IS_BIG_ENDIAN.ordinal())) > 0; + final boolean isBigEndian = (flags & 1 << Flags.IS_BIG_ENDIAN.ordinal()) > 0; if (isBigEndian ^ ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { throw new SketchesArgumentException("Endian byte order mismatch"); } @@ -167,13 +167,13 @@ private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, lgCurrentCapacity_ = mem.getByte(offset++); lgResizeFactor_ = mem.getByte(offset++); - final boolean isInSamplingMode = (flags & (1 << Flags.IS_IN_SAMPLING_MODE.ordinal())) > 0; + final boolean isInSamplingMode = (flags & 1 << Flags.IS_IN_SAMPLING_MODE.ordinal()) > 0; samplingProbability_ = isInSamplingMode ? 
mem.getFloat(offset) : 1f; if (isInSamplingMode) { offset += Float.BYTES; } - final boolean isThetaIncluded = (flags & (1 << Flags.IS_THETA_INCLUDED.ordinal())) > 0; + final boolean isThetaIncluded = (flags & 1 << Flags.IS_THETA_INCLUDED.ordinal()) > 0; if (isThetaIncluded) { thetaLong_ = mem.getLong(offset); offset += Long.BYTES; @@ -182,7 +182,7 @@ private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, } int count = 0; - final boolean hasEntries = (flags & (1 << Flags.HAS_ENTRIES.ordinal())) > 0; + final boolean hasEntries = (flags & 1 << Flags.HAS_ENTRIES.ordinal()) > 0; if (hasEntries) { count = mem.getInt(offset); offset += Integer.BYTES; @@ -198,7 +198,7 @@ private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, offset += summaryResult.getSize(); insert(hash, summary); } - empty_ = (flags & (1 << Flags.IS_EMPTY.ordinal())) > 0; + empty_ = (flags & 1 << Flags.IS_EMPTY.ordinal()) > 0; setRebuildThreshold(); } @@ -212,7 +212,7 @@ public int getRetainedEntries() { } @Override - public int getCountLessThanThetaLong(long thetaLong) { + public int getCountLessThanThetaLong(final long thetaLong) { return count(hashTable_, thetaLong); } @@ -349,7 +349,7 @@ public byte[] toByteArray() { if (count_ > 0) { sizeBytes += Integer.BYTES; // count } - sizeBytes += (Long.BYTES * count_) + summariesBytesLength; + sizeBytes += Long.BYTES * count_ + summariesBytesLength; final byte[] bytes = new byte[sizeBytes]; int offset = 0; bytes[offset++] = PREAMBLE_LONGS; @@ -401,7 +401,7 @@ public byte[] toByteArray() { @SuppressWarnings("unchecked") void merge(final long hash, final S summary, final SummarySetOperations summarySetOps) { empty_ = false; - if ((hash > 0) && (hash < thetaLong_)) { + if (hash > 0 && hash < thetaLong_) { final int index = findOrInsert(hash); if (index < 0) { insertSummary(~index, (S)summary.copy()); //did not find, so insert @@ -485,7 +485,7 @@ private void resize(final int newSize) { lgCurrentCapacity_ = 
Integer.numberOfTrailingZeros(newSize); count_ = 0; for (int i = 0; i < oldHashTable.length; i++) { - if ((oldSummaryTable[i] != null) && (oldHashTable[i] < thetaLong_)) { + if (oldSummaryTable[i] != null && oldHashTable[i] < thetaLong_) { insert(oldHashTable[i], oldSummaryTable[i]); } } diff --git a/src/main/java/org/apache/datasketches/tuple/Sketch.java b/src/main/java/org/apache/datasketches/tuple/Sketch.java index 9f4059178..1ca7f9e8e 100644 --- a/src/main/java/org/apache/datasketches/tuple/Sketch.java +++ b/src/main/java/org/apache/datasketches/tuple/Sketch.java @@ -19,7 +19,6 @@ package org.apache.datasketches.tuple; -import static org.apache.datasketches.HashOperations.count; import static org.apache.datasketches.Util.LS; import org.apache.datasketches.BinomialBoundsN; @@ -138,7 +137,7 @@ public boolean isEmpty() { * @return true if the sketch is in estimation mode. */ public boolean isEstimationMode() { - return ((thetaLong_ < Long.MAX_VALUE) && !isEmpty()); + return thetaLong_ < Long.MAX_VALUE && !isEmpty(); } /**