From 1dc9a74a6099aeb6cf60b401f24c32104849997e Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 9 Feb 2022 12:17:51 -0800 Subject: [PATCH] Create common KllHelper to contain common methods between float and double helpers. --- .../apache/datasketches/QuantilesHelper.java | 8 +- .../java/org/apache/datasketches/Util.java | 21 ++- .../datasketches/kll/KllDoublesHelper.java | 140 ++---------------- .../datasketches/kll/KllDoublesSketch.java | 31 ++-- .../datasketches/kll/KllFloatsHelper.java | 140 ++---------------- .../datasketches/kll/KllFloatsSketch.java | 31 ++-- .../apache/datasketches/kll/KllHelper.java | 136 +++++++++++++++++ .../kll/KllDoublesSketchTest.java | 8 +- .../kll/KllDoublesValidationTest.java | 5 +- .../datasketches/kll/KllFloatsSketchTest.java | 8 +- .../kll/KllFloatsValidationTest.java | 4 +- 11 files changed, 230 insertions(+), 302 deletions(-) create mode 100644 src/main/java/org/apache/datasketches/kll/KllHelper.java diff --git a/src/main/java/org/apache/datasketches/QuantilesHelper.java b/src/main/java/org/apache/datasketches/QuantilesHelper.java index a1983a590..c7546569c 100644 --- a/src/main/java/org/apache/datasketches/QuantilesHelper.java +++ b/src/main/java/org/apache/datasketches/QuantilesHelper.java @@ -20,7 +20,7 @@ package org.apache.datasketches; /** - * Common static methods for quantiles sketches + * Common static methods for classic quantiles and KLL sketches */ public class QuantilesHelper { @@ -29,7 +29,7 @@ public class QuantilesHelper { * An array of {1,1,1,0} becomes {0,1,2,3} * @param array of weights where first element is zero * @return total weight - */ + */ //also used by KLL public static long convertToPrecedingCummulative(final long[] array) { long subtotal = 0; for (int i = 0; i < array.length; i++) { @@ -46,7 +46,7 @@ public static long convertToPrecedingCummulative(final long[] array) { * @param phi the fractional position where: 0 ≤ φ ≤ 1.0. 
* @param n the size of the stream * @return the index, a value between 0 and n-1. - */ + */ //also used by KLL public static long posOfPhi(final double phi, final long n) { final long pos = (long) Math.floor(phi * n); return pos == n ? n - 1 : pos; //avoids ArrayIndexOutOfBoundException @@ -57,7 +57,7 @@ public static long posOfPhi(final double phi, final long n) { * @param wtArr the cumulative weights array consisting of chunks * @param pos the position * @return the index of the chunk containing the position - */ + */ //also used by KLL public static int chunkContainingPos(final long[] wtArr, final long pos) { final int nominalLength = wtArr.length - 1; /* remember, wtArr contains an "extra" position */ assert nominalLength > 0; diff --git a/src/main/java/org/apache/datasketches/Util.java b/src/main/java/org/apache/datasketches/Util.java index c41e423b9..f0fee42c3 100644 --- a/src/main/java/org/apache/datasketches/Util.java +++ b/src/main/java/org/apache/datasketches/Util.java @@ -771,7 +771,7 @@ public static double pwrLawNextDouble(final int ppo, final double curPoint, return next; } - //Checks + //Checks that throw /** * Check the requested offset and length against the allocated size. @@ -821,6 +821,8 @@ public static void checkProbability(final double p, final String argName) { + "\" must be between 0.0 inclusive and 1.0 inclusive: " + p); } + //Boolean Checks + /** * Unsigned compare with longs. * @param n1 A long to be treated as if unsigned. @@ -831,6 +833,23 @@ public static boolean isLessThanUnsigned(final long n1, final long n2) { return n1 < n2 ^ n1 < 0 != n2 < 0; } + /** + * Returns true if given n is even. + * @param n the given n + * @return true if given n is even. + */ + public static boolean isEven(final long n) { + return (n & 1L) == 0; + } + + /** + * Returns true if given n is odd. + * @param n the given n + * @return true if given n is odd. 
+ */ + public static boolean isOdd(final long n) { + return (n & 1L) == 1L; + } //Resources /** diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java index e50488411..c8a65dda0 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java @@ -19,7 +19,8 @@ package org.apache.datasketches.kll; -import static org.apache.datasketches.Util.floorPowerOf2; +import static org.apache.datasketches.Util.isEven; +import static org.apache.datasketches.Util.isOdd; import java.util.Arrays; import java.util.Random; @@ -33,14 +34,6 @@ */ public class KllDoublesHelper { - static boolean isEven(final int value) { - return (value & 1) == 0; - } - - static boolean isOdd(final int value) { - return (value & 1) == 1; - } - /** * Checks the sequential validity of the given array of double values. * They must be unique, monotonically increasing and not NaN. @@ -58,116 +51,7 @@ public static void validateDoubleValues(final double[] values) { } } - /** - * Copy the old array into a new larger array. - * The extra space is at the top. - * @param oldArr the given old array with data - * @param newLen the new length larger than the oldArr.length. - * @return the new array - */ - static int[] growIntArray(final int[] oldArr, final int newLen) { - final int oldLen = oldArr.length; - assert newLen > oldLen; - final int[] newArr = new int[newLen]; - System.arraycopy(oldArr, 0, newArr, 0, oldLen); - return newArr; - } - - /** - * Returns the upper bound of the number of levels based on n. - * @param n the length of the stream - * @return floor( log_2(n) ) - */ - static int ubOnNumLevels(final long n) { - return 1 + Long.numberOfTrailingZeros(floorPowerOf2(n)); - } - - /** - * Returns the maximum number of items that this sketch can handle - * @param k The sizing / accuracy parameter of the sketch in items. 
- * Note: this method actually works for k values up to k = 2^29 and 61 levels, - * however only k values up to (2^16 - 1) are currently used by the sketch. - * @param m the size of the smallest level in items. - * @param numLevels the upper bound number of levels based on n items. - * @return the total item capacity of the sketch. - */ - static int computeTotalCapacity(final int k, final int m, final int numLevels) { - long total = 0; - for (int h = 0; h < numLevels; h++) { - total += levelCapacity(k, numLevels, h, m); - } - return (int) total; - } - - /** - * Returns the capacity of a specific level. - * @param k the accuracy parameter of the sketch. Maximum is 2^29. - * @param numLevels the number of current levels in the sketch. Maximum is 61. - * @param height the zero-based index of a level with respect to the smallest level. - * This varies from 0 to 60. - * @param minWidth the minimum level width. Default is 8. - * @return the capacity of a specific level - */ - static int levelCapacity(final int k, final int numLevels, final int height, final int minWidth) { - assert (k <= (1 << 29)); - assert (numLevels >= 1) && (numLevels <= 61); - assert (height >= 0) && (height < numLevels); - final int depth = numLevels - height - 1; - return (int) Math.max(minWidth, intCapAux(k, depth)); - } - - /** - * Computes the actual capacity of a given level given its depth index. - * If the depth of levels exceeds 30, this uses a folding technique to accurately compute the - * actual level capacity up to a depth of 60. Without folding, the internal calculations would - * exceed the capacity of a long. - * @param k the configured k of the sketch - * @param depth the zero-based index of the level being computed. - * @return the actual capacity of a given level given its depth index. 
- */ - private static long intCapAux(final int k, final int depth) { - if (depth <= 30) { return intCapAuxAux(k, depth); } - final int half = depth / 2; - final int rest = depth - half; - final long tmp = intCapAuxAux(k, half); - return intCapAuxAux(tmp, rest); - } - - /** - * Performs the integer based calculation of an individual level (or folded level). - * @param k the configured k of the sketch - * @param depth depth the zero-based index of the level being computed. - * @return the actual capacity of a given level given its depth index. - */ - private static long intCapAuxAux(final long k, final int depth) { - final long twok = k << 1; // for rounding pre-multiply by 2 - final long tmp = ((twok << depth) / powersOfThree[depth]); - final long result = ((tmp + 1L) >>> 1); // add 1 and divide by 2 - assert (result <= k); - return result; - } - - /** - * This is the exact powers of 3 from 3^0 to 3^30 where the exponent is the index - */ - private static final long[] powersOfThree = - new long[] {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, - 1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467, - 3486784401L, 10460353203L, 31381059609L, 94143178827L, 282429536481L, - 847288609443L, 2541865828329L, 7625597484987L, 22876792454961L, 68630377364883L, - 205891132094649L}; - - static long sumTheSampleWeights(final int num_levels, final int[] levels) { - long total = 0; - long weight = 1; - for (int i = 0; i < num_levels; i++) { - total += weight * (levels[i + 1] - levels[i]); - weight *= 2; - } - return total; - } - - static void mergeSortedArrays( + static void mergeSortedDoubleArrays( final double[] bufA, final int startA, final int lenA, final double[] bufB, final int startB, final int lenB, final double[] bufC, final int startC) { @@ -230,7 +114,7 @@ static void mergeSortedArrays( * @param isLevelZeroSorted true if this.level 0 is sorted * @return int array of: {numLevels, targetItemCount, currentItemCount) */ - static int[] 
generalCompress( + static int[] generalDoublesCompress( final int k, final int m, final int numLevelsIn, @@ -243,7 +127,7 @@ static int[] generalCompress( assert numLevelsIn > 0; // things are too weird if zero levels are allowed int numLevels = numLevelsIn; int currentItemCount = inLevels[numLevels] - inLevels[0]; // decreases with each compaction - int targetItemCount = computeTotalCapacity(k, m, numLevels); // increases if we add levels + int targetItemCount = KllHelper.computeTotalCapacity(k, m, numLevels); // increases if we add levels boolean doneYet = false; outLevels[0] = 0; int curLevel = -1; @@ -260,7 +144,7 @@ static int[] generalCompress( final int rawLim = inLevels[curLevel + 1]; final int rawPop = rawLim - rawBeg; - if ((currentItemCount < targetItemCount) || (rawPop < levelCapacity(k, numLevels, curLevel, m))) { + if ((currentItemCount < targetItemCount) || (rawPop < KllHelper.levelCapacity(k, numLevels, curLevel, m))) { // copy level over as is // because inBuf and outBuf could be the same, make sure we are not moving data upwards! 
assert (rawBeg >= outLevels[curLevel]); @@ -290,10 +174,10 @@ static int[] generalCompress( } if (popAbove == 0) { // Level above is empty, so halve up - randomlyHalveUp(inBuf, adjBeg, adjPop, random); + randomlyHalveUpDoubles(inBuf, adjBeg, adjPop, random); } else { // Level above is nonempty, so halve down, then merge up - randomlyHalveDown(inBuf, adjBeg, adjPop, random); - mergeSortedArrays(inBuf, adjBeg, halfAdjPop, inBuf, rawLim, popAbove, inBuf, adjBeg + halfAdjPop); + randomlyHalveDownDoubles(inBuf, adjBeg, adjPop, random); + mergeSortedDoubleArrays(inBuf, adjBeg, halfAdjPop, inBuf, rawLim, popAbove, inBuf, adjBeg + halfAdjPop); } // track the fact that we just eliminated some data @@ -306,7 +190,7 @@ static int[] generalCompress( // This creates some more capacity (the size of the new bottom level) if (curLevel == (numLevels - 1)) { numLevels++; - targetItemCount += levelCapacity(k, numLevels, 0, m); + targetItemCount += KllHelper.levelCapacity(k, numLevels, 0, m); } } // end of code for compacting a level @@ -323,7 +207,7 @@ static int[] generalCompress( } //This must be modified for validation - static void randomlyHalveDown(final double[] buf, final int start, final int length, final Random random) { + static void randomlyHalveDownDoubles(final double[] buf, final int start, final int length, final Random random) { assert isEven(length); final int half_length = length / 2; final int offset = random.nextInt(2); // disable for validation @@ -336,7 +220,7 @@ static void randomlyHalveDown(final double[] buf, final int start, final int len } //This must be modified for validation - static void randomlyHalveUp(final double[] buf, final int start, final int length, final Random random) { + static void randomlyHalveUpDoubles(final double[] buf, final int start, final int length, final Random random) { assert isEven(length); final int half_length = length / 2; final int offset = random.nextInt(2); // disable for validation diff --git 
a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 56a177e4c..11752e919 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -27,6 +27,7 @@ import static java.lang.Math.min; import static java.lang.Math.pow; import static java.lang.Math.round; +import static org.apache.datasketches.Util.isOdd; import java.util.Arrays; import java.util.Random; @@ -331,7 +332,7 @@ private KllDoublesSketch(final Memory mem) { } levels_ = new int[numLevels_ + 1]; int offset = isSingleItem ? DATA_START_SINGLE_ITEM : DATA_START; - final int capacity = KllDoublesHelper.computeTotalCapacity(k_, m_, numLevels_); + final int capacity = KllHelper.computeTotalCapacity(k_, m_, numLevels_); if (isSingleItem) { levels_[0] = capacity - 1; } else { @@ -533,8 +534,8 @@ public int getNumRetained() { * @return upper bound on the serialized size */ public static int getMaxSerializedSizeBytes(final int k, final long n) { - final int numLevels = KllDoublesHelper.ubOnNumLevels(n); - final int maxNumItems = KllDoublesHelper.computeTotalCapacity(k, DEFAULT_M, numLevels); + final int numLevels = KllHelper.ubOnNumLevels(n); + final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels); return getSerializedSizeBytes(numLevels, maxNumItems); } @@ -847,7 +848,7 @@ public String toString(final boolean withLevels, final boolean withData) { .append(" level, offset: nominal capacity, actual size").append(Util.LS); for (int i = 0; i < numLevels_; i++) { sb.append(" ").append(i).append(", ").append(levels_[i]).append(": ") - .append(KllDoublesHelper.levelCapacity(k_, numLevels_, i, m_)) + .append(KllHelper.levelCapacity(k_, numLevels_, i, m_)) .append(", ").append(safeLevelSize(i)).append(Util.LS); } sb.append("### End sketch levels").append(Util.LS); @@ -1008,7 +1009,7 @@ private void compressWhileUpdating() { // +2 
is OK because we already added a new top level if necessary final int popAbove = levels_[level + 2] - rawLim; final int rawPop = rawLim - rawBeg; - final boolean oddPop = KllDoublesHelper.isOdd(rawPop); + final boolean oddPop = isOdd(rawPop); final int adjBeg = oddPop ? rawBeg + 1 : rawBeg; final int adjPop = oddPop ? rawPop - 1 : rawPop; final int halfAdjPop = adjPop / 2; @@ -1018,10 +1019,10 @@ private void compressWhileUpdating() { Arrays.sort(items_, adjBeg, adjBeg + adjPop); } if (popAbove == 0) { - KllDoublesHelper.randomlyHalveUp(items_, adjBeg, adjPop, random); + KllDoublesHelper.randomlyHalveUpDoubles(items_, adjBeg, adjPop, random); } else { - KllDoublesHelper.randomlyHalveDown(items_, adjBeg, adjPop, random); - KllDoublesHelper.mergeSortedArrays( + KllDoublesHelper.randomlyHalveDownDoubles(items_, adjBeg, adjPop, random); + KllDoublesHelper.mergeSortedDoubleArrays( items_, adjBeg, halfAdjPop, items_, rawLim, popAbove, items_, adjBeg + halfAdjPop); @@ -1057,7 +1058,7 @@ private int findLevelToCompact() { // while (true) { assert level < numLevels_; final int pop = levels_[level + 1] - levels_[level]; - final int cap = KllDoublesHelper.levelCapacity(k_, numLevels_, level, m_); + final int cap = KllHelper.levelCapacity(k_, numLevels_, level, m_); if (pop >= cap) { return level; } @@ -1074,10 +1075,10 @@ private void addEmptyTopLevelToCompletelyFullSketch() { // note that merging MIGHT over-grow levels_, in which case we might not have to grow it here if (levels_.length < numLevels_ + 2) { - levels_ = KllDoublesHelper.growIntArray(levels_, numLevels_ + 2); + levels_ = KllHelper.growIntArray(levels_, numLevels_ + 2); } - final int deltaCap = KllDoublesHelper.levelCapacity(k_, numLevels_ + 1, 0, m_); + final int deltaCap = KllHelper.levelCapacity(k_, numLevels_ + 1, 0, m_); final int newTotalCap = curTotalCap + deltaCap; final double[] newBuf = new double[newTotalCap]; @@ -1107,7 +1108,7 @@ private void sortLevelZero() { private void mergeHigherLevels(final 
KllDoublesSketch other, final long finalN) { final int tmpSpaceNeeded = getNumRetained() + other.getNumRetainedAboveLevelZero(); final double[] workbuf = new double[tmpSpaceNeeded]; - final int ub = KllDoublesHelper.ubOnNumLevels(finalN); + final int ub = KllHelper.ubOnNumLevels(finalN); final int[] worklevels = new int[ub + 2]; // ub+1 does not work final int[] outlevels = new int[ub + 2]; @@ -1116,7 +1117,7 @@ private void mergeHigherLevels(final KllDoublesSketch other, final long finalN) populateWorkArrays(other, workbuf, worklevels, provisionalNumLevels); // notice that workbuf is being used as both the input and output here - final int[] result = KllDoublesHelper.generalCompress(k_, m_, provisionalNumLevels, workbuf, + final int[] result = KllDoublesHelper.generalDoublesCompress(k_, m_, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, isLevelZeroSorted_, random); final int finalNumLevels = result[0]; final int finalCapacity = result[1]; @@ -1161,7 +1162,7 @@ private void populateWorkArrays(final KllDoublesSketch other, final double[] wor } else if (selfPop == 0 && otherPop > 0) { System.arraycopy(other.items_, other.levels_[lvl], workbuf, worklevels[lvl], otherPop); } else if (selfPop > 0 && otherPop > 0) { - KllDoublesHelper.mergeSortedArrays(items_, levels_[lvl], selfPop, other.items_, + KllDoublesHelper.mergeSortedDoubleArrays(items_, levels_[lvl], selfPop, other.items_, other.levels_[lvl], otherPop, workbuf, worklevels[lvl]); } } @@ -1178,7 +1179,7 @@ private int getNumRetainedAboveLevelZero() { } private void assertCorrectTotalWeight() { - final long total = KllDoublesHelper.sumTheSampleWeights(numLevels_, levels_); + final long total = KllHelper.sumTheSampleWeights(numLevels_, levels_); assert total == n_; } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java index 6adeed460..d97ce1475 100644 --- 
a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java @@ -19,7 +19,8 @@ package org.apache.datasketches.kll; -import static org.apache.datasketches.Util.floorPowerOf2; +import static org.apache.datasketches.Util.isEven; +import static org.apache.datasketches.Util.isOdd; import java.util.Arrays; import java.util.Random; @@ -33,14 +34,6 @@ */ class KllFloatsHelper { - static boolean isEven(final int value) { - return (value & 1) == 0; - } - - static boolean isOdd(final int value) { - return (value & 1) == 1; - } - /** * Checks the sequential validity of the given array of float values. * They must be unique, monotonically increasing and not NaN. @@ -58,116 +51,7 @@ static void validateFloatValues(final float[] values) { } } - /** - * Copy the old array into a new larger array. - * The extra space is at the top. - * @param oldArr the given old array with data - * @param newLen the new length larger than the oldArr.length. - * @return the new array - */ - static int[] growIntArray(final int[] oldArr, final int newLen) { - final int oldLen = oldArr.length; - assert newLen > oldLen; - final int[] newArr = new int[newLen]; - System.arraycopy(oldArr, 0, newArr, 0, oldLen); - return newArr; - } - - /** - * Returns the upper bound of the number of levels based on n. - * @param n the length of the stream - * @return floor( log_2(n) ) - */ - static int ubOnNumLevels(final long n) { - return 1 + Long.numberOfTrailingZeros(floorPowerOf2(n)); - } - - /** - * Returns the maximum number of items that this sketch can handle - * @param k The sizing / accuracy parameter of the sketch in items. - * Note: this method actually works for k values up to k = 2^29 and 61 levels, - * however only k values up to (2^16 - 1) are currently used by the sketch. - * @param m the size of the smallest level in items. - * @param numLevels the upper bound number of levels based on n items. 
- * @return the total item capacity of the sketch. - */ - static int computeTotalCapacity(final int k, final int m, final int numLevels) { - long total = 0; - for (int h = 0; h < numLevels; h++) { - total += levelCapacity(k, numLevels, h, m); - } - return (int) total; - } - - /** - * Returns the capacity of a specific level. - * @param k the accuracy parameter of the sketch. Maximum is 2^29. - * @param numLevels the number of current levels in the sketch. Maximum is 61. - * @param height the zero-based index of a level with respect to the smallest level. - * This varies from 0 to 60. - * @param minWidth the minimum level width. Default is 8. - * @return the capacity of a specific level - */ - static int levelCapacity(final int k, final int numLevels, final int height, final int minWidth) { - assert (k <= (1 << 29)); - assert (numLevels >= 1) && (numLevels <= 61); - assert (height >= 0) && (height < numLevels); - final int depth = numLevels - height - 1; - return (int) Math.max(minWidth, intCapAux(k, depth)); - } - - /** - * Computes the actual capacity of a given level given its depth index. - * If the depth of levels exceeds 30, this uses a folding technique to accurately compute the - * actual level capacity up to a depth of 60. Without folding, the internal calculations would - * exceed the capacity of a long. - * @param k the configured k of the sketch - * @param depth the zero-based index of the level being computed. - * @return the actual capacity of a given level given its depth index. - */ - private static long intCapAux(final int k, final int depth) { - if (depth <= 30) { return intCapAuxAux(k, depth); } - final int half = depth / 2; - final int rest = depth - half; - final long tmp = intCapAuxAux(k, half); - return intCapAuxAux(tmp, rest); - } - - /** - * Performs the integer based calculation of an individual level (or folded level). - * @param k the configured k of the sketch - * @param depth depth the zero-based index of the level being computed. 
- * @return the actual capacity of a given level given its depth index. - */ - private static long intCapAuxAux(final long k, final int depth) { - final long twok = k << 1; // for rounding pre-multiply by 2 - final long tmp = ((twok << depth) / powersOfThree[depth]); - final long result = ((tmp + 1L) >>> 1); // add 1 and divide by 2 - assert (result <= k); - return result; - } - - /** - * This is the exact powers of 3 from 3^0 to 3^30 where the exponent is the index - */ - private static final long[] powersOfThree = - new long[] {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, - 1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467, - 3486784401L, 10460353203L, 31381059609L, 94143178827L, 282429536481L, - 847288609443L, 2541865828329L, 7625597484987L, 22876792454961L, 68630377364883L, - 205891132094649L}; - - static long sumTheSampleWeights(final int num_levels, final int[] levels) { - long total = 0; - long weight = 1; - for (int i = 0; i < num_levels; i++) { - total += weight * (levels[i + 1] - levels[i]); - weight *= 2; - } - return total; - } - - static void mergeSortedArrays( + static void mergeSortedFloatArrays( final float[] bufA, final int startA, final int lenA, final float[] bufB, final int startB, final int lenB, final float[] bufC, final int startC) { @@ -230,7 +114,7 @@ static void mergeSortedArrays( * @param isLevelZeroSorted true if this.level 0 is sorted * @return int array of: {numLevels, targetItemCount, currentItemCount) */ - static int[] generalCompress( + static int[] generalFloatsCompress( final int k, final int m, final int numLevelsIn, @@ -243,7 +127,7 @@ static int[] generalCompress( assert numLevelsIn > 0; // things are too weird if zero levels are allowed int numLevels = numLevelsIn; int currentItemCount = inLevels[numLevels] - inLevels[0]; // decreases with each compaction - int targetItemCount = computeTotalCapacity(k, m, numLevels); // increases if we add levels + int targetItemCount = 
KllHelper.computeTotalCapacity(k, m, numLevels); // increases if we add levels boolean doneYet = false; outLevels[0] = 0; int curLevel = -1; @@ -260,7 +144,7 @@ static int[] generalCompress( final int rawLim = inLevels[curLevel + 1]; final int rawPop = rawLim - rawBeg; - if ((currentItemCount < targetItemCount) || (rawPop < levelCapacity(k, numLevels, curLevel, m))) { + if ((currentItemCount < targetItemCount) || (rawPop < KllHelper.levelCapacity(k, numLevels, curLevel, m))) { // copy level over as is // because inBuf and outBuf could be the same, make sure we are not moving data upwards! assert (rawBeg >= outLevels[curLevel]); @@ -290,10 +174,10 @@ static int[] generalCompress( } if (popAbove == 0) { // Level above is empty, so halve up - randomlyHalveUp(inBuf, adjBeg, adjPop, random); + randomlyHalveUpFloats(inBuf, adjBeg, adjPop, random); } else { // Level above is nonempty, so halve down, then merge up - randomlyHalveDown(inBuf, adjBeg, adjPop, random); - mergeSortedArrays(inBuf, adjBeg, halfAdjPop, inBuf, rawLim, popAbove, inBuf, adjBeg + halfAdjPop); + randomlyHalveDownFloats(inBuf, adjBeg, adjPop, random); + mergeSortedFloatArrays(inBuf, adjBeg, halfAdjPop, inBuf, rawLim, popAbove, inBuf, adjBeg + halfAdjPop); } // track the fact that we just eliminated some data @@ -306,7 +190,7 @@ static int[] generalCompress( // This creates some more capacity (the size of the new bottom level) if (curLevel == (numLevels - 1)) { numLevels++; - targetItemCount += levelCapacity(k, numLevels, 0, m); + targetItemCount += KllHelper.levelCapacity(k, numLevels, 0, m); } } // end of code for compacting a level @@ -323,7 +207,7 @@ static int[] generalCompress( } //This must be modified for validation - static void randomlyHalveDown(final float[] buf, final int start, final int length, final Random random) { + static void randomlyHalveDownFloats(final float[] buf, final int start, final int length, final Random random) { assert isEven(length); final int half_length = length / 2; 
final int offset = random.nextInt(2); // disable for validation @@ -336,7 +220,7 @@ static void randomlyHalveDown(final float[] buf, final int start, final int leng } //This must be modified for validation - static void randomlyHalveUp(final float[] buf, final int start, final int length, final Random random) { + static void randomlyHalveUpFloats(final float[] buf, final int start, final int length, final Random random) { assert isEven(length); final int half_length = length / 2; final int offset = random.nextInt(2); // disable for validation diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 81c4d0788..1302bde47 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -27,6 +27,7 @@ import static java.lang.Math.min; import static java.lang.Math.pow; import static java.lang.Math.round; +import static org.apache.datasketches.Util.isOdd; import java.util.Arrays; import java.util.Random; @@ -329,7 +330,7 @@ private KllFloatsSketch(final Memory mem) { } levels_ = new int[numLevels_ + 1]; int offset = isSingleItem ? 
DATA_START_SINGLE_ITEM : DATA_START; - final int capacity = KllFloatsHelper.computeTotalCapacity(k_, m_, numLevels_); + final int capacity = KllHelper.computeTotalCapacity(k_, m_, numLevels_); if (isSingleItem) { levels_[0] = capacity - 1; } else { @@ -531,8 +532,8 @@ public int getNumRetained() { * @return upper bound on the serialized size */ public static int getMaxSerializedSizeBytes(final int k, final long n) { - final int numLevels = KllFloatsHelper.ubOnNumLevels(n); - final int maxNumItems = KllFloatsHelper.computeTotalCapacity(k, DEFAULT_M, numLevels); + final int numLevels = KllHelper.ubOnNumLevels(n); + final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels); return getSerializedSizeBytes(numLevels, maxNumItems); } @@ -845,7 +846,7 @@ public String toString(final boolean withLevels, final boolean withData) { .append(" level, offset: nominal capacity, actual size").append(Util.LS); for (int i = 0; i < numLevels_; i++) { sb.append(" ").append(i).append(", ").append(levels_[i]).append(": ") - .append(KllFloatsHelper.levelCapacity(k_, numLevels_, i, m_)) + .append(KllHelper.levelCapacity(k_, numLevels_, i, m_)) .append(", ").append(safeLevelSize(i)).append(Util.LS); } sb.append("### End sketch levels").append(Util.LS); @@ -1006,7 +1007,7 @@ private void compressWhileUpdating() { // +2 is OK because we already added a new top level if necessary final int popAbove = levels_[level + 2] - rawLim; final int rawPop = rawLim - rawBeg; - final boolean oddPop = KllFloatsHelper.isOdd(rawPop); + final boolean oddPop = isOdd(rawPop); final int adjBeg = oddPop ? rawBeg + 1 : rawBeg; final int adjPop = oddPop ? 
rawPop - 1 : rawPop; final int halfAdjPop = adjPop / 2; @@ -1016,10 +1017,10 @@ private void compressWhileUpdating() { Arrays.sort(items_, adjBeg, adjBeg + adjPop); } if (popAbove == 0) { - KllFloatsHelper.randomlyHalveUp(items_, adjBeg, adjPop, random); + KllFloatsHelper.randomlyHalveUpFloats(items_, adjBeg, adjPop, random); } else { - KllFloatsHelper.randomlyHalveDown(items_, adjBeg, adjPop, random); - KllFloatsHelper.mergeSortedArrays( + KllFloatsHelper.randomlyHalveDownFloats(items_, adjBeg, adjPop, random); + KllFloatsHelper.mergeSortedFloatArrays( items_, adjBeg, halfAdjPop, items_, rawLim, popAbove, items_, adjBeg + halfAdjPop); @@ -1055,7 +1056,7 @@ private int findLevelToCompact() { // while (true) { assert level < numLevels_; final int pop = levels_[level + 1] - levels_[level]; - final int cap = KllFloatsHelper.levelCapacity(k_, numLevels_, level, m_); + final int cap = KllHelper.levelCapacity(k_, numLevels_, level, m_); if (pop >= cap) { return level; } @@ -1072,10 +1073,10 @@ private void addEmptyTopLevelToCompletelyFullSketch() { // note that merging MIGHT over-grow levels_, in which case we might not have to grow it here if (levels_.length < numLevels_ + 2) { - levels_ = KllFloatsHelper.growIntArray(levels_, numLevels_ + 2); + levels_ = KllHelper.growIntArray(levels_, numLevels_ + 2); } - final int deltaCap = KllFloatsHelper.levelCapacity(k_, numLevels_ + 1, 0, m_); + final int deltaCap = KllHelper.levelCapacity(k_, numLevels_ + 1, 0, m_); final int newTotalCap = curTotalCap + deltaCap; final float[] newBuf = new float[newTotalCap]; @@ -1105,7 +1106,7 @@ private void sortLevelZero() { private void mergeHigherLevels(final KllFloatsSketch other, final long finalN) { final int tmpSpaceNeeded = getNumRetained() + other.getNumRetainedAboveLevelZero(); final float[] workbuf = new float[tmpSpaceNeeded]; - final int ub = KllFloatsHelper.ubOnNumLevels(finalN); + final int ub = KllHelper.ubOnNumLevels(finalN); final int[] worklevels = new int[ub + 2]; // ub+1 
does not work final int[] outlevels = new int[ub + 2]; @@ -1114,7 +1115,7 @@ private void mergeHigherLevels(final KllFloatsSketch other, final long finalN) { populateWorkArrays(other, workbuf, worklevels, provisionalNumLevels); // notice that workbuf is being used as both the input and output here - final int[] result = KllFloatsHelper.generalCompress(k_, m_, provisionalNumLevels, workbuf, + final int[] result = KllFloatsHelper.generalFloatsCompress(k_, m_, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, isLevelZeroSorted_, random); final int finalNumLevels = result[0]; final int finalCapacity = result[1]; @@ -1159,7 +1160,7 @@ private void populateWorkArrays(final KllFloatsSketch other, final float[] workb } else if (selfPop == 0 && otherPop > 0) { System.arraycopy(other.items_, other.levels_[lvl], workbuf, worklevels[lvl], otherPop); } else if (selfPop > 0 && otherPop > 0) { - KllFloatsHelper.mergeSortedArrays(items_, levels_[lvl], selfPop, other.items_, + KllFloatsHelper.mergeSortedFloatArrays(items_, levels_[lvl], selfPop, other.items_, other.levels_[lvl], otherPop, workbuf, worklevels[lvl]); } } @@ -1176,7 +1177,7 @@ private int getNumRetainedAboveLevelZero() { } private void assertCorrectTotalWeight() { - final long total = KllFloatsHelper.sumTheSampleWeights(numLevels_, levels_); + final long total = KllHelper.sumTheSampleWeights(numLevels_, levels_); assert total == n_; } diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java new file mode 100644 index 000000000..917b11222 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.Util.floorPowerOf2; + +public class KllHelper { + + /** + * Copy the old array into a new larger array. + * The extra space is at the top. + * @param oldArr the given old array with data + * @param newLen the new length larger than the oldArr.length. + * @return the new array + */ + static int[] growIntArray(final int[] oldArr, final int newLen) { + final int oldLen = oldArr.length; + assert newLen > oldLen; + final int[] newArr = new int[newLen]; + System.arraycopy(oldArr, 0, newArr, 0, oldLen); + return newArr; + } + + /** + * Returns the upper bound of the number of levels based on n. + * @param n the length of the stream + * @return 1 + floor( log_2(n) ) + */ + static int ubOnNumLevels(final long n) { + return 1 + Long.numberOfTrailingZeros(floorPowerOf2(n)); + } + + /** + * Returns the maximum number of items that this sketch can handle + * @param k The sizing / accuracy parameter of the sketch in items. + * Note: this method actually works for k values up to k = 2^29 and 61 levels, + * however only k values up to (2^16 - 1) are currently used by the sketch. + * @param m the size of the smallest level in items. + * @param numLevels the upper bound number of levels based on n items. + * @return the total item capacity of the sketch.
+ */ + static int computeTotalCapacity(final int k, final int m, final int numLevels) { + long total = 0; + for (int h = 0; h < numLevels; h++) { + total += levelCapacity(k, numLevels, h, m); + } + return (int) total; + } + + /** + * Returns the capacity of a specific level. + * @param k the accuracy parameter of the sketch. Maximum is 2^29. + * @param numLevels the number of current levels in the sketch. Maximum is 61. + * @param height the zero-based index of a level with respect to the smallest level. + * This varies from 0 to 60. + * @param minWidth the minimum level width. Default is 8. + * @return the capacity of a specific level + */ + static int levelCapacity(final int k, final int numLevels, final int height, final int minWidth) { + assert (k <= (1 << 29)); + assert (numLevels >= 1) && (numLevels <= 61); + assert (height >= 0) && (height < numLevels); + final int depth = numLevels - height - 1; + return (int) Math.max(minWidth, intCapAux(k, depth)); + } + + /** + * Computes the actual capacity of a given level given its depth index. + * If the depth of levels exceeds 30, this uses a folding technique to accurately compute the + * actual level capacity up to a depth of 60. Without folding, the internal calculations would + * exceed the capacity of a long. + * @param k the configured k of the sketch + * @param depth the zero-based index of the level being computed. + * @return the actual capacity of a given level given its depth index. + */ + private static long intCapAux(final int k, final int depth) { + if (depth <= 30) { return intCapAuxAux(k, depth); } + final int half = depth / 2; + final int rest = depth - half; + final long tmp = intCapAuxAux(k, half); + return intCapAuxAux(tmp, rest); + } + + /** + * Performs the integer based calculation of an individual level (or folded level). + * @param k the configured k of the sketch + * @param depth the zero-based index of the level being computed.
+ * @return the actual capacity of a given level given its depth index. + */ + private static long intCapAuxAux(final long k, final int depth) { + final long twok = k << 1; // for rounding pre-multiply by 2 + final long tmp = ((twok << depth) / powersOfThree[depth]); + final long result = ((tmp + 1L) >>> 1); // add 1 and divide by 2 + assert (result <= k); + return result; + } + + /** + * This is the exact powers of 3 from 3^0 to 3^30 where the exponent is the index + */ + private static final long[] powersOfThree = + new long[] {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, + 1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467, + 3486784401L, 10460353203L, 31381059609L, 94143178827L, 282429536481L, + 847288609443L, 2541865828329L, 7625597484987L, 22876792454961L, 68630377364883L, + 205891132094649L}; + + static long sumTheSampleWeights(final int num_levels, final int[] levels) { + long total = 0; + long weight = 1; + for (int i = 0; i < num_levels; i++) { + total += weight * (levels[i + 1] - levels[i]); + weight *= 2; + } + return total; + } + +} + diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java index a1618453e..7dea40cff 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java @@ -378,21 +378,21 @@ public void getMaxSerializedSizeBytes() { @Test public void checkUbOnNumLevels() { - assertEquals(KllDoublesHelper.ubOnNumLevels(0), 1); + assertEquals(KllHelper.ubOnNumLevels(0), 1); } @Test public void checkIntCapAux() { - int lvlCap = KllDoublesHelper.levelCapacity(10, 61, 0, 8); + int lvlCap = KllHelper.levelCapacity(10, 61, 0, 8); assertEquals(lvlCap, 8); - lvlCap = KllDoublesHelper.levelCapacity(10, 61, 60, 8); + lvlCap = KllHelper.levelCapacity(10, 61, 60, 8); assertEquals(lvlCap, 10); } @Test public void 
checkSuperLargeKandLevels() { //This is beyond what the sketch can be configured for. - final int size = KllDoublesHelper.computeTotalCapacity(1 << 29, 8, 61); + final int size = KllHelper.computeTotalCapacity(1 << 29, 8, 61); assertEquals(size, 1_610_612_846); } diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java index d784c4434..ec1087d70 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java @@ -19,8 +19,9 @@ package org.apache.datasketches.kll; +import static org.apache.datasketches.Util.isOdd; + import org.testng.Assert; -import org.testng.annotations.Test; /* A test record contains: 0. testIndex @@ -157,7 +158,7 @@ public class KllDoublesValidationTest { }; private static int[] makeInputArray(int n, int stride) { - assert KllDoublesHelper.isOdd(stride); + assert isOdd(stride); int mask = (1 << 23) - 1; int cur = 0; int[] arr = new int[n]; diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java index ea4d083b9..03af8f3f0 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java @@ -389,21 +389,21 @@ public void getMaxSerializedSizeBytes() { @Test public void checkUbOnNumLevels() { - assertEquals(KllFloatsHelper.ubOnNumLevels(0), 1); + assertEquals(KllHelper.ubOnNumLevels(0), 1); } @Test public void checkIntCapAux() { - int lvlCap = KllFloatsHelper.levelCapacity(10, 61, 0, 8); + int lvlCap = KllHelper.levelCapacity(10, 61, 0, 8); assertEquals(lvlCap, 8); - lvlCap = KllFloatsHelper.levelCapacity(10, 61, 60, 8); + lvlCap = KllHelper.levelCapacity(10, 61, 60, 8); assertEquals(lvlCap, 10); } @Test public void checkSuperLargeKandLevels() { //This is beyond what the 
sketch can be configured for. - final int size = KllFloatsHelper.computeTotalCapacity(1 << 29, 8, 61); + final int size = KllHelper.computeTotalCapacity(1 << 29, 8, 61); assertEquals(size, 1_610_612_846); } diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java index 88ed89799..71de641ed 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java @@ -19,6 +19,8 @@ package org.apache.datasketches.kll; +import static org.apache.datasketches.Util.isOdd; + import org.testng.Assert; import org.testng.annotations.Test; @@ -157,7 +159,7 @@ public class KllFloatsValidationTest { }; private static int[] makeInputArray(int n, int stride) { - assert KllFloatsHelper.isOdd(stride); + assert isOdd(stride); int mask = (1 << 23) - 1; // because library items are single-precision floats int cur = 0; int[] arr = new int[n];