diff --git a/src/main/java/org/apache/datasketches/QuantilesHelper.java b/src/main/java/org/apache/datasketches/QuantilesHelper.java index c7546569c..f128a0d2c 100644 --- a/src/main/java/org/apache/datasketches/QuantilesHelper.java +++ b/src/main/java/org/apache/datasketches/QuantilesHelper.java @@ -29,7 +29,7 @@ public class QuantilesHelper { * An array of {1,1,1,0} becomes {0,1,2,3} * @param array of weights where first element is zero * @return total weight - */ //also used by KLL + */ //used by classic Quantiles and KLL public static long convertToPrecedingCummulative(final long[] array) { long subtotal = 0; for (int i = 0; i < array.length; i++) { @@ -43,15 +43,28 @@ public static long convertToPrecedingCummulative(final long[] array) { /** * Returns the linear zero-based index (position) of a value in the hypothetical sorted stream of * values of size n. - * @param phi the fractional position where: 0 ≤ φ ≤ 1.0. + * @param rank the fractional position where: 0 ≤ rank ≤ 1.0. * @param n the size of the stream * @return the index, a value between 0 and n-1. - */ //also used by KLL - public static long posOfPhi(final double phi, final long n) { - final long pos = (long) Math.floor(phi * n); + */ //used by classic Quantiles and KLL + public static long posOfRank(final double rank, final long n) { + final long pos = (long) Math.floor(rank * n); return pos == n ? n - 1 : pos; //avoids ArrayIndexOutOfBoundException } + /** + * Returns the linear zero-based index (position) of a value in the hypothetical sorted stream of + * values of size n. + * @param rank the fractional position where: 0 ≤ rank ≤ 1.0. + * @param n the size of the stream + * @return the index, a value between 0 and n-1. + * @deprecated use {@link #posOfRank(double, long)} instead. Version 3.2.0.
+ */ //used by classic Quantiles and KLL + @Deprecated + public static long posOfPhi(final double rank, final long n) { + return posOfRank(rank, n); + } + /** * This is written in terms of a plain array to facilitate testing. * @param wtArr the cumulative weights array consisting of chunks diff --git a/src/main/java/org/apache/datasketches/kll/BaseKllSketch.java b/src/main/java/org/apache/datasketches/kll/BaseKllSketch.java deleted file mode 100644 index be2335f79..000000000 --- a/src/main/java/org/apache/datasketches/kll/BaseKllSketch.java +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.kll; - -import static java.lang.Math.abs; -import static java.lang.Math.ceil; -import static java.lang.Math.exp; -import static java.lang.Math.log; -import static java.lang.Math.max; -import static java.lang.Math.min; -import static java.lang.Math.pow; -import static java.lang.Math.round; - -import java.util.Random; - -import org.apache.datasketches.SketchesArgumentException; - -abstract class BaseKllSketch { - - /* Serialized float sketch layout, more than one item: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts | - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 ||---------------------------------N_LONG---------------------------------------| - * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | - * 2 ||<--------------data----------------| unused |numLevels|-------min K-----------| - * - * - * - * Serialized float sketch layout, Empty and Single Item: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts | - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 || |-------------------data-------------------| - */ - - /* Serialized double sketch layout, more than one item: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts | - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 ||---------------------------------N_LONG---------------------------------------| - * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | - * 2 ||<-------------unused------------------------|numLevels|-------min K-----------| - * || | 24 | - * 3 ||<---------------------------------data----------------------------------------| - * - * Serialized double sketch layout, Empty and Single Item: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts | - * || | 8 | - * 1 
||----------------------------------data----------------------------------------| - */ - - /** - * The default value of K. - */ - public static final int DEFAULT_K = 200; - static final int DEFAULT_M = 8; - static final int MIN_K = DEFAULT_M; - static final int MAX_K = (1 << 16) - 1; // serialized as an unsigned short - - // Preamble byte addresses - static final int PREAMBLE_INTS_BYTE = 0; - static final int SER_VER_BYTE = 1; - static final int FAMILY_BYTE = 2; - static final int FLAGS_BYTE = 3; - static final int K_SHORT = 4; // to 5 - static final int M_BYTE = 6; - // 7 is reserved for future use - // SINGLE ITEM ONLY - static final int DATA_START_SINGLE_ITEM = 8; - - // MULTI-ITEM - static final int N_LONG = 8; // to 15 - static final int MIN_K_SHORT = 16; // to 17 - static final int NUM_LEVELS_BYTE = 18; - - // FLOAT SKETCH 19 is reserved for future use in float sketch - static final int DATA_START_FLOAT = 20; // float sketch, not single item - - // DOUBLE SKETCH 19 to 23 is reserved for future use in double sketch - static final int DATA_START_DOUBLE = 24; // double sketch, not single item - - // Other static values - static final byte SERIAL_VERSION = 1; - static final byte SERIAL_VERSION_SINGLE = 2; // only used to specify the single-item format - static final int PREAMBLE_INTS_EMPTY_SINGLE = 2; // for empty and single item - static final int PREAMBLE_INTS_FLOAT = 5; // not empty or single item - static final int PREAMBLE_INTS_DOUBLE = 6; // not empty or single item - - enum Flags { IS_EMPTY, IS_LEVEL_ZERO_SORTED, IS_SINGLE_ITEM } - - /* - * Data is stored in items_. - * The data for level i lies in positions levels_[i] through levels_[i + 1] - 1 inclusive. - * Hence levels_ must contain (numLevels_ + 1) indices. - * The valid portion of items_ is completely packed, except for level 0. - * Level 0 is filled from the top down. - * - * Invariants: - * 1) After a compaction, or an update, or a merge, all levels are sorted except for level zero. 
- * 2) After a compaction, (sum of capacities) - (sum of items) >= 1, - * so there is room for least 1 more item in level zero. - * 3) There are no gaps except at the bottom, so if levels_[0] = 0, - * the sketch is exactly filled to capacity and must be compacted. - * 4) Sum of weights of retained items == N. - * 5) curTotalCap == items_.length == levels_[numLevels_]. - */ - - final int k_; // configured value of K - final int m_; // configured minimum buffer "width", Must always be DEFAULT_M for now. - - int minK_; // for error estimation after merging with different k - long n_; // number of items input into this sketch - int numLevels_; // one-based number of current levels, - int[] levels_; // array of index offsets into the items[]. Size = numLevels + 1. - boolean isLevelZeroSorted_; - - final boolean compatible; //compatible with quantiles sketch - static final Random random = new Random(); - - /** - * Heap constructor. - * @param k configured size of sketch. Range [m, 2^16] - * @param m minimum level size. Default is 8. - */ - BaseKllSketch(final int k, final int m, final boolean compatible) { - checkK(k); - k_ = k; - minK_ = k; - m_ = m; - numLevels_ = 1; - levels_ = new int[] {k, k}; - isLevelZeroSorted_ = false; - this.compatible = compatible; - } - - // public functions - - /** - * Returns the parameter k - * @return parameter k - */ - public int getK() { - return k_; - } - - /** - * Gets the approximate value of k to use given epsilon, the normalized rank error. - * @param epsilon the normalized rank error between zero and one. - * @param pmf if true, this function returns the value of k assuming the input epsilon - * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function - * returns the value of k assuming the input epsilon is the desired "single-sided" - * epsilon for all the other queries. - * @return the value of k given a value of epsilon. 
- * @see KllDoublesSketch - */ - // constants were derived as the best fit to 99 percentile empirically measured max error in - // thousands of trials - public static int getKFromEpsilon(final double epsilon, final boolean pmf) { - //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false. - final double eps = max(epsilon, 4.7634E-5); - final double kdbl = pmf - ? exp(log(2.446 / eps) / 0.9433) - : exp(log(2.296 / eps) / 0.9723); - final double krnd = round(kdbl); - final double del = abs(krnd - kdbl); - final int k = (int) (del < 1E-6 ? krnd : ceil(kdbl)); - return max(MIN_K, min(MAX_K, k)); - } - - /** - * Returns the length of the input stream. - * @return stream length - */ - public long getN() { - return n_; - } - - /** - * Gets the approximate rank error of this sketch normalized as a fraction between zero and one. - * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function. - * Otherwise, it is the "single-sided" normalized rank error for all the other queries. - * @return if pmf is true, returns the normalized rank error for the getPMF() function. - * Otherwise, it is the "single-sided" normalized rank error for all the other queries. - * @see KllDoublesSketch - */ - public double getNormalizedRankError(final boolean pmf) { - return getNormalizedRankError(minK_, pmf); - } - - /** - * Gets the normalized rank error given k and pmf. - * Static method version of the getNormalizedRankError(boolean). - * @param k the configuration parameter - * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function. - * Otherwise, it is the "single-sided" normalized rank error for all the other queries. - * @return if pmf is true, the normalized rank error for the getPMF() function. - * Otherwise, it is the "single-sided" normalized rank error for all the other queries. 
- * @see KllDoublesSketch - */ - // constants were derived as the best fit to 99 percentile empirically measured max error in - // thousands of trials - public static double getNormalizedRankError(final int k, final boolean pmf) { - return pmf - ? 2.446 / pow(k, 0.9433) - : 2.296 / pow(k, 0.9723); - } - - /** - * Returns the number of retained items (samples) in the sketch. - * @return the number of retained items (samples) in the sketch - */ - public int getNumRetained() { - return levels_[numLevels_] - levels_[0]; - } - - /** - * Returns true if this sketch is empty. - * @return empty flag - */ - public boolean isEmpty() { - return n_ == 0; - } - - /** - * Returns true if this sketch is in estimation mode. - * @return estimation mode flag - */ - public boolean isEstimationMode() { - return numLevels_ > 1; - } - - /** - * Returns serialized sketch in a compact byte array form. - * @return serialized sketch in a compact byte array form. - */ - public abstract byte[] toByteArray(); - - - @Override - public String toString() { - return toString(false, false); - } - - /** - * Returns a summary of the sketch as a string. - * @param withLevels if true include information about levels - * @param withData if true include sketch data - * @return string representation of sketch summary - */ - public abstract String toString(final boolean withLevels, final boolean withData); - - // Restricted Methods - - /** - * Checks the validity of the given value k - * @param k must be greater than 7 and less than 65536. 
- */ - private static void checkK(final int k) { - if (k < MIN_K || k > MAX_K) { - throw new SketchesArgumentException( - "K must be >= " + MIN_K + " and <= " + MAX_K + ": " + k); - } - } - - /** - * Finds the first level starting with level 0 that exceeds its nominal capacity - * @return level to compact - */ - int findLevelToCompact() { // - int level = 0; - while (true) { - assert level < numLevels_; - final int pop = levels_[level + 1] - levels_[level]; - final int cap = KllHelper.levelCapacity(k_, numLevels_, level, m_); - if (pop >= cap) { - return level; - } - level++; - } - } - - int currentLevelSize(final int level) { - if (level >= numLevels_) { return 0; } - return levels_[level + 1] - levels_[level]; - } - - int getNumRetainedAboveLevelZero() { - if (numLevels_ == 1) { return 0; } - return levels_[numLevels_] - levels_[1]; - } - - // for testing - - int[] getLevels() { - return levels_; - } - - int getNumLevels() { - return numLevels_; - } - -} - diff --git a/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java new file mode 100644 index 000000000..0c89ff3a6 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllDirectDoublesSketch.java @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static java.lang.Math.max; +import static java.lang.Math.min; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR; +import static org.apache.datasketches.kll.KllPreambleUtil.DOUBLES_SKETCH_BIT_MASK; +import static org.apache.datasketches.kll.KllPreambleUtil.PREAMBLE_INTS_FULL; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_UPDATABLE; +import static org.apache.datasketches.kll.KllPreambleUtil.UPDATABLE_BIT_MASK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryFamilyID; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryFlags; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryM; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryMinK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryN; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryNumLevels; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryPreInts; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemorySerVer; +import static org.apache.datasketches.kll.KllSketch.Error.MUST_NOT_CALL; +import static org.apache.datasketches.kll.KllSketch.Error.SRC_MUST_BE_DOUBLE; +import static org.apache.datasketches.kll.KllSketch.Error.TGT_IS_IMMUTABLE; +import static org.apache.datasketches.kll.KllSketch.Error.kllSketchThrow; + +import org.apache.datasketches.Family; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.MemoryRequestServer; +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class implements an off-heap doubles KllSketch via a WritableMemory instance of the sketch. + * + *

<p>Please refer to the documentation in the package-info:<br>
+ * {@link org.apache.datasketches.kll}</p>
+ * + * @author Lee Rhodes, Kevin Lang + */ +public final class KllDirectDoublesSketch extends KllDirectSketch { + + /** + * The actual constructor. + * @param wmem the current WritableMemory + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @param memVal the KllMemoryValidate object + */ + private KllDirectDoublesSketch(final WritableMemory wmem, final MemoryRequestServer memReqSvr, + final KllMemoryValidate memVal) { + super(SketchType.DOUBLES_SKETCH, wmem, memReqSvr, memVal); + } + + /** + * Heapifies the given Memory object and returns a KllDoublesSketch + * @param mem the given Memory object. + * @return a KllDoublesSketch + */ + public static KllDoublesSketch heapify(final Memory mem) { + return KllDoublesSketch.heapify(mem); + } + + /** + * Create a new instance of this sketch using the default m of 8. + * @param k parameter that controls size of the sketch and accuracy of estimates + * @param dstMem the given destination WritableMemory object for use by the sketch + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @return a new instance of this sketch + */ + public static KllDirectDoublesSketch newInstance(final int k, final WritableMemory dstMem, + final MemoryRequestServer memReqSvr) { + return newInstance(k, KllSketch.DEFAULT_M, dstMem, memReqSvr); + } + + /** + * Create a new instance of this sketch. + * @param k parameter that controls size of the sketch and accuracy of estimates + * @param m parameter that controls the minimum level width in items.
+ * @param dstMem the given destination WritableMemory object for use by the sketch + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @return a new instance of this sketch + */ + static KllDirectDoublesSketch newInstance(final int k, final int m, final WritableMemory dstMem, + final MemoryRequestServer memReqSvr) { + setMemoryPreInts(dstMem, PREAMBLE_INTS_FULL); + setMemorySerVer(dstMem, SERIAL_VERSION_UPDATABLE); + setMemoryFamilyID(dstMem, Family.KLL.getID()); + setMemoryFlags(dstMem, DOUBLES_SKETCH_BIT_MASK | UPDATABLE_BIT_MASK); + setMemoryK(dstMem, k); + setMemoryM(dstMem, m); + setMemoryN(dstMem, 0); + setMemoryMinK(dstMem, k); + setMemoryNumLevels(dstMem, 1); + int offset = DATA_START_ADR; + dstMem.putIntArray(offset, new int[] {k, k}, 0, 2); + offset += 2 * Integer.BYTES; + dstMem.putDoubleArray(offset, new double[] {Double.NaN, Double.NaN}, 0, 2); + offset += 2 * Double.BYTES; + dstMem.putDoubleArray(offset, new double[k], 0, k); + final KllMemoryValidate memVal = new KllMemoryValidate(dstMem); + return new KllDirectDoublesSketch(dstMem, memReqSvr, memVal); + } + + /** + * Wrap a sketch around the given source Memory containing sketch data that originated from + * this sketch. + * @param srcMem a WritableMemory that contains data. + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @return instance of this sketch + */ + public static KllDirectDoublesSketch writableWrap(final WritableMemory srcMem, final MemoryRequestServer memReqSvr) { + final KllMemoryValidate memVal = new KllMemoryValidate(srcMem); + return new KllDirectDoublesSketch(srcMem, memReqSvr, memVal); + } + + /** + * Returns an approximation to the Cumulative Distribution Function (CDF), which is the + * cumulative analog of the PMF, of the input stream given a set of splitPoint (values). + * + *

The resulting approximations have a probabilistic guarantee that can be obtained from the + * getNormalizedRankError(false) function. + * + *

<p>If the sketch is empty this returns null.</p>
+ * + * @param splitPoints an array of m unique, monotonically increasing double values + * that divide the real number line into m+1 consecutive disjoint intervals. + * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and + * exclusive of the right splitPoint, with the exception that the last interval will include + * the maximum value. + * It is not necessary to include either the min or max values in these split points. + * + * @return an array of m+1 double values on the interval [0.0, 1.0), + * which are a consecutive approximation to the CDF of the input stream given the splitPoints. + * The value at array position j of the returned CDF array is the sum of the returned values + * in positions 0 through j of the returned PMF array. + */ + public double[] getCDF(final double[] splitPoints) { + return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true); + } + + /** + * Returns the max value of the stream. + * If the sketch is empty this returns NaN. + * + * @return the max value of the stream + */ + public double getMaxValue() { return getMaxDoubleValue(); } + + /** + * Returns the min value of the stream. + * If the sketch is empty this returns NaN. + * + * @return the min value of the stream + */ + public double getMinValue() { return getMinDoubleValue(); } + + /** + * Returns an approximation to the Probability Mass Function (PMF) of the input stream + * given a set of splitPoints (values). + * + *

The resulting approximations have a probabilistic guarantee that can be obtained from the + * getNormalizedRankError(true) function. + * + *

<p>If the sketch is empty this returns null.</p>
+ * + * @param splitPoints an array of m unique, monotonically increasing double values + * that divide the real number line into m+1 consecutive disjoint intervals. + * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and + * exclusive of the right splitPoint, with the exception that the last interval will include + * the maximum value. + * It is not necessary to include either the min or max values in these split points. + * + * @return an array of m+1 doubles on the interval [0.0, 1.0), + * each of which is an approximation to the fraction of the total input stream values + * (the mass) that fall into one of those intervals. + * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right + * splitPoint, with the exception that the last interval will include maximum value. + */ + public double[] getPMF(final double[] splitPoints) { + return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false); + } + + /** + * Returns an approximation to the value of the data item + * that would be preceded by the given fraction of a hypothetical sorted + * version of the input stream so far. + * + *

We note that this method has a fairly large overhead (microseconds instead of nanoseconds) + * so it should not be called multiple times to get different quantiles from the same + * sketch. Instead use getQuantiles(), which pays the overhead only once. + * + *

If the sketch is empty this returns NaN. + * + * @param fraction the specified fractional position in the hypothetical sorted stream. + * These are also called normalized ranks or fractional ranks. + * If fraction = 0.0, the true minimum value of the stream is returned. + * If fraction = 1.0, the true maximum value of the stream is returned. + * + * @return the approximation to the value at the given fraction + */ + public double getQuantile(final double fraction) { + return KllDoublesHelper.getDoublesQuantile(this, fraction); + } + + /** + * Gets the lower bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. + * @param fraction the given normalized rank as a fraction + * @return the lower bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. + */ + public double getQuantileLowerBound(final double fraction) { + return getQuantile(max(0, fraction - KllHelper.getNormalizedRankError(getMinK(), false))); + } + + /** + * This is a more efficient multiple-query version of getQuantile(). + * + *

<p>This returns an array that could have been generated by using getQuantile() with many + * different fractional ranks, but would be very inefficient. + * This method incurs the internal set-up overhead once and obtains multiple quantile values in + * a single query. It is strongly recommended that this method be used instead of multiple calls + * to getQuantile(). + * + *

If the sketch is empty this returns null. + * + * @param fractions given array of fractional positions in the hypothetical sorted stream. + * These are also called normalized ranks or fractional ranks. + * These fractions must be in the interval [0.0, 1.0], inclusive. + * + * @return array of approximations to the given fractions in the same order as given fractions + * array. + */ + public double[] getQuantiles(final double[] fractions) { + return KllDoublesHelper.getDoublesQuantiles(this, fractions); + } + + /** + * This is also a more efficient multiple-query version of getQuantile() and allows the caller to + * specify the number of evenly spaced fractional ranks. + * + *

If the sketch is empty this returns null. + * + * @param numEvenlySpaced an integer that specifies the number of evenly spaced fractional ranks. + * This must be a positive integer greater than 0. A value of 1 will return the min value. + * A value of 2 will return the min and the max value. A value of 3 will return the min, + * the median and the max value, etc. + * + * @return array of approximations to the given fractions in the same order as given fractions + * array. + */ + public double[] getQuantiles(final int numEvenlySpaced) { + if (isEmpty()) { return null; } + return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced)); + } + + /** + * Gets the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. + * @param fraction the given normalized rank as a fraction + * @return the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. + */ + public double getQuantileUpperBound(final double fraction) { + return getQuantile(min(1.0, fraction + KllHelper.getNormalizedRankError(getMinK(), false))); + } + + /** + * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, + * inclusive. + * + *

The resulting approximation has a probabilistic guarantee that can be obtained from the + * getNormalizedRankError(false) function. + * + *

<p>If the sketch is empty this returns NaN.</p>
+ * + * @param value to be ranked + * @return an approximate rank of the given value + */ + public double getRank(final double value) { + return KllDoublesHelper.getDoubleRank(this, value); + } + + /** + * @return the iterator for this class + */ + public KllDoublesSketchIterator iterator() { + return new KllDoublesSketchIterator(getDoubleItemsArray(), getLevelsArray(), getNumLevels()); + } + + /** + * Merges another sketch into this one. + * @param other sketch to merge into this one + */ + public void merge(final KllSketch other) { + if (!other.isDoublesSketch()) { kllSketchThrow(SRC_MUST_BE_DOUBLE); } + KllDoublesHelper.mergeDoubleImpl(this, other); + } + + /** + * Updates this sketch with the given data item. + * + * @param value an item from a stream of items. NaNs are ignored. + */ + public void update(final double value) { + KllDoublesHelper.updateDouble(this, value); + } + + @Override + double[] getDoubleItemsArray() { + final int items = getItemsArrLengthItems(); + final double[] itemsArr = new double[items]; + itemsArrUpdatable.getDoubleArray(0, itemsArr, 0, items); + return itemsArr; + } + + @Override + double getDoubleItemsArrayAt(final int index) { + return itemsArrUpdatable.getDouble((long)index * Double.BYTES); + } + + @Override + float[] getFloatItemsArray() { kllSketchThrow(MUST_NOT_CALL); return null; } + + @Override + float getFloatItemsArrayAt(final int index) { kllSketchThrow(MUST_NOT_CALL); return Float.NaN; } + + @Override + double getMaxDoubleValue() { + return minMaxArrUpdatable.getDouble(Double.BYTES); + } + + @Override + float getMaxFloatValue() { kllSketchThrow(MUST_NOT_CALL); return Float.NaN; } + + @Override + double getMinDoubleValue() { + return minMaxArrUpdatable.getDouble(0); + } + + @Override + float getMinFloatValue() { kllSketchThrow(MUST_NOT_CALL); return Float.NaN; } + + @Override + void setDoubleItemsArray(final double[] doubleItems) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + 
itemsArrUpdatable.putDoubleArray(0, doubleItems, 0, doubleItems.length); + } + + @Override + void setDoubleItemsArrayAt(final int index, final double value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + itemsArrUpdatable.putDouble((long)index * Double.BYTES, value); + } + + @Override + void setFloatItemsArray(final float[] floatItems) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setFloatItemsArrayAt(final int index, final float value) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setMaxDoubleValue(final double value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + minMaxArrUpdatable.putDouble(Double.BYTES, value); + } + + @Override + void setMaxFloatValue(final float value) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setMinDoubleValue(final double value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + minMaxArrUpdatable.putDouble(0, value); + } + + @Override + void setMinFloatValue(final float value) { kllSketchThrow(MUST_NOT_CALL); } + +} diff --git a/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java new file mode 100644 index 000000000..54abe54dd --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllDirectFloatsSketch.java @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static java.lang.Math.max; +import static java.lang.Math.min; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR; +import static org.apache.datasketches.kll.KllPreambleUtil.PREAMBLE_INTS_FULL; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_UPDATABLE; +import static org.apache.datasketches.kll.KllPreambleUtil.UPDATABLE_BIT_MASK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryMinK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryFamilyID; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryFlags; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryM; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryN; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryNumLevels; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryPreInts; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemorySerVer; +import static org.apache.datasketches.kll.KllSketch.Error.MUST_NOT_CALL; +import static org.apache.datasketches.kll.KllSketch.Error.SRC_MUST_BE_FLOAT; +import static org.apache.datasketches.kll.KllSketch.Error.TGT_IS_IMMUTABLE; +import static org.apache.datasketches.kll.KllSketch.Error.kllSketchThrow; + +import org.apache.datasketches.Family; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.MemoryRequestServer; +import 
org.apache.datasketches.memory.WritableMemory; + +//Intentional extra blank line so the code lines up with KllDirectDoublesSketch +/** + * This class implements an off-heap floats KllSketch via a WritableMemory instance of the sketch. + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ * + * @author Lee Rhodes, Kevin Lang + */ +public final class KllDirectFloatsSketch extends KllDirectSketch { + + /** + * The actual constructor + * @param wmem the current WritableMemory + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @param memVal the KllMemoryValidate object + */ + private KllDirectFloatsSketch(final WritableMemory wmem, final MemoryRequestServer memReqSvr, + final KllMemoryValidate memVal) { + super(SketchType.FLOATS_SKETCH, wmem, memReqSvr, memVal); + } + + /** + * Heapifies the given Memory object and returns a KllFloatsSketch + * @param mem the given Memory object. + * @return a KllFloatsSketch + */ + public static KllFloatsSketch heapify(final Memory mem) { + return KllFloatsSketch.heapify(mem); + } + + /** + * Create a new instance of this sketch using the default m of 8. + * @param k parameter that controls size of the sketch and accuracy of estimates + * @param dstMem the given destination WritableMemory object for use by the sketch + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @return a new instance of this sketch + */ + public static KllDirectFloatsSketch newInstance(final int k, final WritableMemory dstMem, + final MemoryRequestServer memReqSvr) { + return newInstance(k, KllSketch.DEFAULT_M, dstMem, memReqSvr); + } + + /** + * Create a new instance of this sketch. + * @param k parameter that controls size of the sketch and accuracy of estimates + * @param m parameter that controls the minimum level width in items. 
+ * @param dstMem the given destination WritableMemory object for use by the sketch + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @return a new instance of this sketch + */ + static KllDirectFloatsSketch newInstance(final int k, final int m, final WritableMemory dstMem, + final MemoryRequestServer memReqSvr) { + setMemoryPreInts(dstMem, PREAMBLE_INTS_FULL); + setMemorySerVer(dstMem, SERIAL_VERSION_UPDATABLE); + setMemoryFamilyID(dstMem, Family.KLL.getID()); + setMemoryFlags(dstMem, UPDATABLE_BIT_MASK); + setMemoryK(dstMem, k); + setMemoryM(dstMem, m); + setMemoryN(dstMem, 0); + setMemoryMinK(dstMem, k); + setMemoryNumLevels(dstMem, 1); + int offset = DATA_START_ADR; + dstMem.putIntArray(offset, new int[] {k, k}, 0, 2); + offset += 2 * Integer.BYTES; + dstMem.putFloatArray(offset, new float[] {Float.NaN, Float.NaN}, 0, 2); + offset += 2 * Float.BYTES; + dstMem.putFloatArray(offset, new float[k], 0, k); + final KllMemoryValidate memVal = new KllMemoryValidate(dstMem); + return new KllDirectFloatsSketch(dstMem, memReqSvr, memVal); + } + + /** + * Wrap a sketch around the given source Memory containing sketch data that originated from + * this sketch. + * @param srcMem a WritableMemory that contains data. + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + * @return instance of this sketch + */ + public static KllDirectFloatsSketch writableWrap(final WritableMemory srcMem, final MemoryRequestServer memReqSvr) { + final KllMemoryValidate memVal = new KllMemoryValidate(srcMem); + return new KllDirectFloatsSketch(srcMem, memReqSvr, memVal); + } + + /** + * Returns an approximation to the Cumulative Distribution Function (CDF), which is the + * cumulative analog of the PMF, of the input stream given a set of splitPoint (values). + * + *

The resulting approximations have a probabilistic guarantee that can be obtained from the + * getNormalizedRankError(false) function. + * + *

If the sketch is empty this returns null.

+ * + * @param splitPoints an array of m unique, monotonically increasing float values + * that divide the real number line into m+1 consecutive disjoint intervals. + * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and + * exclusive of the right splitPoint, with the exception that the last interval will include + * the maximum value. + * It is not necessary to include either the min or max values in these split points. + * + * @return an array of m+1 double values on the interval [0.0, 1.0), + * which are a consecutive approximation to the CDF of the input stream given the splitPoints. + * The value at array position j of the returned CDF array is the sum of the returned values + * in positions 0 through j of the returned PMF array. + */ + public double[] getCDF(final float[] splitPoints) { + return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true); + } + + /** + * Returns the max value of the stream. + * If the sketch is empty this returns NaN. + * + * @return the max value of the stream + */ + public float getMaxValue() { return getMaxFloatValue(); } + + /** + * Returns the min value of the stream. + * If the sketch is empty this returns NaN. + * + * @return the min value of the stream + */ + public float getMinValue() { return getMinFloatValue(); } + + /** + * Returns an approximation to the Probability Mass Function (PMF) of the input stream + * given a set of splitPoints (values). + * + *

The resulting approximations have a probabilistic guarantee that can be obtained from the + * getNormalizedRankError(true) function. + * + *

If the sketch is empty this returns null.

+ * + * @param splitPoints an array of m unique, monotonically increasing float values + * that divide the real number line into m+1 consecutive disjoint intervals. + * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and + * exclusive of the right splitPoint, with the exception that the last interval will include + * the maximum value. + * It is not necessary to include either the min or max values in these split points. + * + * @return an array of m+1 doubles on the interval [0.0, 1.0), + * each of which is an approximation to the fraction of the total input stream values + * (the mass) that fall into one of those intervals. + * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right + * splitPoint, with the exception that the last interval will include maximum value. + */ + public double[] getPMF(final float[] splitPoints) { + return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false); + } + + /** + * Returns an approximation to the value of the data item + * that would be preceded by the given fraction of a hypothetical sorted + * version of the input stream so far. + * + *

We note that this method has a fairly large overhead (microseconds instead of nanoseconds) + * so it should not be called multiple times to get different quantiles from the same + * sketch. Instead use getQuantiles(), which pays the overhead only once. + * + *

If the sketch is empty this returns NaN. + * + * @param fraction the specified fractional position in the hypothetical sorted stream. + * These are also called normalized ranks or fractional ranks. + * If fraction = 0.0, the true minimum value of the stream is returned. + * If fraction = 1.0, the true maximum value of the stream is returned. + * + * @return the approximation to the value at the given fraction + */ + public float getQuantile(final double fraction) { + return KllFloatsHelper.getFloatsQuantile(this, fraction); + } + + /** + * Gets the lower bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. + * @param fraction the given normalized rank as a fraction + * @return the lower bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. + */ + public float getQuantileLowerBound(final double fraction) { + return getQuantile(max(0, fraction - KllHelper.getNormalizedRankError(getMinK(), false))); + } + + /** + * This is a more efficient multiple-query version of getQuantile(). + * + *

This returns an array that could have been generated by using getQuantile() with many + * different fractional ranks, but would be very inefficient. + * This method incurs the internal set-up overhead once and obtains multiple quantile values in + * a single query. It is strongly recommended that this method be used instead of multiple calls + * to getQuantile(). + * + *

If the sketch is empty this returns null. + * + * @param fractions given array of fractional positions in the hypothetical sorted stream. + * These are also called normalized ranks or fractional ranks. + * These fractions must be in the interval [0.0, 1.0], inclusive. + * + * @return array of approximations to the given fractions in the same order as given fractions + * array. + */ + public float[] getQuantiles(final double[] fractions) { + return KllFloatsHelper.getFloatsQuantiles(this, fractions); + } + + /** + * This is also a more efficient multiple-query version of getQuantile() and allows the caller to + * specify the number of evenly spaced fractional ranks. + * + *

If the sketch is empty this returns null. + * + * @param numEvenlySpaced an integer that specifies the number of evenly spaced fractional ranks. + * This must be a positive integer greater than 0. A value of 1 will return the min value. + * A value of 2 will return the min and the max value. A value of 3 will return the min, + * the median and the max value, etc. + * + * @return array of approximations to the given fractions in the same order as given fractions + * array. + */ + public float[] getQuantiles(final int numEvenlySpaced) { + if (isEmpty()) { return null; } + return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced)); + } + + /** + * Gets the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. + * @param fraction the given normalized rank as a fraction + * @return the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. + */ + public float getQuantileUpperBound(final double fraction) { + return getQuantile(min(1.0, fraction + KllHelper.getNormalizedRankError(getMinK(), false))); + } + + /** + * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, + * inclusive. + * + *

The resulting approximation has a probabilistic guarantee that can be obtained from the + * getNormalizedRankError(false) function. + * + *

If the sketch is empty this returns NaN.

+ * + * @param value to be ranked + * @return an approximate rank of the given value + */ + public double getRank(final float value) { + return KllFloatsHelper.getFloatRank(this, value); + } + + /** + * @return the iterator for this class + */ + public KllFloatsSketchIterator iterator() { + return new KllFloatsSketchIterator(getFloatItemsArray(), getLevelsArray(), getNumLevels()); + } + + /** + * Merges another sketch into this one. + * @param other sketch to merge into this one + */ + public void merge(final KllSketch other) { + if (!other.isFloatsSketch()) { kllSketchThrow(SRC_MUST_BE_FLOAT); } + KllFloatsHelper.mergeFloatImpl(this, other); + } + + /** + * Updates this sketch with the given data item. + * + * @param value an item from a stream of items. NaNs are ignored. + */ + public void update(final float value) { + KllFloatsHelper.updateFloat(this, value); + } + + @Override + double[] getDoubleItemsArray() { kllSketchThrow(MUST_NOT_CALL); return null; } + + @Override + double getDoubleItemsArrayAt(final int index) { kllSketchThrow(MUST_NOT_CALL); return Double.NaN; } + + @Override + float[] getFloatItemsArray() { + final int items = getItemsArrLengthItems(); + final float[] itemsArr = new float[items]; + itemsArrUpdatable.getFloatArray(0, itemsArr, 0, items); + return itemsArr; + } + + @Override + float getFloatItemsArrayAt(final int index) { + return itemsArrUpdatable.getFloat((long)index * Float.BYTES); + } + + @Override + double getMaxDoubleValue() { kllSketchThrow(MUST_NOT_CALL); return Double.NaN; } + + @Override + float getMaxFloatValue() { + return minMaxArrUpdatable.getFloat(Float.BYTES); + } + + @Override + double getMinDoubleValue() { kllSketchThrow(MUST_NOT_CALL); return Double.NaN; } + + @Override + float getMinFloatValue() { + return minMaxArrUpdatable.getFloat(0); + } + + @Override + void setDoubleItemsArray(final double[] doubleItems) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setDoubleItemsArrayAt(final int index, final double 
value) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setFloatItemsArray(final float[] floatItems) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + itemsArrUpdatable.putFloatArray(0, floatItems, 0, floatItems.length); + } + + @Override + void setFloatItemsArrayAt(final int index, final float value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + itemsArrUpdatable.putFloat((long)index * Float.BYTES, value); + } + + @Override + void setMaxDoubleValue(final double value) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setMaxFloatValue(final float value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + minMaxArrUpdatable.putFloat(Float.BYTES, value); + } + + @Override + void setMinDoubleValue(final double value) { kllSketchThrow(MUST_NOT_CALL); } + + @Override + void setMinFloatValue(final float value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + minMaxArrUpdatable.putFloat(0, value); + } + +} diff --git a/src/main/java/org/apache/datasketches/kll/KllDirectSketch.java b/src/main/java/org/apache/datasketches/kll/KllDirectSketch.java new file mode 100644 index 000000000..959d4bd3d --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllDirectSketch.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryMinK; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryK; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryLevelZeroSortedFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryM; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryN; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryNumLevels; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryMinK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryLevelZeroSortedFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryN; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryNumLevels; +import static org.apache.datasketches.kll.KllSketch.Error.TGT_IS_IMMUTABLE; +import static org.apache.datasketches.kll.KllSketch.Error.kllSketchThrow; + +import org.apache.datasketches.memory.MemoryRequestServer; +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class implements all the methods for the Direct (off-heap) sketches that are independent + * of the sketch type (float or double). + */ +abstract class KllDirectSketch extends KllSketch { + final boolean updatableMemory; + WritableMemory levelsArrUpdatable; + WritableMemory minMaxArrUpdatable; + WritableMemory itemsArrUpdatable; + + /** + * For the direct sketches it is important that the methods implemented here are designed to + * work dynamically as the sketch grows off-heap. 
+ * @param sketchType either DOUBLES_SKETCH or FLOATS_SKETCH + * @param wmem the current WritableMemory + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory + */ + KllDirectSketch(final SketchType sketchType, final WritableMemory wmem, final MemoryRequestServer memReqSvr, + final KllMemoryValidate memVal) { + super(sketchType, wmem, memReqSvr); + updatableMemory = memVal.updatableMemory && memReqSvr != null; + levelsArrUpdatable = memVal.levelsArrUpdatable; + minMaxArrUpdatable = memVal.minMaxArrUpdatable; + itemsArrUpdatable = memVal.itemsArrUpdatable; + } + + @Override + public int getK() { + return getMemoryK(wmem); + } + + @Override + public long getN() { + return getMemoryN(wmem); + } + + @Override + public void reset() { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + final int k = getK(); + setN(0); + setMinK(k); + setNumLevels(1); + setLevelsArray(new int[] {k, k}); + setLevelZeroSorted(false); + final int newLevelsArrLen = 2 * Integer.BYTES; + final int newItemsArrLen = k; + KllHelper.memorySpaceMgmt(this, newLevelsArrLen, newItemsArrLen); + levelsArrUpdatable.putIntArray(0L, new int[] {k, k}, 0, 2); + if (sketchType == SketchType.DOUBLES_SKETCH) { + minMaxArrUpdatable.putDoubleArray(0L, new double[] {Double.NaN, Double.NaN}, 0, 2); + itemsArrUpdatable.putDoubleArray(0L, new double[k], 0, k); + } else { + minMaxArrUpdatable.putFloatArray(0L, new float[] {Float.NaN, Float.NaN}, 0, 2); + itemsArrUpdatable.putFloatArray(0L, new float[k], 0, k); + } + } + + @Override + public byte[] toUpdatableByteArray() { + final int bytes = (int) wmem.getCapacity(); + final byte[] byteArr = new byte[bytes]; + wmem.getByteArray(0, byteArr, 0, bytes); + return byteArr; + } + + int getItemsArrLengthItems() { + return getLevelsArray()[getNumLevels()]; + } + + @Override + int[] getLevelsArray() { + final int numInts = getNumLevels() + 1; + final int[] myLevelsArr = new int[numInts]; + levelsArrUpdatable.getIntArray(0, myLevelsArr, 
0, numInts); + return myLevelsArr; + } + + @Override + int getLevelsArrayAt(final int index) { + return levelsArrUpdatable.getInt((long)index * Integer.BYTES); + } + + @Override + int getM() { + return getMemoryM(wmem); + } + + @Override + int getMinK() { + return getMemoryMinK(wmem); + } + + @Override + int getNumLevels() { + return getMemoryNumLevels(wmem); + } + + @Override + void incN() { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + long n = getMemoryN(wmem); + setMemoryN(wmem, ++n); + } + + @Override + void incNumLevels() { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + int numLevels = getMemoryNumLevels(wmem); + setMemoryNumLevels(wmem, ++numLevels); + } + + @Override + boolean isLevelZeroSorted() { + return getMemoryLevelZeroSortedFlag(wmem); + } + + @Override + void setItemsArrayUpdatable(final WritableMemory itemsMem) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + itemsArrUpdatable = itemsMem; + } + + @Override + void setLevelsArray(final int[] levelsArr) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + levelsArrUpdatable.putIntArray(0, levelsArr, 0, levelsArr.length); + } + + @Override + void setLevelsArrayAt(final int index, final int value) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + levelsArrUpdatable.putInt((long)index * Integer.BYTES, value); + } + + @Override + void setLevelsArrayAtMinusEq(final int index, final int minusEq) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + final int offset = index * Integer.BYTES; + final int curV = levelsArrUpdatable.getInt(offset); + levelsArrUpdatable.putInt(offset, curV - minusEq); + } + + @Override + void setLevelsArrayAtPlusEq(final int index, final int plusEq) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + final int offset = index * Integer.BYTES; + final int curV = levelsArrUpdatable.getInt(offset); + levelsArrUpdatable.putInt(offset, curV + plusEq); + } + + @Override + void 
setLevelsArrayUpdatable(final WritableMemory levelsMem) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + levelsArrUpdatable = levelsMem; + } + + @Override + void setLevelZeroSorted(final boolean sorted) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + setMemoryLevelZeroSortedFlag(wmem, sorted); + } + + @Override + void setMinK(final int minK) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + setMemoryMinK(wmem, minK); + } + + @Override + void setMinMaxArrayUpdatable(final WritableMemory minMaxMem) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + minMaxArrUpdatable = minMaxMem; + } + + @Override + void setN(final long n) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + setMemoryN(wmem, n); + } + + @Override + void setNumLevels(final int numLevels) { + if (!updatableMemory) { kllSketchThrow(TGT_IS_IMMUTABLE); } + setMemoryNumLevels(wmem, numLevels); + } + +} diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java index 9b738553e..25a71f699 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java @@ -19,6 +19,8 @@ package org.apache.datasketches.kll; +import static java.lang.Math.max; +import static java.lang.Math.min; import static org.apache.datasketches.Util.isEven; import static org.apache.datasketches.Util.isOdd; @@ -32,23 +34,193 @@ * @author Kevin Lang * @author Alexander Saydakov */ -class KllDoublesHelper { +final class KllDoublesHelper { + + static double getDoubleRank(final KllSketch mine, final double value) { + if (mine.isEmpty()) { return Double.NaN; } + int level = 0; + int weight = 1; + long total = 0; + final double[] myDoubleItemsArr = mine.getDoubleItemsArray(); + final int[] myLevelsArr = mine.getLevelsArray(); + while (level < mine.getNumLevels()) { + final int fromIndex = 
myLevelsArr[level]; + final int toIndex = myLevelsArr[level + 1]; // exclusive + for (int i = fromIndex; i < toIndex; i++) { + if (myDoubleItemsArr[i] < value) { + total += weight; + } else if (level > 0 || mine.isLevelZeroSorted()) { + break; // levels above 0 are sorted, no point comparing further + } + } + level++; + weight *= 2; + } + return (double) total / mine.getN(); + } - /** - * Checks the sequential validity of the given array of double values. - * They must be unique, monotonically increasing and not NaN. - * @param values the given array of values - */ - static void validateDoubleValues(final double[] values) { - for (int i = 0; i < values.length; i++) { - if (!Double.isFinite(values[i])) { - throw new SketchesArgumentException("Values must be finite"); + static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints, final boolean isCdf) { + if (mine.isEmpty()) { return null; } + validateDoubleValues(splitPoints); + final double[] buckets = new double[splitPoints.length + 1]; + final int myNumLevels = mine.getNumLevels(); + final int[] myLevelsArr = mine.getLevelsArray(); + int level = 0; + int weight = 1; + while (level < myNumLevels) { + final int fromIndex = myLevelsArr[level]; + final int toIndex = myLevelsArr[level + 1]; // exclusive + if (level == 0 && !mine.isLevelZeroSorted()) { + KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets); + } else { + KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets); } - if (i < values.length - 1 && values[i] >= values[i + 1]) { - throw new SketchesArgumentException( - "Values must be unique and monotonically increasing"); + level++; + weight *= 2; + } + // normalize and, if CDF, convert to cumulative + if (isCdf) { + double subtotal = 0; + for (int i = 0; i < buckets.length; i++) { + subtotal += buckets[i]; + buckets[i] = subtotal / mine.getN(); + } + } else { + for (int i = 0; i 
< buckets.length; i++) { + buckets[i] /= mine.getN(); + } + } + return buckets; + } + + static double getDoublesQuantile(final KllSketch mine, final double fraction) { + if (mine.isEmpty()) { return Double.NaN; } + if (fraction < 0.0 || fraction > 1.0) { + throw new SketchesArgumentException("Fraction cannot be less than zero nor greater than 1.0"); + } + //These two assumptions make KLL compatible with the previous classic Quantiles Sketch + if (fraction == 0.0) { return mine.getMinDoubleValue(); } + if (fraction == 1.0) { return mine.getMaxDoubleValue(); } + final KllDoublesQuantileCalculator quant = KllDoublesHelper.getDoublesQuantileCalculator(mine); + return quant.getQuantile(fraction); + } + + static double[] getDoublesQuantiles(final KllSketch mine, final double[] fractions) { + if (mine.isEmpty()) { return null; } + KllDoublesQuantileCalculator quant = null; + final double[] quantiles = new double[fractions.length]; + for (int i = 0; i < fractions.length; i++) { + final double fraction = fractions[i]; + if (fraction < 0.0 || fraction > 1.0) { + throw new SketchesArgumentException("Fraction cannot be less than zero nor greater than 1.0"); + } + if (fraction == 0.0) { quantiles[i] = mine.getMinDoubleValue(); } + else if (fraction == 1.0) { quantiles[i] = mine.getMaxDoubleValue(); } + else { + if (quant == null) { + quant = KllDoublesHelper.getDoublesQuantileCalculator(mine); + } + quantiles[i] = quant.getQuantile(fraction); } } + return quantiles; + } + + static void mergeDoubleImpl(final KllSketch mine, final KllSketch other) { + if (other.isEmpty()) { return; } + final long finalN = mine.getN() + other.getN(); + //update this sketch with level0 items from the other sketch + final double[] otherDoubleItemsArr = other.getDoubleItemsArray(); + final int otherNumLevels = other.getNumLevels(); + final int[] otherLevelsArr = other.getLevelsArray(); + for (int i = otherLevelsArr[0]; i < otherLevelsArr[1]; i++) { + KllDoublesHelper.updateDouble(mine, 
otherDoubleItemsArr[i]); + } + // after the level 0 update, we capture the key mutable variables + final double myMin = mine.getMinDoubleValue(); + final double myMax = mine.getMaxDoubleValue(); + final int myMinK = mine.getMinK(); + + final int myCurNumLevels = mine.getNumLevels(); + final int[] myCurLevelsArr = mine.getLevelsArray(); + final double[] myCurDoubleItemsArr = mine.getDoubleItemsArray(); + + final int myNewNumLevels; + final int[] myNewLevelsArr; + final double[] myNewDoubleItemsArr; + + if (otherNumLevels > 1) { //now merge other levels if they exist + final int tmpSpaceNeeded = mine.getNumRetained() + + KllHelper.getNumRetainedAboveLevelZero(otherNumLevels, otherLevelsArr); + final double[] workbuf = new double[tmpSpaceNeeded]; + final int ub = KllHelper.ubOnNumLevels(finalN); + final int[] worklevels = new int[ub + 2]; // ub+1 does not work + final int[] outlevels = new int[ub + 2]; + + final int provisionalNumLevels = max(myCurNumLevels, otherNumLevels); + + populateDoubleWorkArrays(mine, other, workbuf, worklevels, provisionalNumLevels); + + // notice that workbuf is being used as both the input and output + final int[] result = generalDoublesCompress(mine.getK(), mine.getM(), provisionalNumLevels, + workbuf, worklevels, workbuf, outlevels, mine.isLevelZeroSorted(), KllSketch.random); + final int targetItemCount = result[1]; //was finalCapacity. Max size given k, m, numLevels + final int curItemCount = result[2]; //was finalPop + + // now we need to finalize the results for the "self" sketch + + //THE NEW NUM LEVELS + myNewNumLevels = result[0]; //was finalNumLevels + assert myNewNumLevels <= ub; // ub may be much bigger + + // THE NEW ITEMS ARRAY (was newbuf) + myNewDoubleItemsArr = (targetItemCount == myCurDoubleItemsArr.length) + ? 
myCurDoubleItemsArr + : new double[targetItemCount]; + final int freeSpaceAtBottom = targetItemCount - curItemCount; + //shift the new items array + System.arraycopy(workbuf, outlevels[0], myNewDoubleItemsArr, freeSpaceAtBottom, curItemCount); + final int theShift = freeSpaceAtBottom - outlevels[0]; + + //calculate the new levels array length + final int finalLevelsArrLen; + if (myCurLevelsArr.length < myNewNumLevels + 1) { finalLevelsArrLen = myNewNumLevels + 1; } + else { finalLevelsArrLen = myCurLevelsArr.length; } + + //THE NEW LEVELS ARRAY + myNewLevelsArr = new int[finalLevelsArrLen]; + for (int lvl = 0; lvl < myNewNumLevels + 1; lvl++) { // includes the "extra" index + myNewLevelsArr[lvl] = outlevels[lvl] + theShift; + } + + //MEMORY SPACE MANAGEMENT + if (mine.updatablMemory) { + mine.wmem = KllHelper.memorySpaceMgmt(mine, myNewLevelsArr.length, myNewDoubleItemsArr.length); + } + + } else { + myNewNumLevels = myCurNumLevels; + myNewLevelsArr = myCurLevelsArr; + myNewDoubleItemsArr = myCurDoubleItemsArr; + } + + //Update Preamble: + mine.setN(finalN); + if (other.isEstimationMode()) { //otherwise the merge brings over exact items. + mine.setMinK(min(myMinK, other.getMinK())); + } + + //Update min, max values + final double otherMin = other.getMinDoubleValue(); + final double otherMax = other.getMaxDoubleValue(); + mine.setMinDoubleValue(resolveDoubleMinValue(myMin, otherMin)); + mine.setMaxDoubleValue(resolveDoubleMaxValue(myMax, otherMax)); + + //Update numLevels, levelsArray, items + mine.setNumLevels(myNewNumLevels); + mine.setLevelsArray(myNewLevelsArr); + mine.setDoubleItemsArray(myNewDoubleItemsArr); + assert KllHelper.sumTheSampleWeights(mine.getNumLevels(), mine.getLevelsArray()) == mine.getN(); } static void mergeSortedDoubleArrays( @@ -82,6 +254,62 @@ static void mergeSortedDoubleArrays( assert b == limB; } + /** + * Validation Method. 
This must be modified to test validation + * @param buf the items array + * @param start data start + * @param length items length + * @param random instance of Random + */ + static void randomlyHalveDownDoubles(final double[] buf, final int start, final int length, final Random random) { + assert isEven(length); + final int half_length = length / 2; + final int offset = random.nextInt(2); // disable for validation + //final int offset = deterministicOffset(); // enable for validation + int j = start + offset; + for (int i = start; i < (start + half_length); i++) { + buf[i] = buf[j]; + j += 2; + } + } + + /** + * Validation Method. This must be modified to test validation + * @param buf the items array + * @param start data start + * @param length items length + * @param random instance of Random + */ + static void randomlyHalveUpDoubles(final double[] buf, final int start, final int length, final Random random) { + assert isEven(length); + final int half_length = length / 2; + final int offset = random.nextInt(2); // disable for validation + //final int offset = deterministicOffset(); // enable for validation + int j = (start + length) - 1 - offset; + for (int i = (start + length) - 1; i >= (start + half_length); i--) { + buf[i] = buf[j]; + j -= 2; + } + } + + static void updateDouble(final KllSketch mine, final double value) { + if (Double.isNaN(value)) { return; } + if (mine.isEmpty()) { + mine.setMinDoubleValue(value); + mine.setMaxDoubleValue(value); + } else { + if (value < mine.getMinDoubleValue()) { mine.setMinDoubleValue(value); } + if (value > mine.getMaxDoubleValue()) { mine.setMaxDoubleValue(value); } + } + if (mine.getLevelsArrayAt(0) == 0) { KllHelper.compressWhileUpdatingSketch(mine); } + mine.incN(); + mine.setLevelZeroSorted(false); + final int nextPos = mine.getLevelsArrayAt(0) - 1; + assert mine.getLevelsArrayAt(0) >= 0; + mine.setLevelsArrayAt(0, nextPos); + mine.setDoubleItemsArrayAt(nextPos, value); + } + /** * Compression algorithm used to 
merge higher levels. *

Here is what we do for each level:

@@ -112,9 +340,10 @@ static void mergeSortedDoubleArrays( * @param outBuf the same array as inBuf * @param outLevels the same size as inLevels * @param isLevelZeroSorted true if this.level 0 is sorted + * @param random instance of java.util.Random * @return int array of: {numLevels, targetItemCount, currentItemCount) */ - static int[] generalDoublesCompress( + private static int[] generalDoublesCompress( final int k, final int m, final int numLevelsIn, @@ -192,52 +421,130 @@ static int[] generalDoublesCompress( numLevels++; targetItemCount += KllHelper.levelCapacity(k, numLevels, 0, m); } - } // end of code for compacting a level // determine whether we have processed all levels yet (including any new levels that we created) - if (curLevel == (numLevels - 1)) { doneYet = true; } - } // end of loop over levels assert (outLevels[numLevels] - outLevels[0]) == currentItemCount; - return new int[] {numLevels, targetItemCount, currentItemCount}; } - //This must be modified for validation - static void randomlyHalveDownDoubles(final double[] buf, final int start, final int length, final Random random) { - assert isEven(length); - final int half_length = length / 2; - final int offset = random.nextInt(2); // disable for validation - //final int offset = deterministicOffset(); // enable for validation - int j = start + offset; - for (int i = start; i < (start + half_length); i++) { - buf[i] = buf[j]; - j += 2; + private static KllDoublesQuantileCalculator getDoublesQuantileCalculator(final KllSketch mine) { + final int[] myLevelsArr = mine.getLevelsArray(); + final double[] myDoubleItemsArr = mine.getDoubleItemsArray(); + if (!mine.isLevelZeroSorted()) { + Arrays.sort(mine.getDoubleItemsArray(), myLevelsArr[0], myLevelsArr[1]); + mine.setLevelZeroSorted(true); } + return new KllDoublesQuantileCalculator(myDoubleItemsArr, myLevelsArr, mine.getNumLevels(), mine.getN()); } - //This must be modified for validation - static void randomlyHalveUpDoubles(final double[] buf, final 
int start, final int length, final Random random) { - assert isEven(length); - final int half_length = length / 2; - final int offset = random.nextInt(2); // disable for validation - //final int offset = deterministicOffset(); // enable for validation - int j = (start + length) - 1 - offset; - for (int i = (start + length) - 1; i >= (start + half_length); i--) { - buf[i] = buf[j]; - j -= 2; + private static void incrementDoublesBucketsSortedLevel( + final KllSketch mine, final int fromIndex, final int toIndex, + final int weight, final double[] splitPoints, final double[] buckets) { + final double[] myDoubleItemsArr = mine.getDoubleItemsArray(); + int i = fromIndex; + int j = 0; + while (i < toIndex && j < splitPoints.length) { + if (myDoubleItemsArr[i] < splitPoints[j]) { + buckets[j] += weight; // this sample goes into this bucket + i++; // move on to next sample and see whether it also goes into this bucket + } else { + j++; // no more samples for this bucket + } + } + // now either i == toIndex (we are out of samples), or + // j == numSplitPoints (we are out of buckets, but there are more samples remaining) + // we only need to do something in the latter case + if (j == splitPoints.length) { + buckets[j] += weight * (toIndex - i); + } + } + + private static void incrementDoublesBucketsUnsortedLevel( + final KllSketch mine, final int fromIndex, final int toIndex, + final int weight, final double[] splitPoints, final double[] buckets) { + final double[] myDoubleItemsArr = mine.getDoubleItemsArray(); + for (int i = fromIndex; i < toIndex; i++) { + int j; + for (j = 0; j < splitPoints.length; j++) { + if (myDoubleItemsArr[i] < splitPoints[j]) { + break; + } + } + buckets[j] += weight; + } + } + + private static void populateDoubleWorkArrays(final KllSketch mine, final KllSketch other, final double[] workbuf, + final int[] worklevels, final int provisionalNumLevels) { + worklevels[0] = 0; + final int[] myLevelsArr = mine.getLevelsArray(); + final int[] 
otherLevelsArr = other.getLevelsArray(); + final double[] myDoubleItemsArr = mine.getDoubleItemsArray(); + final double[] otherDoubleItemsArr = other.getDoubleItemsArray(); + + // Note: the level zero data from "other" was already inserted into "self" + final int selfPopZero = KllHelper.currentLevelSize(0, mine.getNumLevels(),myLevelsArr); + System.arraycopy(myDoubleItemsArr, myLevelsArr[0], workbuf, worklevels[0], selfPopZero); + worklevels[1] = worklevels[0] + selfPopZero; + + for (int lvl = 1; lvl < provisionalNumLevels; lvl++) { + final int selfPop = KllHelper.currentLevelSize(lvl, mine.getNumLevels(), myLevelsArr); + final int otherPop = KllHelper.currentLevelSize(lvl, other.getNumLevels(), otherLevelsArr); + worklevels[lvl + 1] = worklevels[lvl] + selfPop + otherPop; + + if (selfPop > 0 && otherPop == 0) { + System.arraycopy(myDoubleItemsArr, myLevelsArr[lvl], workbuf, worklevels[lvl], selfPop); + } else if (selfPop == 0 && otherPop > 0) { + System.arraycopy(otherDoubleItemsArr, otherLevelsArr[lvl], workbuf, worklevels[lvl], otherPop); + } else if (selfPop > 0 && otherPop > 0) { + mergeSortedDoubleArrays(myDoubleItemsArr, myLevelsArr[lvl], selfPop, otherDoubleItemsArr, + otherLevelsArr[lvl], otherPop, workbuf, worklevels[lvl]); + } + } + } + + private static double resolveDoubleMaxValue(final double myMax, final double otherMax) { + if (Double.isNaN(myMax) && Double.isNaN(otherMax)) { return Double.NaN; } + if (Double.isNaN(myMax)) { return otherMax; } + if (Double.isNaN(otherMax)) { return myMax; } + return max(myMax, otherMax); + } + + private static double resolveDoubleMinValue(final double myMin, final double otherMin) { + if (Double.isNaN(myMin) && Double.isNaN(otherMin)) { return Double.NaN; } + if (Double.isNaN(myMin)) { return otherMin; } + if (Double.isNaN(otherMin)) { return myMin; } + return min(myMin, otherMin); + } + + /** + * Validation Method. + * Checks the sequential validity of the given array of double values. 
+ * They must be unique, monotonically increasing and finite (neither NaN nor infinite). + * @param values the given array of values + */ + private static void validateDoubleValues(final double[] values) { + for (int i = 0; i < values.length; i++) { + if (!Double.isFinite(values[i])) { + throw new SketchesArgumentException("Values must be finite"); + } + if (i < values.length - 1 && values[i] >= values[i + 1]) { + throw new SketchesArgumentException( + "Values must be unique and monotonically increasing"); + } } } /* + * Validation Method. * The following must be enabled for use with the KllDoublesValidationTest, - * which is only enabled for manual testing. In addition, the two methods + * which is only enabled for manual testing. In addition, the two Validation Methods * above need to be modified as commented. */ - // static int nextOffset = 0; // // private static int deterministicOffset() { @@ -247,4 +554,3 @@ static void randomlyHalveUpDoubles(final double[] buf, final int start, final in // } } - diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesQuantileCalculator.java b/src/main/java/org/apache/datasketches/kll/KllDoublesQuantileCalculator.java index ba269836f..7870002f1 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesQuantileCalculator.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesQuantileCalculator.java @@ -58,41 +58,6 @@ final class KllDoublesQuantileCalculator { numLevels_ = 0; //not used by test } - double getQuantile(final double phi) { //phi is normalized rank [0,1]. 
- final long pos = QuantilesHelper.posOfPhi(phi, n_); - return approximatelyAnswerPositonalQuery(pos); - } - - private double approximatelyAnswerPositonalQuery(final long pos) { - assert pos >= 0; - assert pos < n_; - final int index = QuantilesHelper.chunkContainingPos(weights_, pos); - return items_[index]; - } - - private void populateFromSketch(final double[] srcItems, final int[] srcLevels, - final int numLevels, final int numItems) { - final int offset = srcLevels[0]; - System.arraycopy(srcItems, offset, items_, 0, numItems); - int srcLevel = 0; - int dstLevel = 0; - long weight = 1; - while (srcLevel < numLevels) { - final int fromIndex = srcLevels[srcLevel] - offset; - final int toIndex = srcLevels[srcLevel + 1] - offset; // exclusive - if (fromIndex < toIndex) { // if equal, skip empty level - Arrays.fill(weights_, fromIndex, toIndex, weight); - levels_[dstLevel] = fromIndex; - levels_[dstLevel + 1] = toIndex; - dstLevel++; - } - srcLevel++; - weight *= 2; - } - weights_[numItems] = 0; - numLevels_ = dstLevel; - } - private static void blockyTandemMergeSort(final double[] items, final long[] weights, final int[] levels, final int numLevels) { if (numLevels == 1) { return; } @@ -167,5 +132,39 @@ private static void tandemMerge( } } -} + double getQuantile(final double rank) { + final long pos = QuantilesHelper.posOfRank(rank, n_); + return approximatelyAnswerPositonalQuery(pos); + } + + private double approximatelyAnswerPositonalQuery(final long pos) { + assert pos >= 0; + assert pos < n_; + final int index = QuantilesHelper.chunkContainingPos(weights_, pos); + return items_[index]; + } + private void populateFromSketch(final double[] srcItems, final int[] srcLevels, + final int numLevels, final int numItems) { + final int offset = srcLevels[0]; + System.arraycopy(srcItems, offset, items_, 0, numItems); + int srcLevel = 0; + int dstLevel = 0; + long weight = 1; + while (srcLevel < numLevels) { + final int fromIndex = srcLevels[srcLevel] - offset; + final 
int toIndex = srcLevels[srcLevel + 1] - offset; // exclusive + if (fromIndex < toIndex) { // if equal, skip empty level + Arrays.fill(weights_, fromIndex, toIndex, weight); + levels_[dstLevel] = fromIndex; + levels_[dstLevel + 1] = toIndex; + dstLevel++; + } + srcLevel++; + weight *= 2; + } + weights_[numItems] = 0; + numLevels_ = dstLevel; + } + +} diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 1503ddb60..c5aadebf0 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -21,117 +21,69 @@ import static java.lang.Math.max; import static java.lang.Math.min; -import static org.apache.datasketches.Util.isOdd; +import static org.apache.datasketches.kll.KllSketch.Error.SRC_MUST_BE_DOUBLE; +import static org.apache.datasketches.kll.KllSketch.Error.MUST_NOT_CALL; +import static org.apache.datasketches.kll.KllSketch.Error.kllSketchThrow; -import java.util.Arrays; - -import org.apache.datasketches.Family; -import org.apache.datasketches.SketchesArgumentException; -import org.apache.datasketches.Util; import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.memory.WritableMemory; /** - * Please refer to the documentation in the package-info:
- * {@link org.apache.datasketches.kll} + * This class implements an on-heap doubles KllSketch. + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ * + * @author Lee Rhodes, Kevin Lang */ -public class KllDoublesSketch extends BaseKllSketch { +public final class KllDoublesSketch extends KllHeapSketch { + private double[] doubleItems_; + private double minDoubleValue_; + private double maxDoubleValue_; - // Specific to the doubles sketch - private double[] items_; // the continuous array of double items - private double minValue_; - private double maxValue_; + /** + * Private heapify constructor. + * @param mem Memory object that contains data serialized by this sketch. + * @param memVal the KllMemoryValidate object created from mem + */ + private KllDoublesSketch(final Memory mem, final KllMemoryValidate memVal) { + super(memVal.k, memVal.m, SketchType.DOUBLES_SKETCH); + KllHelper.buildHeapKllSketchFromMemory(this, memVal); + } /** - * Heap constructor with the default k = 200, which has a rank error of about 1.65%. + * Heap constructor with the default k = 200. + * This will have a rank error of about 1.65%. */ public KllDoublesSketch() { - this(DEFAULT_K); + this(KllSketch.DEFAULT_K, KllSketch.DEFAULT_M); } /** - * Heap constructor with a given parameter k. k can be any value between 8 and + * Heap constructor with a given parameter k. k can be any value between DEFAULT_M and * 65535, inclusive. The default k = 200 results in a normalized rank error of about * 1.65%. Higher values of K will have smaller error but the sketch will be larger (and slower). * @param k parameter that controls size of the sketch and accuracy of estimates */ public KllDoublesSketch(final int k) { - this(k, DEFAULT_M, true); + this(k, KllSketch.DEFAULT_M); } /** - * Used for testing only. - * @param k configured size of sketch. Range [m, 2^16] - * @param compatible if true, compatible with quantiles sketch. - */ - KllDoublesSketch(final int k, final boolean compatible) { - this(k, DEFAULT_M, compatible); - } - - /** - * Heap constructor. - * @param k configured size of sketch. Range [m, 2^16] - * @param m minimum level size. Default is 8. 
- */ - private KllDoublesSketch(final int k, final int m, final boolean compatible) { - super(k, m, compatible); - items_ = new double[k]; - minValue_ = Double.NaN; - maxValue_ = Double.NaN; - } - - /** - * Private heapify constructor. - * @param mem Memory object that contains data serialized by this sketch. + * Heap constructor with the given parameters k and m. + * + * @param k parameter that controls size of the sketch and accuracy of estimates. + * k can be any value between m and 65535, inclusive. + * The default k = 200 results in a normalized rank error of about 1.65%. + * Higher values of k will have smaller error but the sketch will be larger (and slower). + * @param m parameter that controls the minimum level width in items. It can be 2, 4, 6 or 8. + * The DEFAULT_M, which is 8, is recommended. Other values of m should be considered + * experimental as they have not been as well characterized. */ - private KllDoublesSketch(final Memory mem) { - super(mem.getShort(K_SHORT) & 0xffff, DEFAULT_M, true); - final int flags = mem.getByte(FLAGS_BYTE) & 0xff; - final boolean empty = (flags & 1 << Flags.IS_EMPTY.ordinal()) > 0; - final boolean singleItem = (flags & 1 << Flags.IS_SINGLE_ITEM.ordinal()) > 0; - if (empty) { - numLevels_ = 1; - levels_ = new int[] {k_, k_}; - isLevelZeroSorted_ = false; - minK_ = k_; - items_ = new double[k_]; - minValue_ = Double.NaN; - maxValue_ = Double.NaN; - } else { - if (singleItem) { - n_ = 1; - minK_ = k_; - numLevels_ = 1; - } else { - n_ = mem.getLong(N_LONG); - minK_ = mem.getShort(MIN_K_SHORT) & 0xffff; - numLevels_ = mem.getByte(NUM_LEVELS_BYTE) & 0xff; - } - levels_ = new int[numLevels_ + 1]; - int offset = singleItem ? 
DATA_START_SINGLE_ITEM : DATA_START_DOUBLE; - final int itemCapacity = KllHelper.computeTotalItemCapacity(k_, m_, numLevels_); - if (singleItem) { - levels_[0] = itemCapacity - 1; - } else { - // the last integer in levels_ is not serialized because it can be derived - mem.getIntArray(offset, levels_, 0, numLevels_); - offset += numLevels_ * Integer.BYTES; - } - levels_[numLevels_] = itemCapacity; - if (!singleItem) { - minValue_ = mem.getDouble(offset); - offset += Double.BYTES; - maxValue_ = mem.getDouble(offset); - offset += Double.BYTES; - } - items_ = new double[itemCapacity]; - mem.getDoubleArray(offset, items_, levels_[0], getNumRetained()); - if (singleItem) { - minValue_ = items_[levels_[0]]; - maxValue_ = items_[levels_[0]]; - } - isLevelZeroSorted_ = (flags & 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal()) > 0; - } + KllDoublesSketch(final int k, final int m) { + super(k, m, SketchType.DOUBLES_SKETCH); + doubleItems_ = new double[k]; + minDoubleValue_ = Double.NaN; + maxDoubleValue_ = Double.NaN; } /** @@ -141,45 +93,12 @@ private KllDoublesSketch(final Memory mem) { * See Memory * @return a heap-based sketch based on the given Memory. 
*/ - //To simplify the code, this method does all the validity checking - // then passes the verified Memory to the actual heapify constructor public static KllDoublesSketch heapify(final Memory mem) { - final int preambleInts = mem.getByte(PREAMBLE_INTS_BYTE) & 0xff; - final int serialVersion = mem.getByte(SER_VER_BYTE) & 0xff; - final int family = mem.getByte(FAMILY_BYTE) & 0xff; - final int flags = mem.getByte(FLAGS_BYTE) & 0xff; - final int m = mem.getByte(M_BYTE) & 0xff; - if (m != DEFAULT_M) { - throw new SketchesArgumentException( - "Possible corruption: M must be " + DEFAULT_M + ": " + m); - } - final boolean empty = (flags & 1 << Flags.IS_EMPTY.ordinal()) > 0; - final boolean singleItem = (flags & 1 << Flags.IS_SINGLE_ITEM.ordinal()) > 0; - if (empty || singleItem) { - if (preambleInts != PREAMBLE_INTS_EMPTY_SINGLE) { - throw new SketchesArgumentException("Possible corruption: preambleInts must be " - + PREAMBLE_INTS_EMPTY_SINGLE + " for an empty or single item sketch: " + preambleInts); - } - } else { - if (preambleInts != PREAMBLE_INTS_DOUBLE) { - throw new SketchesArgumentException("Possible corruption: preambleInts must be " - + PREAMBLE_INTS_DOUBLE + " for a sketch with more than one item: " + preambleInts); - } - } - if (serialVersion != SERIAL_VERSION && serialVersion != SERIAL_VERSION_SINGLE) { - throw new SketchesArgumentException( - "Possible corruption: serial version mismatch: expected " + SERIAL_VERSION + " or " - + SERIAL_VERSION_SINGLE + ", got " + serialVersion); - } - if (family != Family.KLL.getID()) { - throw new SketchesArgumentException( - "Possible corruption: family mismatch: expected " + Family.KLL.getID() + ", got " + family); - } - return new KllDoublesSketch(mem); + final KllMemoryValidate memChk = new KllMemoryValidate(mem); + if (!memChk.doublesSketch) { Error.kllSketchThrow(SRC_MUST_BE_DOUBLE); } + return new KllDoublesSketch(mem, memChk); } - // public functions - /** * Returns an approximation to the Cumulative Distribution 
Function (CDF), which is the * cumulative analog of the PMF, of the input stream given a set of splitPoint (values). @@ -196,13 +115,13 @@ public static KllDoublesSketch heapify(final Memory mem) { * the maximum value. * It is not necessary to include either the min or max values in these split points. * - * @return an array of m+1 double values, which are a consecutive approximation to the CDF - * of the input stream given the splitPoints. The value at array position j of the returned - * CDF array is the sum of the returned values in positions 0 through j of the returned PMF - * array. + * @return an array of m+1 double values on the interval [0.0, 1.0), + * which are a consecutive approximation to the CDF of the input stream given the splitPoints. + * The value at array position j of the returned CDF array is the sum of the returned values + * in positions 0 through j of the returned PMF array. */ public double[] getCDF(final double[] splitPoints) { - return getPmfOrCdf(splitPoints, true); + return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true); } /** @@ -211,9 +130,7 @@ public double[] getCDF(final double[] splitPoints) { * * @return the max value of the stream */ - public double getMaxValue() { - return maxValue_; - } + public double getMaxValue() { return getMaxDoubleValue(); } /** * Returns the min value of the stream. @@ -221,24 +138,7 @@ public double getMaxValue() { * * @return the min value of the stream */ - public double getMinValue() { - return minValue_; - } - - /** - * Returns upper bound on the serialized size of a sketch given a parameter k and stream - * length. The resulting size is an overestimate to make sure actual sketches don't exceed it. - * This method can be used if allocation of storage is necessary beforehand, but it is not - * optimal. 
- * @param k parameter that controls size of the sketch and accuracy of estimates - * @param n stream length - * @return upper bound on the serialized size - */ - public static int getMaxSerializedSizeBytes(final int k, final long n) { - final int numLevels = KllHelper.ubOnNumLevels(n); - final int maxNumItems = KllHelper.computeTotalItemCapacity(k, DEFAULT_M, numLevels); - return getSerializedSizeBytes(numLevels, maxNumItems); - } + public double getMinValue() { return getMinDoubleValue(); } /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream @@ -256,13 +156,14 @@ public static int getMaxSerializedSizeBytes(final int k, final long n) { * the maximum value. * It is not necessary to include either the min or max values in these split points. * - * @return an array of m+1 doubles each of which is an approximation - * to the fraction of the input stream values (the mass) that fall into one of those intervals. + * @return an array of m+1 doubles on the interval [0.0, 1.0), + * each of which is an approximation to the fraction of the total input stream values + * (the mass) that fall into one of those intervals. * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right * splitPoint, with the exception that the last interval will include maximum value. 
*/ public double[] getPMF(final double[] splitPoints) { - return getPmfOrCdf(splitPoints, false); + return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false); } /** @@ -284,27 +185,7 @@ public double[] getPMF(final double[] splitPoints) { * @return the approximation to the value at the given fraction */ public double getQuantile(final double fraction) { - if (isEmpty()) { return Double.NaN; } - if (compatible) { - if (fraction == 0.0) { return minValue_; } - if (fraction == 1.0) { return maxValue_; } - } - if (fraction < 0.0 || fraction > 1.0) { - throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0"); - } - final KllDoublesQuantileCalculator quant = getQuantileCalculator(); - return quant.getQuantile(fraction); - } - - /** - * Gets the upper bound of the value interval in which the true quantile of the given rank - * exists with a confidence of at least 99%. - * @param fraction the given normalized rank as a fraction - * @return the upper bound of the value interval in which the true quantile of the given rank - * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. - */ - public double getQuantileUpperBound(final double fraction) { - return getQuantile(min(1.0, fraction + getNormalizedRankError(minK_, false))); + return KllDoublesHelper.getDoublesQuantile(this, fraction); } /** @@ -315,7 +196,7 @@ public double getQuantileUpperBound(final double fraction) { * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. */ public double getQuantileLowerBound(final double fraction) { - return getQuantile(max(0, fraction - getNormalizedRankError(minK_, false))); + return getQuantile(max(0, fraction - KllHelper.getNormalizedRankError(getMinK(), false))); } /** @@ -337,24 +218,7 @@ public double getQuantileLowerBound(final double fraction) { * array. 
*/ public double[] getQuantiles(final double[] fractions) { - if (isEmpty()) { return null; } - KllDoublesQuantileCalculator quant = null; - final double[] quantiles = new double[fractions.length]; - for (int i = 0; i < fractions.length; i++) { - final double fraction = fractions[i]; - if (fraction < 0.0 || fraction > 1.0) { - throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0"); - } - if (fraction == 0.0 && compatible) { quantiles[i] = minValue_; } - else if (fraction == 1.0 && compatible) { quantiles[i] = maxValue_; } - else { - if (quant == null) { - quant = getQuantileCalculator(); - } - quantiles[i] = quant.getQuantile(fraction); - } - } - return quantiles; + return KllDoublesHelper.getDoublesQuantiles(this, fractions); } /** @@ -376,6 +240,17 @@ public double[] getQuantiles(final int numEvenlySpaced) { return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced)); } + /** + * Gets the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. + * @param fraction the given normalized rank as a fraction + * @return the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. + */ + public double getQuantileUpperBound(final double fraction) { + return getQuantile(min(1.0, fraction + KllHelper.getNormalizedRankError(getMinK(), false))); + } + /** * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, * inclusive. 
@@ -389,170 +264,36 @@ public double[] getQuantiles(final int numEvenlySpaced) { * @return an approximate rank of the given value */ public double getRank(final double value) { - if (isEmpty()) { return Double.NaN; } - int level = 0; - int weight = 1; - long total = 0; - while (level < numLevels_) { - final int fromIndex = levels_[level]; - final int toIndex = levels_[level + 1]; // exclusive - for (int i = fromIndex; i < toIndex; i++) { - if (items_[i] < value) { - total += weight; - } else if (level > 0 || isLevelZeroSorted_) { - break; // levels above 0 are sorted, no point comparing further - } - } - level++; - weight *= 2; - } - return (double) total / n_; - } - - /** - * Returns the number of bytes this sketch would require to store. - * @return the number of bytes this sketch would require to store. - */ - public int getSerializedSizeBytes() { - if (isEmpty()) { return N_LONG; } - return getSerializedSizeBytes(numLevels_, getNumRetained()); + return KllDoublesHelper.getDoubleRank(this, value); } /** * @return the iterator for this class */ public KllDoublesSketchIterator iterator() { - return new KllDoublesSketchIterator(items_, levels_, numLevels_); + return new KllDoublesSketchIterator(getDoubleItemsArray(), getLevelsArray(), getNumLevels()); } /** * Merges another sketch into this one. 
* @param other sketch to merge into this one */ - public void merge(final KllDoublesSketch other) { - if (other == null || other.isEmpty()) { return; } - if (m_ != other.m_) { - throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_); - } - final long finalN = n_ + other.n_; - //update this sketch with level0 items from the other sketch - for (int i = other.levels_[0]; i < other.levels_[1]; i++) { - update(other.items_[i]); - } - if (other.numLevels_ >= 2) { //now merge other levels if they exist - mergeHigherLevels(other, finalN); - } - //update min, max values, n - if (Double.isNaN(minValue_) || other.minValue_ < minValue_) { minValue_ = other.minValue_; } - if (Double.isNaN(maxValue_) || other.maxValue_ > maxValue_) { maxValue_ = other.maxValue_; } - n_ = finalN; - - assert KllHelper.sumTheSampleWeights(numLevels_, levels_) == n_; - if (other.isEstimationMode()) { - minK_ = min(minK_, other.minK_); - } + public void merge(final KllSketch other) { + if (!other.isDoublesSketch()) { kllSketchThrow(SRC_MUST_BE_DOUBLE); } + KllDoublesHelper.mergeDoubleImpl(this, other); } @Override - public byte[] toByteArray() { - final byte[] bytes = new byte[getSerializedSizeBytes()]; - final WritableMemory wmem = WritableMemory.writableWrap(bytes); - final boolean singleItem = n_ == 1; - final boolean empty = isEmpty(); - //load the preamble - wmem.putByte(PREAMBLE_INTS_BYTE, (byte) (empty || singleItem ? PREAMBLE_INTS_EMPTY_SINGLE : PREAMBLE_INTS_DOUBLE)); - wmem.putByte(SER_VER_BYTE, singleItem ? SERIAL_VERSION_SINGLE : SERIAL_VERSION); - wmem.putByte(FAMILY_BYTE, (byte) Family.KLL.getID()); - final byte flags = (byte) ( - (empty ? 1 << Flags.IS_EMPTY.ordinal() : 0) - | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0) - | (singleItem ? 
1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)); - wmem.putByte(FLAGS_BYTE, flags); - wmem.putShort(K_SHORT, (short) k_); - wmem.putByte(M_BYTE, (byte) m_); - if (empty) { return bytes; } - //load data - int offset = DATA_START_SINGLE_ITEM; - if (!singleItem) { - wmem.putLong(N_LONG, n_); - wmem.putShort(MIN_K_SHORT, (short) minK_); - wmem.putByte(NUM_LEVELS_BYTE, (byte) numLevels_); - offset = DATA_START_DOUBLE; - // the last integer in levels_ is not serialized because it can be derived - final int len = levels_.length - 1; - wmem.putIntArray(offset, levels_, 0, len); - offset += len * Integer.BYTES; - wmem.putDouble(offset, minValue_); - offset += Double.BYTES; - wmem.putDouble(offset, maxValue_); - offset += Double.BYTES; - } - wmem.putDoubleArray(offset, items_, levels_[0], getNumRetained()); - return bytes; - } - - @Override - public String toString(final boolean withLevels, final boolean withData) { - final String epsPct = String.format("%.3f%%", getNormalizedRankError(false) * 100); - final String epsPMFPct = String.format("%.3f%%", getNormalizedRankError(true) * 100); - final StringBuilder sb = new StringBuilder(); - sb.append(Util.LS).append("### KLL sketch summary:").append(Util.LS); - sb.append(" K : ").append(k_).append(Util.LS); - sb.append(" min K : ").append(minK_).append(Util.LS); - sb.append(" M : ").append(m_).append(Util.LS); - sb.append(" N : ").append(n_).append(Util.LS); - sb.append(" Epsilon : ").append(epsPct).append(Util.LS); - sb.append(" Epsison PMF : ").append(epsPMFPct).append(Util.LS); - sb.append(" Empty : ").append(isEmpty()).append(Util.LS); - sb.append(" Estimation Mode : ").append(isEstimationMode()).append(Util.LS); - sb.append(" Levels : ").append(numLevels_).append(Util.LS); - sb.append(" Level 0 Sorted : ").append(isLevelZeroSorted_).append(Util.LS); - sb.append(" Capacity Items : ").append(items_.length).append(Util.LS); - sb.append(" Retained Items : ").append(getNumRetained()).append(Util.LS); - sb.append(" Storage Bytes : 
").append(getSerializedSizeBytes()).append(Util.LS); - sb.append(" Min Value : ").append(minValue_).append(Util.LS); - sb.append(" Max Value : ").append(maxValue_).append(Util.LS); - sb.append("### End sketch summary").append(Util.LS); - - if (withLevels) { - sb.append("### KLL sketch levels:").append(Util.LS) - .append(" level, offset: nominal capacity, actual size").append(Util.LS); - for (int i = 0; i < numLevels_; i++) { - sb.append(" ").append(i).append(", ").append(levels_[i]).append(": ") - .append(KllHelper.levelCapacity(k_, numLevels_, i, m_)) - .append(", ").append(currentLevelSize(i)).append(Util.LS); - } - sb.append("### End sketch levels").append(Util.LS); - } - - if (withData) { - sb.append("### KLL sketch data {index, item}:").append(Util.LS); - if (levels_[0] > 0) { - sb.append(" Garbage:" + Util.LS); - for (int i = 0; i < levels_[0]; i++) { - if (items_[i] == 0.0f) { continue; } - sb.append(" ").append(i + ", ").append(items_[i]).append(Util.LS); - } - } - int level = 0; - while (level < numLevels_) { - final int fromIndex = levels_[level]; - final int toIndex = levels_[level + 1]; // exclusive - if (fromIndex < toIndex) { - sb.append(" level[").append(level).append("]: offset: " + levels_[level] + " wt: " + (1 << level)); - sb.append(Util.LS); - } - for (int i = fromIndex; i < toIndex; i++) { - sb.append(" ").append(i + ", ").append(items_[i]).append(Util.LS); - } - level++; - } - sb.append(" level[" + level + "]: offset: " + levels_[level] + " (Exclusive)"); - sb.append(Util.LS); - sb.append("### End sketch data").append(Util.LS); - } - - return sb.toString(); + public void reset() { + final int k = getK(); + setN(0); + setMinK(k); + setNumLevels(1); + setLevelsArray(new int[] {k, k}); + setLevelZeroSorted(false); + doubleItems_ = new double[k]; + minDoubleValue_ = Double.NaN; + maxDoubleValue_ = Double.NaN; } /** @@ -561,269 +302,55 @@ public String toString(final boolean withLevels, final boolean withData) { * @param value an item from a stream 
of items. NaNs are ignored. */ public void update(final double value) { - if (Double.isNaN(value)) { return; } - if (isEmpty()) { - minValue_ = value; - maxValue_ = value; - } else { - if (value < minValue_) { minValue_ = value; } - if (value > maxValue_) { maxValue_ = value; } - } - if (levels_[0] == 0) { - compressWhileUpdating(); - } - n_++; - isLevelZeroSorted_ = false; - final int nextPos = levels_[0] - 1; - assert levels_[0] >= 0; - levels_[0] = nextPos; - items_[nextPos] = value; - } - - // Restricted Methods - - private KllDoublesQuantileCalculator getQuantileCalculator() { - sortLevelZero(); // sort in the sketch to reuse if possible - return new KllDoublesQuantileCalculator(items_, levels_, numLevels_, n_); + KllDoublesHelper.updateDouble(this, value); } - private double[] getPmfOrCdf(final double[] splitPoints, final boolean isCdf) { - if (isEmpty()) { return null; } - KllDoublesHelper.validateDoubleValues(splitPoints); - final double[] buckets = new double[splitPoints.length + 1]; - int level = 0; - int weight = 1; - while (level < numLevels_) { - final int fromIndex = levels_[level]; - final int toIndex = levels_[level + 1]; // exclusive - if (level == 0 && !isLevelZeroSorted_) { - incrementBucketsUnsortedLevel(fromIndex, toIndex, weight, splitPoints, buckets); - } else { - incrementBucketsSortedLevel(fromIndex, toIndex, weight, splitPoints, buckets); - } - level++; - weight *= 2; - } - // normalize and, if CDF, convert to cumulative - if (isCdf) { - double subtotal = 0; - for (int i = 0; i < buckets.length; i++) { - subtotal += buckets[i]; - buckets[i] = subtotal / n_; - } - } else { - for (int i = 0; i < buckets.length; i++) { - buckets[i] /= n_; - } - } - return buckets; - } + @Override //Used internally + double[] getDoubleItemsArray() { return doubleItems_; } - private void incrementBucketsUnsortedLevel(final int fromIndex, final int toIndex, - final int weight, final double[] splitPoints, final double[] buckets) { - for (int i = fromIndex; i < 
toIndex; i++) { - int j; - for (j = 0; j < splitPoints.length; j++) { - if (items_[i] < splitPoints[j]) { - break; - } - } - buckets[j] += weight; - } - } - - private void incrementBucketsSortedLevel(final int fromIndex, final int toIndex, - final int weight, final double[] splitPoints, final double[] buckets) { - int i = fromIndex; - int j = 0; - while (i < toIndex && j < splitPoints.length) { - if (items_[i] < splitPoints[j]) { - buckets[j] += weight; // this sample goes into this bucket - i++; // move on to next sample and see whether it also goes into this bucket - } else { - j++; // no more samples for this bucket - } - } - // now either i == toIndex (we are out of samples), or - // j == numSplitPoints (we are out of buckets, but there are more samples remaining) - // we only need to do something in the latter case - if (j == splitPoints.length) { - buckets[j] += weight * (toIndex - i); - } - } - - // The following code is only valid in the special case of exactly reaching capacity while updating. - // It cannot be used while merging, while reducing k, or anything else. - private void compressWhileUpdating() { - final int level = findLevelToCompact(); - - // It is important to do add the new top level right here. Be aware that this operation - // grows the buffer and shifts the data and also the boundaries of the data and grows the - // levels array and increments numLevels_ - if (level == numLevels_ - 1) { - addEmptyTopLevelToCompletelyFullSketch(); - } - - final int rawBeg = levels_[level]; - final int rawLim = levels_[level + 1]; - // +2 is OK because we already added a new top level if necessary - final int popAbove = levels_[level + 2] - rawLim; - final int rawPop = rawLim - rawBeg; - final boolean oddPop = isOdd(rawPop); - final int adjBeg = oddPop ? rawBeg + 1 : rawBeg; - final int adjPop = oddPop ? 
rawPop - 1 : rawPop; - final int halfAdjPop = adjPop / 2; - - // level zero might not be sorted, so we must sort it if we wish to compact it - if (level == 0) { - Arrays.sort(items_, adjBeg, adjBeg + adjPop); - } - if (popAbove == 0) { - KllDoublesHelper.randomlyHalveUpDoubles(items_, adjBeg, adjPop, random); - } else { - KllDoublesHelper.randomlyHalveDownDoubles(items_, adjBeg, adjPop, random); - KllDoublesHelper.mergeSortedDoubleArrays( - items_, adjBeg, halfAdjPop, - items_, rawLim, popAbove, - items_, adjBeg + halfAdjPop); - } - levels_[level + 1] -= halfAdjPop; // adjust boundaries of the level above - if (oddPop) { - levels_[level] = levels_[level + 1] - 1; // the current level now contains one item - items_[levels_[level]] = items_[rawBeg]; // namely this leftover guy - } else { - levels_[level] = levels_[level + 1]; // the current level is now empty - } - - // verify that we freed up halfAdjPop array slots just below the current level - assert levels_[level] == rawBeg + halfAdjPop; - - // finally, we need to shift up the data in the levels below - // so that the freed-up space can be used by level zero - if (level > 0) { - final int amount = rawBeg - levels_[0]; - System.arraycopy(items_, levels_[0], items_, levels_[0] + halfAdjPop, amount); - for (int lvl = 0; lvl < level; lvl++) { - levels_[lvl] += halfAdjPop; - } - } - } - - private void addEmptyTopLevelToCompletelyFullSketch() { - final int curTotalCap = levels_[numLevels_]; - - // make sure that we are following a certain growth scheme - assert levels_[0] == 0; //definition of full - assert items_.length == curTotalCap; - - // note that merging MIGHT over-grow levels_, in which case we might not have to grow it here - if (levels_.length < numLevels_ + 2) { - levels_ = KllHelper.growIntArray(levels_, numLevels_ + 2); - } - - final int deltaCap = KllHelper.levelCapacity(k_, numLevels_ + 1, 0, m_); - final int newTotalCap = curTotalCap + deltaCap; - - final double[] newBuf = new double[newTotalCap]; - - 
// copy (and shift) the current data into the new buffer - System.arraycopy(items_, levels_[0], newBuf, levels_[0] + deltaCap, curTotalCap); - items_ = newBuf; - - // this loop includes the old "extra" index at the top - for (int i = 0; i <= numLevels_; i++) { - levels_[i] += deltaCap; - } - - assert levels_[numLevels_] == newTotalCap; - - numLevels_++; - levels_[numLevels_] = newTotalCap; // initialize the new "extra" index at the top - } + @Override + double getDoubleItemsArrayAt(final int index) { return doubleItems_[index]; } - private void sortLevelZero() { - if (!isLevelZeroSorted_) { - Arrays.sort(items_, levels_[0], levels_[1]); - isLevelZeroSorted_ = true; - } - } + @Override //Dummy + float[] getFloatItemsArray() { kllSketchThrow(MUST_NOT_CALL); return null; } - private void mergeHigherLevels(final KllDoublesSketch other, final long finalN) { - final int tmpSpaceNeeded = getNumRetained() + other.getNumRetainedAboveLevelZero(); - final double[] workbuf = new double[tmpSpaceNeeded]; - final int ub = KllHelper.ubOnNumLevels(finalN); - final int[] worklevels = new int[ub + 2]; // ub+1 does not work - final int[] outlevels = new int[ub + 2]; + @Override //Dummy + float getFloatItemsArrayAt(final int index) { kllSketchThrow(MUST_NOT_CALL); return Float.NaN; } - final int provisionalNumLevels = max(numLevels_, other.numLevels_); + @Override //Used internally + double getMaxDoubleValue() { return maxDoubleValue_; } - populateWorkArrays(other, workbuf, worklevels, provisionalNumLevels); + @Override //Dummy + float getMaxFloatValue() { kllSketchThrow(MUST_NOT_CALL); return (float) maxDoubleValue_; } - // notice that workbuf is being used as both the input and output here - final int[] result = KllDoublesHelper.generalDoublesCompress(k_, m_, provisionalNumLevels, workbuf, - worklevels, workbuf, outlevels, isLevelZeroSorted_, random); - final int finalNumLevels = result[0]; - final int finalCapacity = result[1]; - final int finalPop = result[2]; + @Override //Used 
internally + double getMinDoubleValue() { return minDoubleValue_; } - assert finalNumLevels <= ub; // can sometimes be much bigger + @Override //Dummy + float getMinFloatValue() { kllSketchThrow(MUST_NOT_CALL); return (float) minDoubleValue_; } - // now we need to transfer the results back into the "self" sketch - final double[] newbuf = finalCapacity == items_.length ? items_ : new double[finalCapacity]; - final int freeSpaceAtBottom = finalCapacity - finalPop; - System.arraycopy(workbuf, outlevels[0], newbuf, freeSpaceAtBottom, finalPop); - final int theShift = freeSpaceAtBottom - outlevels[0]; + @Override //Used internally + void setDoubleItemsArray(final double[] doubleItems) { doubleItems_ = doubleItems; } - if (levels_.length < finalNumLevels + 1) { - levels_ = new int[finalNumLevels + 1]; - } + @Override //Used internally + void setDoubleItemsArrayAt(final int index, final double value) { doubleItems_[index] = value; } - for (int lvl = 0; lvl < finalNumLevels + 1; lvl++) { // includes the "extra" index - levels_[lvl] = outlevels[lvl] + theShift; - } + @Override //Dummy + void setFloatItemsArray(final float[] floatItems) { kllSketchThrow(MUST_NOT_CALL); } - items_ = newbuf; - numLevels_ = finalNumLevels; - } + @Override //Dummy + void setFloatItemsArrayAt(final int index, final float value) { kllSketchThrow(MUST_NOT_CALL); } - private void populateWorkArrays(final KllDoublesSketch other, final double[] workbuf, - final int[] worklevels, final int provisionalNumLevels) { - worklevels[0] = 0; - - // Note: the level zero data from "other" was already inserted into "self" - final int selfPopZero = currentLevelSize(0); - System.arraycopy(items_, levels_[0], workbuf, worklevels[0], selfPopZero); - worklevels[1] = worklevels[0] + selfPopZero; - - for (int lvl = 1; lvl < provisionalNumLevels; lvl++) { - final int selfPop = currentLevelSize(lvl); - final int otherPop = other.currentLevelSize(lvl); - worklevels[lvl + 1] = worklevels[lvl] + selfPop + otherPop; - - if 
(selfPop > 0 && otherPop == 0) { - System.arraycopy(items_, levels_[lvl], workbuf, worklevels[lvl], selfPop); - } else if (selfPop == 0 && otherPop > 0) { - System.arraycopy(other.items_, other.levels_[lvl], workbuf, worklevels[lvl], otherPop); - } else if (selfPop > 0 && otherPop > 0) { - KllDoublesHelper.mergeSortedDoubleArrays(items_, levels_[lvl], selfPop, other.items_, - other.levels_[lvl], otherPop, workbuf, worklevels[lvl]); - } - } - } + @Override //Used internally + void setMaxDoubleValue(final double value) { maxDoubleValue_ = value; } - private static int getSerializedSizeBytes(final int numLevels, final int numRetained) { - if (numLevels == 1 && numRetained == 1) { - return DATA_START_SINGLE_ITEM + Double.BYTES; - } - // the last integer in levels_ is not serialized because it can be derived - // + 2 for min and max - return DATA_START_DOUBLE + numLevels * Integer.BYTES + (numRetained + 2) * Double.BYTES; - } + @Override //Dummy + void setMaxFloatValue(final float value) { kllSketchThrow(MUST_NOT_CALL); } - // for testing + @Override //Used internally + void setMinDoubleValue(final double value) { minDoubleValue_ = value; } - double[] getItems() { - return items_; - } + @Override //Dummy + void setMinFloatValue(final float value) { kllSketchThrow(MUST_NOT_CALL); } } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java index 6d4c2044a..3835cd0b4 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketchIterator.java @@ -39,6 +39,26 @@ public class KllDoublesSketchIterator { isInitialized_ = false; } + /** + * Gets a value from the current entry in the sketch. + * Don't call this before calling next() for the first time + * or after getting false from next(). 
+ * @return value from the current entry + */ + public double getValue() { + return items_[i_]; + } + + /** + * Gets a weight for the value from the current entry in the sketch. + * Don't call this before calling next() for the first time + * or after getting false from next(). + * @return weight for the value from the current entry + */ + public long getWeight() { + return weight_; + } + /** * Advancing the iterator and checking existence of the next entry * is combined here for efficiency. This results in an undefined @@ -69,25 +89,4 @@ public boolean next() { return true; } - /** - * Gets a value from the current entry in the sketch. - * Don't call this before calling next() for the first time - * or after getting false from next(). - * @return value from the current entry - */ - public double getValue() { - return items_[i_]; - } - - /** - * Gets a weight for the value from the current entry in the sketch. - * Don't call this before calling next() for the first time - * or after getting false from next(). - * @return weight for the value from the current entry - */ - public long getWeight() { - return weight_; - } - } - diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java index 065e38eba..85742f9bb 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java @@ -19,6 +19,8 @@ package org.apache.datasketches.kll; +import static java.lang.Math.max; +import static java.lang.Math.min; import static org.apache.datasketches.Util.isEven; import static org.apache.datasketches.Util.isOdd; @@ -32,23 +34,193 @@ * @author Kevin Lang * @author Alexander Saydakov */ -class KllFloatsHelper { +final class KllFloatsHelper { - /** - * Checks the sequential validity of the given array of float values. - * They must be unique, monotonically increasing and not NaN. 
- * @param values the given array of values - */ - static void validateFloatValues(final float[] values) { - for (int i = 0; i < values.length; i++) { - if (!Float.isFinite(values[i])) { - throw new SketchesArgumentException("Values must be finite"); + static double getFloatRank(final KllSketch mine, final float value) { + if (mine.isEmpty()) { return Double.NaN; } + int level = 0; + int weight = 1; + long total = 0; + final float[] myFloatItemsArr = mine.getFloatItemsArray(); + final int[] myLevelsArr = mine.getLevelsArray(); + while (level < mine.getNumLevels()) { + final int fromIndex = myLevelsArr[level]; + final int toIndex = myLevelsArr[level + 1]; // exclusive + for (int i = fromIndex; i < toIndex; i++) { + if (myFloatItemsArr[i] < value) { + total += weight; + } else if (level > 0 || mine.isLevelZeroSorted()) { + break; // levels above 0 are sorted, no point comparing further + } } - if (i < values.length - 1 && values[i] >= values[i + 1]) { - throw new SketchesArgumentException( - "Values must be unique and monotonically increasing"); + level++; + weight *= 2; + } + return (double) total / mine.getN(); + } + + static double[] getFloatsPmfOrCdf(final KllSketch mine, final float[] splitPoints, final boolean isCdf) { + if (mine.isEmpty()) { return null; } + validateFloatValues(splitPoints); + final double[] buckets = new double[splitPoints.length + 1]; + final int myNumLevels = mine.getNumLevels(); + final int[] myLevelsArr = mine.getLevelsArray(); + int level = 0; + int weight = 1; + while (level < myNumLevels) { + final int fromIndex = myLevelsArr[level]; + final int toIndex = myLevelsArr[level + 1]; // exclusive + if (level == 0 && !mine.isLevelZeroSorted()) { + KllFloatsHelper.incrementFloatBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets); + } else { + KllFloatsHelper.incrementFloatBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets); + } + level++; + weight *= 2; + } + // normalize and, if CDF, convert to 
cumulative + if (isCdf) { + double subtotal = 0; + for (int i = 0; i < buckets.length; i++) { + subtotal += buckets[i]; + buckets[i] = subtotal / mine.getN(); + } + } else { + for (int i = 0; i < buckets.length; i++) { + buckets[i] /= mine.getN(); + } + } + return buckets; + } + + static float getFloatsQuantile(final KllSketch mine, final double fraction) { + if (mine.isEmpty()) { return Float.NaN; } + if (fraction < 0.0 || fraction > 1.0) { + throw new SketchesArgumentException("Fraction cannot be less than zero nor greater than 1.0"); + } + //These two assumptions make KLL compatible with the previous classic Quantiles Sketch + if (fraction == 0.0) { return mine.getMinFloatValue(); } + if (fraction == 1.0) { return mine.getMaxFloatValue(); } + final KllFloatsQuantileCalculator quant = KllFloatsHelper.getFloatsQuantileCalculator(mine); + return quant.getQuantile(fraction); + } + + static float[] getFloatsQuantiles(final KllSketch mine, final double[] fractions) { + if (mine.isEmpty()) { return null; } + KllFloatsQuantileCalculator quant = null; + final float[] quantiles = new float[fractions.length]; + for (int i = 0; i < fractions.length; i++) { + final double fraction = fractions[i]; + if (fraction < 0.0 || fraction > 1.0) { + throw new SketchesArgumentException("Fraction cannot be less than zero nor greater than 1.0"); + } + if (fraction == 0.0) { quantiles[i] = mine.getMinFloatValue(); } + else if (fraction == 1.0) { quantiles[i] = mine.getMaxFloatValue(); } + else { + if (quant == null) { + quant = KllFloatsHelper.getFloatsQuantileCalculator(mine); + } + quantiles[i] = quant.getQuantile(fraction); } } + return quantiles; + } + + static void mergeFloatImpl(final KllSketch mine, final KllSketch other) { + if (other.isEmpty()) { return; } + final long finalN = mine.getN() + other.getN(); + //update this sketch with level0 items from the other sketch + final float[] otherFloatItemsArr = other.getFloatItemsArray(); + final int otherNumLevels = 
other.getNumLevels(); + final int[] otherLevelsArr = other.getLevelsArray(); + for (int i = otherLevelsArr[0]; i < otherLevelsArr[1]; i++) { + KllFloatsHelper.updateFloat(mine, otherFloatItemsArr[i]); + } + // after the level 0 update, we capture the key mutable variables + final float myMin = mine.getMinFloatValue(); + final float myMax = mine.getMaxFloatValue(); + final int myMinK = mine.getMinK(); + + final int myCurNumLevels = mine.getNumLevels(); + final int[] myCurLevelsArr = mine.getLevelsArray(); + final float[] myCurFloatItemsArr = mine.getFloatItemsArray(); + + final int myNewNumLevels; + final int[] myNewLevelsArr; + final float[] myNewFloatItemsArr; + + if (otherNumLevels > 1) { //now merge higher levels if they exist + final int tmpSpaceNeeded = mine.getNumRetained() + + KllHelper.getNumRetainedAboveLevelZero(otherNumLevels, otherLevelsArr); + final float[] workbuf = new float[tmpSpaceNeeded]; + final int ub = KllHelper.ubOnNumLevels(finalN); + final int[] worklevels = new int[ub + 2]; // ub+1 does not work + final int[] outlevels = new int[ub + 2]; + + final int provisionalNumLevels = max(myCurNumLevels, otherNumLevels); + + populateFloatWorkArrays(mine, other, workbuf, worklevels, provisionalNumLevels); + + // notice that workbuf is being used as both the input and output + final int[] result = generalFloatsCompress(mine.getK(), mine.getM(), provisionalNumLevels, + workbuf, worklevels, workbuf, outlevels, mine.isLevelZeroSorted(), KllSketch.random); + final int targetItemCount = result[1]; //was finalCapacity. Max size given k, m, numLevels + final int curItemCount = result[2]; //was finalPop + + // now we need to finalize the results for the "self" sketch + + //THE NEW NUM LEVELS + myNewNumLevels = result[0]; //was finalNumLevels + assert myNewNumLevels <= ub; // ub may be much bigger + + // THE NEW ITEMS ARRAY (was newbuf) + myNewFloatItemsArr = (targetItemCount == myCurFloatItemsArr.length) + ? 
myCurFloatItemsArr + : new float[targetItemCount]; + final int freeSpaceAtBottom = targetItemCount - curItemCount; + //shift the new items array + System.arraycopy(workbuf, outlevels[0], myNewFloatItemsArr, freeSpaceAtBottom, curItemCount); + final int theShift = freeSpaceAtBottom - outlevels[0]; + + //calculate the new levels array length + final int finalLevelsArrLen; + if (myCurLevelsArr.length < myNewNumLevels + 1) { finalLevelsArrLen = myNewNumLevels + 1; } + else { finalLevelsArrLen = myCurLevelsArr.length; } + + //THE NEW LEVELS ARRAY + myNewLevelsArr = new int[finalLevelsArrLen]; + for (int lvl = 0; lvl < myNewNumLevels + 1; lvl++) { // includes the "extra" index + myNewLevelsArr[lvl] = outlevels[lvl] + theShift; + } + + //MEMORY SPACE MANAGEMENT + if (mine.updatablMemory) { + mine.wmem = KllHelper.memorySpaceMgmt(mine, myNewLevelsArr.length, myNewFloatItemsArr.length); + } + + } else { + myNewNumLevels = myCurNumLevels; + myNewLevelsArr = myCurLevelsArr; + myNewFloatItemsArr = myCurFloatItemsArr; + } + + //Update Preamble: + mine.setN(finalN); + if (other.isEstimationMode()) { //otherwise the merge brings over exact items. + mine.setMinK(min(myMinK, other.getMinK())); + } + + //Update min, max values + final float otherMin = other.getMinFloatValue(); + final float otherMax = other.getMaxFloatValue(); + mine.setMinFloatValue(resolveFloatMinValue(myMin, otherMin)); + mine.setMaxFloatValue(resolveFloatMaxValue(myMax, otherMax)); + + //Update numLevels, levelsArray, items + mine.setNumLevels(myNewNumLevels); + mine.setLevelsArray(myNewLevelsArr); + mine.setFloatItemsArray(myNewFloatItemsArr); + assert KllHelper.sumTheSampleWeights(mine.getNumLevels(), mine.getLevelsArray()) == mine.getN(); } static void mergeSortedFloatArrays( @@ -82,6 +254,62 @@ static void mergeSortedFloatArrays( assert b == limB; } + /** + * Validation Method. 
This must be modified to test validation + * @param buf the items array + * @param start data start + * @param length items length + * @param random instance of Random + */ + static void randomlyHalveDownFloats(final float[] buf, final int start, final int length, final Random random) { + assert isEven(length); + final int half_length = length / 2; + final int offset = random.nextInt(2); // disable for validation + //final int offset = deterministicOffset(); // enable for validation + int j = start + offset; + for (int i = start; i < (start + half_length); i++) { + buf[i] = buf[j]; + j += 2; + } + } + + /** + * Validation Method. This must be modified to test validation + * @param buf the items array + * @param start data start + * @param length items length + * @param random instance of Random + */ + static void randomlyHalveUpFloats(final float[] buf, final int start, final int length, final Random random) { + assert isEven(length); + final int half_length = length / 2; + final int offset = random.nextInt(2); // disable for validation + //final int offset = deterministicOffset(); // enable for validation + int j = (start + length) - 1 - offset; + for (int i = (start + length) - 1; i >= (start + half_length); i--) { + buf[i] = buf[j]; + j -= 2; + } + } + + static void updateFloat(final KllSketch mine, final float value) { + if (Float.isNaN(value)) { return; } + if (mine.isEmpty()) { + mine.setMinFloatValue(value); + mine.setMaxFloatValue(value); + } else { + if (value < mine.getMinFloatValue()) { mine.setMinFloatValue(value); } + if (value > mine.getMaxFloatValue()) { mine.setMaxFloatValue(value); } + } + if (mine.getLevelsArrayAt(0) == 0) { KllHelper.compressWhileUpdatingSketch(mine); } + mine.incN(); + mine.setLevelZeroSorted(false); + final int nextPos = mine.getLevelsArrayAt(0) - 1; + assert mine.getLevelsArrayAt(0) >= 0; + mine.setLevelsArrayAt(0, nextPos); + mine.setFloatItemsArrayAt(nextPos, value); + } + /** * Compression algorithm used to merge higher 
levels. *

Here is what we do for each level:

@@ -112,9 +340,10 @@ static void mergeSortedFloatArrays( * @param outBuf the same array as inBuf * @param outLevels the same size as inLevels * @param isLevelZeroSorted true if this.level 0 is sorted + * @param random instance of java.util.Random * @return int array of: {numLevels, targetItemCount, currentItemCount) */ - static int[] generalFloatsCompress( + private static int[] generalFloatsCompress( final int k, final int m, final int numLevelsIn, @@ -192,52 +421,130 @@ static int[] generalFloatsCompress( numLevels++; targetItemCount += KllHelper.levelCapacity(k, numLevels, 0, m); } - } // end of code for compacting a level // determine whether we have processed all levels yet (including any new levels that we created) - if (curLevel == (numLevels - 1)) { doneYet = true; } - } // end of loop over levels assert (outLevels[numLevels] - outLevels[0]) == currentItemCount; - return new int[] {numLevels, targetItemCount, currentItemCount}; } - //This must be modified for validation - static void randomlyHalveDownFloats(final float[] buf, final int start, final int length, final Random random) { - assert isEven(length); - final int half_length = length / 2; - final int offset = random.nextInt(2); // disable for validation - //final int offset = deterministicOffset(); // enable for validation - int j = start + offset; - for (int i = start; i < (start + half_length); i++) { - buf[i] = buf[j]; - j += 2; + private static KllFloatsQuantileCalculator getFloatsQuantileCalculator(final KllSketch mine) { + final int[] myLevelsArr = mine.getLevelsArray(); + final float[] myFloatItemsArr = mine.getFloatItemsArray(); + if (!mine.isLevelZeroSorted()) { + Arrays.sort(myFloatItemsArr, myLevelsArr[0], myLevelsArr[1]); + mine.setLevelZeroSorted(true); } + return new KllFloatsQuantileCalculator(myFloatItemsArr, myLevelsArr, mine.getNumLevels(), mine.getN()); } - //This must be modified for validation - static void randomlyHalveUpFloats(final float[] buf, final int start, final int 
length, final Random random) { - assert isEven(length); - final int half_length = length / 2; - final int offset = random.nextInt(2); // disable for validation - //final int offset = deterministicOffset(); // enable for validation - int j = (start + length) - 1 - offset; - for (int i = (start + length) - 1; i >= (start + half_length); i--) { - buf[i] = buf[j]; - j -= 2; + private static void incrementFloatBucketsSortedLevel( + final KllSketch mine, final int fromIndex, final int toIndex, + final int weight, final float[] splitPoints, final double[] buckets) { + final float[] myFloatItemsArr = mine.getFloatItemsArray(); + int i = fromIndex; + int j = 0; + while (i < toIndex && j < splitPoints.length) { + if (myFloatItemsArr[i] < splitPoints[j]) { + buckets[j] += weight; // this sample goes into this bucket + i++; // move on to next sample and see whether it also goes into this bucket + } else { + j++; // no more samples for this bucket + } + } + // now either i == toIndex (we are out of samples), or + // j == numSplitPoints (we are out of buckets, but there are more samples remaining) + // we only need to do something in the latter case + if (j == splitPoints.length) { + buckets[j] += weight * (toIndex - i); + } + } + + private static void incrementFloatBucketsUnsortedLevel( + final KllSketch mine, final int fromIndex, final int toIndex, + final int weight, final float[] splitPoints, final double[] buckets) { + final float[] myFloatItemsArr = mine.getFloatItemsArray(); + for (int i = fromIndex; i < toIndex; i++) { + int j; + for (j = 0; j < splitPoints.length; j++) { + if (myFloatItemsArr[i] < splitPoints[j]) { + break; + } + } + buckets[j] += weight; + } + } + + private static void populateFloatWorkArrays(final KllSketch mine, final KllSketch other, final float[] workbuf, + final int[] worklevels, final int provisionalNumLevels) { + worklevels[0] = 0; + final int[] myLevelsArr = mine.getLevelsArray(); + final int[] otherLevelsArr = other.getLevelsArray(); + final 
float[] myFloatItemsArr = mine.getFloatItemsArray(); + final float[] otherFloatItemsArr = other.getFloatItemsArray(); + + // Note: the level zero data from "other" was already inserted into "self" + final int selfPopZero = KllHelper.currentLevelSize(0, mine.getNumLevels(), myLevelsArr); + System.arraycopy( myFloatItemsArr, myLevelsArr[0], workbuf, worklevels[0], selfPopZero); + worklevels[1] = worklevels[0] + selfPopZero; + + for (int lvl = 1; lvl < provisionalNumLevels; lvl++) { + final int selfPop = KllHelper.currentLevelSize(lvl, mine.getNumLevels(), myLevelsArr); + final int otherPop = KllHelper.currentLevelSize(lvl, other.getNumLevels(), otherLevelsArr); + worklevels[lvl + 1] = worklevels[lvl] + selfPop + otherPop; + + if (selfPop > 0 && otherPop == 0) { + System.arraycopy( myFloatItemsArr, myLevelsArr[lvl], workbuf, worklevels[lvl], selfPop); + } else if (selfPop == 0 && otherPop > 0) { + System.arraycopy(otherFloatItemsArr, otherLevelsArr[lvl], workbuf, worklevels[lvl], otherPop); + } else if (selfPop > 0 && otherPop > 0) { + mergeSortedFloatArrays( myFloatItemsArr, myLevelsArr[lvl], selfPop, otherFloatItemsArr, + otherLevelsArr[lvl], otherPop, workbuf, worklevels[lvl]); + } + } + } + + private static float resolveFloatMaxValue(final float myMax, final float otherMax) { + if (Float.isNaN(myMax) && Float.isNaN(otherMax)) { return Float.NaN; } + if (Float.isNaN(myMax)) { return otherMax; } + if (Float.isNaN(otherMax)) { return myMax; } + return max(myMax, otherMax); + } + + private static float resolveFloatMinValue(final float myMin, final float otherMin) { + if (Float.isNaN(myMin) && Float.isNaN(otherMin)) { return Float.NaN; } + if (Float.isNaN(myMin)) { return otherMin; } + if (Float.isNaN(otherMin)) { return myMin; } + return min(myMin, otherMin); + } + + /** + * Validation Method. + * Checks the sequential validity of the given array of float values. + * They must be unique, monotonically increasing and not NaN. 
+ * @param values the given array of values + */ + private static void validateFloatValues(final float[] values) { + for (int i = 0; i < values.length; i++) { + if (!Float.isFinite(values[i])) { + throw new SketchesArgumentException("Values must be finite"); + } + if (i < values.length - 1 && values[i] >= values[i + 1]) { + throw new SketchesArgumentException( + "Values must be unique and monotonically increasing"); + } } } /* + * Validation Method. * The following must be enabled for use with the KllFloatsValidationTest, - * which is only enabled for manual testing. In addition, the two methods + * which is only enabled for manual testing. In addition, two methods * above need to be modified as commented. */ - // static int nextOffset = 0; // // private static int deterministicOffset() { diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsQuantileCalculator.java b/src/main/java/org/apache/datasketches/kll/KllFloatsQuantileCalculator.java index 97f628e29..87539fc0c 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsQuantileCalculator.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsQuantileCalculator.java @@ -58,41 +58,6 @@ final class KllFloatsQuantileCalculator { numLevels_ = 0; //not used by test } - float getQuantile(final double phi) { //phi is normalized rank [0,1]. 
- final long pos = QuantilesHelper.posOfPhi(phi, n_); - return approximatelyAnswerPositonalQuery(pos); - } - - private float approximatelyAnswerPositonalQuery(final long pos) { - assert pos >= 0; - assert pos < n_; - final int index = QuantilesHelper.chunkContainingPos(weights_, pos); - return items_[index]; - } - - private void populateFromSketch(final float[] srcItems, final int[] srcLevels, - final int numLevels, final int numItems) { - final int offset = srcLevels[0]; - System.arraycopy(srcItems, offset, items_, 0, numItems); - int srcLevel = 0; - int dstLevel = 0; - long weight = 1; - while (srcLevel < numLevels) { - final int fromIndex = srcLevels[srcLevel] - offset; - final int toIndex = srcLevels[srcLevel + 1] - offset; // exclusive - if (fromIndex < toIndex) { // if equal, skip empty level - Arrays.fill(weights_, fromIndex, toIndex, weight); - levels_[dstLevel] = fromIndex; - levels_[dstLevel + 1] = toIndex; - dstLevel++; - } - srcLevel++; - weight *= 2; - } - weights_[numItems] = 0; - numLevels_ = dstLevel; - } - private static void blockyTandemMergeSort(final float[] items, final long[] weights, final int[] levels, final int numLevels) { if (numLevels == 1) { return; } @@ -167,4 +132,39 @@ private static void tandemMerge( } } + float getQuantile(final double rank) { + final long pos = QuantilesHelper.posOfRank(rank, n_); + return approximatelyAnswerPositonalQuery(pos); + } + + private float approximatelyAnswerPositonalQuery(final long pos) { + assert pos >= 0; + assert pos < n_; + final int index = QuantilesHelper.chunkContainingPos(weights_, pos); + return items_[index]; + } + + private void populateFromSketch(final float[] srcItems, final int[] srcLevels, + final int numLevels, final int numItems) { + final int offset = srcLevels[0]; + System.arraycopy(srcItems, offset, items_, 0, numItems); + int srcLevel = 0; + int dstLevel = 0; + long weight = 1; + while (srcLevel < numLevels) { + final int fromIndex = srcLevels[srcLevel] - offset; + final int 
toIndex = srcLevels[srcLevel + 1] - offset; // exclusive + if (fromIndex < toIndex) { // if equal, skip empty level + Arrays.fill(weights_, fromIndex, toIndex, weight); + levels_[dstLevel] = fromIndex; + levels_[dstLevel + 1] = toIndex; + dstLevel++; + } + srcLevel++; + weight *= 2; + } + weights_[numItems] = 0; + numLevels_ = dstLevel; + } + } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 7d2639fc8..2ef1c3de4 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -21,32 +21,41 @@ import static java.lang.Math.max; import static java.lang.Math.min; -import static org.apache.datasketches.Util.isOdd; +import static org.apache.datasketches.kll.KllSketch.Error.SRC_MUST_BE_FLOAT; +import static org.apache.datasketches.kll.KllSketch.Error.MUST_NOT_CALL; +import static org.apache.datasketches.kll.KllSketch.Error.kllSketchThrow; -import java.util.Arrays; - -import org.apache.datasketches.Family; -import org.apache.datasketches.SketchesArgumentException; -import org.apache.datasketches.Util; import org.apache.datasketches.memory.Memory; -import org.apache.datasketches.memory.WritableMemory; /** - * Please refer to the documentation in the package-info:
- * {@link org.apache.datasketches.kll} + * This class implements an on-heap floats KllSketch. + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ * + * @author Lee Rhodes, Kevin Lang */ -public class KllFloatsSketch extends BaseKllSketch { +public final class KllFloatsSketch extends KllHeapSketch { + private float[] floatItems_; + private float minFloatValue_; + private float maxFloatValue_; - // Specific to the floats sketch - private float[] items_; // the continuous array of float items - private float minValue_; - private float maxValue_; + /** + * Private heapify constructor. + * @param mem Memory object that contains data serialized by this sketch. + * @param memVal the MemoryCheck object + */ + private KllFloatsSketch(final Memory mem, final KllMemoryValidate memVal) { + super(memVal.k, memVal.m, SketchType.FLOATS_SKETCH); + KllHelper.buildHeapKllSketchFromMemory(this, memVal); + } /** - * Heap constructor with the default k = 200, which has a rank error of about 1.65%. + * Heap constructor with the default k = 200. + * This will have a rank error of about 1.65%. */ public KllFloatsSketch() { - this(DEFAULT_K); + this(KllSketch.DEFAULT_K, KllSketch.DEFAULT_M); } /** @@ -56,82 +65,25 @@ public KllFloatsSketch() { * @param k parameter that controls size of the sketch and accuracy of estimates */ public KllFloatsSketch(final int k) { - this(k, DEFAULT_M, true); + this(k, KllSketch.DEFAULT_M); } /** - * Used for testing only. - * @param k configured size of sketch. Range [m, 2^16] - * @param compatible if true, compatible with quantiles sketch. - */ - KllFloatsSketch(final int k, final boolean compatible) { - this(k, DEFAULT_M, compatible); - } - - /** - * Heap constructor. - * @param k configured size of sketch. Range [m, 2^16] - * @param m minimum level size. Default is 8. - */ - private KllFloatsSketch(final int k, final int m, final boolean compatible) { - super(k, m, compatible); - items_ = new float[k]; - minValue_ = Float.NaN; - maxValue_ = Float.NaN; - } - - /** - * Private heapify constructor. - * @param mem Memory object that contains data serialized by this sketch. 
+ * Heap constructor with a given parameters k and m. + * + * @param k parameter that controls size of the sketch and accuracy of estimates. + * k can be any value between m and 65535, inclusive. + * The default k = 200 results in a normalized rank error of about 1.65%. + * Higher values of k will have smaller error but the sketch will be larger (and slower). + * @param m parameter that controls the minimum level width in items. It can be 2, 4, 6 or 8. + * The DEFAULT_M, which is 8 is recommended. Other values of m should be considered + * experimental as they have not been as well characterized. */ - private KllFloatsSketch(final Memory mem) { - super(mem.getShort(K_SHORT) & 0xffff, DEFAULT_M, true); - final int flags = mem.getByte(FLAGS_BYTE) & 0xff; - final boolean empty = (flags & 1 << Flags.IS_EMPTY.ordinal()) > 0; - final boolean singleItem = (flags & 1 << Flags.IS_SINGLE_ITEM.ordinal()) > 0; - if (empty) { - numLevels_ = 1; - levels_ = new int[] {k_, k_}; - isLevelZeroSorted_ = false; - minK_ = k_; - items_ = new float[k_]; - minValue_ = Float.NaN; - maxValue_ = Float.NaN; - } else { - if (singleItem) { - n_ = 1; - minK_ = k_; - numLevels_ = 1; - } else { - n_ = mem.getLong(N_LONG); - minK_ = mem.getShort(MIN_K_SHORT) & 0xffff; - numLevels_ = mem.getByte(NUM_LEVELS_BYTE) & 0xff; - } - levels_ = new int[numLevels_ + 1]; - int offset = singleItem ? 
DATA_START_SINGLE_ITEM : DATA_START_FLOAT; - final int itemCapacity = KllHelper.computeTotalItemCapacity(k_, m_, numLevels_); - if (singleItem) { - levels_[0] = itemCapacity - 1; - } else { - // the last integer in levels_ is not serialized because it can be derived - mem.getIntArray(offset, levels_, 0, numLevels_); - offset += numLevels_ * Integer.BYTES; - } - levels_[numLevels_] = itemCapacity; - if (!singleItem) { - minValue_ = mem.getFloat(offset); - offset += Float.BYTES; - maxValue_ = mem.getFloat(offset); - offset += Float.BYTES; - } - items_ = new float[itemCapacity]; - mem.getFloatArray(offset, items_, levels_[0], getNumRetained()); - if (singleItem) { - minValue_ = items_[levels_[0]]; - maxValue_ = items_[levels_[0]]; - } - isLevelZeroSorted_ = (flags & 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal()) > 0; - } + KllFloatsSketch(final int k, final int m) { + super(k, m, SketchType.FLOATS_SKETCH); + floatItems_ = new float[k]; + minFloatValue_ = Float.NaN; + maxFloatValue_ = Float.NaN; } /** @@ -141,45 +93,12 @@ private KllFloatsSketch(final Memory mem) { * See Memory * @return a heap-based sketch based on the given Memory. 
*/ - //To simplify the code, this method does all the validity checking - // then passes the verified Memory to the actual heapify constructor public static KllFloatsSketch heapify(final Memory mem) { - final int preambleInts = mem.getByte(PREAMBLE_INTS_BYTE) & 0xff; - final int serialVersion = mem.getByte(SER_VER_BYTE) & 0xff; - final int family = mem.getByte(FAMILY_BYTE) & 0xff; - final int flags = mem.getByte(FLAGS_BYTE) & 0xff; - final int m = mem.getByte(M_BYTE) & 0xff; - if (m != DEFAULT_M) { - throw new SketchesArgumentException( - "Possible corruption: M must be " + DEFAULT_M + ": " + m); - } - final boolean empty = (flags & 1 << Flags.IS_EMPTY.ordinal()) > 0; - final boolean singleItem = (flags & 1 << Flags.IS_SINGLE_ITEM.ordinal()) > 0; - if (empty || singleItem) { - if (preambleInts != PREAMBLE_INTS_EMPTY_SINGLE) { - throw new SketchesArgumentException("Possible corruption: preambleInts must be " - + PREAMBLE_INTS_EMPTY_SINGLE + " for an empty or single item sketch: " + preambleInts); - } - } else { - if (preambleInts != PREAMBLE_INTS_FLOAT) { - throw new SketchesArgumentException("Possible corruption: preambleInts must be " - + PREAMBLE_INTS_FLOAT + " for a sketch with more than one item: " + preambleInts); - } - } - if (serialVersion != SERIAL_VERSION && serialVersion != SERIAL_VERSION_SINGLE) { - throw new SketchesArgumentException( - "Possible corruption: serial version mismatch: expected " + SERIAL_VERSION + " or " - + SERIAL_VERSION_SINGLE + ", got " + serialVersion); - } - if (family != Family.KLL.getID()) { - throw new SketchesArgumentException( - "Possible corruption: family mismatch: expected " + Family.KLL.getID() + ", got " + family); - } - return new KllFloatsSketch(mem); + final KllMemoryValidate memVal = new KllMemoryValidate(mem); + if (memVal.doublesSketch) { Error.kllSketchThrow(SRC_MUST_BE_FLOAT); } + return new KllFloatsSketch(mem, memVal); } - // public functions - /** * Returns an approximation to the Cumulative Distribution 
Function (CDF), which is the * cumulative analog of the PMF, of the input stream given a set of splitPoint (values). @@ -196,13 +115,13 @@ public static KllFloatsSketch heapify(final Memory mem) { * the maximum value. * It is not necessary to include either the min or max values in these split points. * - * @return an array of m+1 double values, which are a consecutive approximation to the CDF - * of the input stream given the splitPoints. The value at array position j of the returned - * CDF array is the sum of the returned values in positions 0 through j of the returned PMF - * array. + * @return an array of m+1 double values on the interval [0.0, 1.0), + * which are a consecutive approximation to the CDF of the input stream given the splitPoints. + * The value at array position j of the returned CDF array is the sum of the returned values + * in positions 0 through j of the returned PMF array. */ public double[] getCDF(final float[] splitPoints) { - return getPmfOrCdf(splitPoints, true); + return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true); } /** @@ -211,9 +130,7 @@ public double[] getCDF(final float[] splitPoints) { * * @return the max value of the stream */ - public float getMaxValue() { - return maxValue_; - } + public float getMaxValue() { return getMaxFloatValue(); } /** * Returns the min value of the stream. @@ -221,24 +138,7 @@ public float getMaxValue() { * * @return the min value of the stream */ - public float getMinValue() { - return minValue_; - } - - /** - * Returns upper bound on the serialized size of a sketch given a parameter k and stream - * length. The resulting size is an overestimate to make sure actual sketches don't exceed it. - * This method can be used if allocation of storage is necessary beforehand, but it is not - * optimal. 
- * @param k parameter that controls size of the sketch and accuracy of estimates - * @param n stream length - * @return upper bound on the serialized size - */ - public static int getMaxSerializedSizeBytes(final int k, final long n) { - final int numLevels = KllHelper.ubOnNumLevels(n); - final int maxNumItems = KllHelper.computeTotalItemCapacity(k, DEFAULT_M, numLevels); - return getSerializedSizeBytes(numLevels, maxNumItems); - } + public float getMinValue() { return getMinFloatValue(); } /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream @@ -256,13 +156,14 @@ public static int getMaxSerializedSizeBytes(final int k, final long n) { * the maximum value. * It is not necessary to include either the min or max values in these split points. * - * @return an array of m+1 doubles each of which is an approximation - * to the fraction of the input stream values (the mass) that fall into one of those intervals. + * @return an array of m+1 doubles on the interval [0.0, 1.0), + * each of which is an approximation to the fraction of the total input stream values + * (the mass) that fall into one of those intervals. * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right * splitPoint, with the exception that the last interval will include maximum value. 
*/ public double[] getPMF(final float[] splitPoints) { - return getPmfOrCdf(splitPoints, false); + return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false); } /** @@ -284,27 +185,7 @@ public double[] getPMF(final float[] splitPoints) { * @return the approximation to the value at the given fraction */ public float getQuantile(final double fraction) { - if (isEmpty()) { return Float.NaN; } - if (compatible) { - if (fraction == 0.0) { return minValue_; } - if (fraction == 1.0) { return maxValue_; } - } - if (fraction < 0.0 || fraction > 1.0) { - throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0"); - } - final KllFloatsQuantileCalculator quant = getQuantileCalculator(); - return quant.getQuantile(fraction); - } - - /** - * Gets the upper bound of the value interval in which the true quantile of the given rank - * exists with a confidence of at least 99%. - * @param fraction the given normalized rank as a fraction - * @return the upper bound of the value interval in which the true quantile of the given rank - * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. - */ - public float getQuantileUpperBound(final double fraction) { - return getQuantile(min(1.0, fraction + getNormalizedRankError(minK_, false))); + return KllFloatsHelper.getFloatsQuantile(this, fraction); } /** @@ -315,7 +196,7 @@ public float getQuantileUpperBound(final double fraction) { * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. */ public float getQuantileLowerBound(final double fraction) { - return getQuantile(max(0, fraction - getNormalizedRankError(minK_, false))); + return getQuantile(max(0, fraction - KllHelper.getNormalizedRankError(getMinK(), false))); } /** @@ -337,24 +218,7 @@ public float getQuantileLowerBound(final double fraction) { * array. 
*/ public float[] getQuantiles(final double[] fractions) { - if (isEmpty()) { return null; } - KllFloatsQuantileCalculator quant = null; - final float[] quantiles = new float[fractions.length]; - for (int i = 0; i < fractions.length; i++) { - final double fraction = fractions[i]; - if (fraction < 0.0 || fraction > 1.0) { - throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0"); - } - if (fraction == 0.0 && compatible) { quantiles[i] = minValue_; } - else if (fraction == 1.0 && compatible) { quantiles[i] = maxValue_; } - else { - if (quant == null) { - quant = getQuantileCalculator(); - } - quantiles[i] = quant.getQuantile(fraction); - } - } - return quantiles; + return KllFloatsHelper.getFloatsQuantiles(this, fractions); } /** @@ -376,6 +240,17 @@ public float[] getQuantiles(final int numEvenlySpaced) { return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced)); } + /** + * Gets the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. + * @param fraction the given normalized rank as a fraction + * @return the upper bound of the value interval in which the true quantile of the given rank + * exists with a confidence of at least 99%. Returns NaN if the sketch is empty. + */ + public float getQuantileUpperBound(final double fraction) { + return getQuantile(min(1.0, fraction + KllHelper.getNormalizedRankError(getMinK(), false))); + } + /** * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, * inclusive. 
@@ -389,40 +264,14 @@ public float[] getQuantiles(final int numEvenlySpaced) { * @return an approximate rank of the given value */ public double getRank(final float value) { - if (isEmpty()) { return Double.NaN; } - int level = 0; - int weight = 1; - long total = 0; - while (level < numLevels_) { - final int fromIndex = levels_[level]; - final int toIndex = levels_[level + 1]; // exclusive - for (int i = fromIndex; i < toIndex; i++) { - if (items_[i] < value) { - total += weight; - } else if (level > 0 || isLevelZeroSorted_) { - break; // levels above 0 are sorted, no point comparing further - } - } - level++; - weight *= 2; - } - return (double) total / n_; - } - - /** - * Returns the number of bytes this sketch would require to store. - * @return the number of bytes this sketch would require to store. - */ - public int getSerializedSizeBytes() { - if (isEmpty()) { return N_LONG; } - return getSerializedSizeBytes(numLevels_, getNumRetained()); + return KllFloatsHelper.getFloatRank(this, value); } /** * @return the iterator for this class */ public KllFloatsSketchIterator iterator() { - return new KllFloatsSketchIterator(items_, levels_, numLevels_); + return new KllFloatsSketchIterator(getFloatItemsArray(), getLevelsArray(), getNumLevels()); } /** @@ -430,129 +279,21 @@ public KllFloatsSketchIterator iterator() { * @param other sketch to merge into this one */ public void merge(final KllFloatsSketch other) { - if (other == null || other.isEmpty()) { return; } - if (m_ != other.m_) { - throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_); - } - final long finalN = n_ + other.n_; - //update this sketch with level0 items from the other sketch - for (int i = other.levels_[0]; i < other.levels_[1]; i++) { - update(other.items_[i]); - } - if (other.numLevels_ >= 2) { //now merge other levels if they exist - mergeHigherLevels(other, finalN); - } - //update min, max values, n - if (Float.isNaN(minValue_) || other.minValue_ < minValue_) { 
minValue_ = other.minValue_; } - if (Float.isNaN(maxValue_) || other.maxValue_ > maxValue_) { maxValue_ = other.maxValue_; } - n_ = finalN; - - assert KllHelper.sumTheSampleWeights(numLevels_, levels_) == n_; - if (other.isEstimationMode()) { - minK_ = min(minK_, other.minK_); - } - } - - @Override - public byte[] toByteArray() { - final byte[] bytes = new byte[getSerializedSizeBytes()]; - final WritableMemory wmem = WritableMemory.writableWrap(bytes); - final boolean singleItem = n_ == 1; - final boolean empty = isEmpty(); - //load the preamble - wmem.putByte(PREAMBLE_INTS_BYTE, (byte) (empty || singleItem ? PREAMBLE_INTS_EMPTY_SINGLE : PREAMBLE_INTS_FLOAT)); - wmem.putByte(SER_VER_BYTE, singleItem ? SERIAL_VERSION_SINGLE : SERIAL_VERSION); - wmem.putByte(FAMILY_BYTE, (byte) Family.KLL.getID()); - final byte flags = (byte) ( - (empty ? 1 << Flags.IS_EMPTY.ordinal() : 0) - | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0) - | (singleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)); - wmem.putByte(FLAGS_BYTE, flags); - wmem.putShort(K_SHORT, (short) k_); - wmem.putByte(M_BYTE, (byte) m_); - if (empty) { return bytes; } - //load data - int offset = DATA_START_SINGLE_ITEM; - if (!singleItem) { - wmem.putLong(N_LONG, n_); - wmem.putShort(MIN_K_SHORT, (short) minK_); - wmem.putByte(NUM_LEVELS_BYTE, (byte) numLevels_); - offset = DATA_START_FLOAT; - // the last integer in levels_ is not serialized because it can be derived - final int len = levels_.length - 1; - wmem.putIntArray(offset, levels_, 0, len); - offset += len * Integer.BYTES; - wmem.putFloat(offset, minValue_); - offset += Float.BYTES; - wmem.putFloat(offset, maxValue_); - offset += Float.BYTES; - } - wmem.putFloatArray(offset, items_, levels_[0], getNumRetained()); - return bytes; + if (!other.isFloatsSketch()) { kllSketchThrow(SRC_MUST_BE_FLOAT); } + KllFloatsHelper.mergeFloatImpl(this, other); } @Override - public String toString(final boolean withLevels, final boolean withData) { - 
final String epsPct = String.format("%.3f%%", getNormalizedRankError(false) * 100); - final String epsPMFPct = String.format("%.3f%%", getNormalizedRankError(true) * 100); - final StringBuilder sb = new StringBuilder(); - sb.append(Util.LS).append("### KLL sketch summary:").append(Util.LS); - sb.append(" K : ").append(k_).append(Util.LS); - sb.append(" min K : ").append(minK_).append(Util.LS); - sb.append(" M : ").append(m_).append(Util.LS); - sb.append(" N : ").append(n_).append(Util.LS); - sb.append(" Epsilon : ").append(epsPct).append(Util.LS); - sb.append(" Epsison PMF : ").append(epsPMFPct).append(Util.LS); - sb.append(" Empty : ").append(isEmpty()).append(Util.LS); - sb.append(" Estimation Mode : ").append(isEstimationMode()).append(Util.LS); - sb.append(" Levels : ").append(numLevels_).append(Util.LS); - sb.append(" Level 0 Sorted : ").append(isLevelZeroSorted_).append(Util.LS); - sb.append(" Capacity Items : ").append(items_.length).append(Util.LS); - sb.append(" Retained Items : ").append(getNumRetained()).append(Util.LS); - sb.append(" Storage Bytes : ").append(getSerializedSizeBytes()).append(Util.LS); - sb.append(" Min Value : ").append(minValue_).append(Util.LS); - sb.append(" Max Value : ").append(maxValue_).append(Util.LS); - sb.append("### End sketch summary").append(Util.LS); - - if (withLevels) { - sb.append("### KLL sketch levels:").append(Util.LS) - .append(" level, offset: nominal capacity, actual size").append(Util.LS); - for (int i = 0; i < numLevels_; i++) { - sb.append(" ").append(i).append(", ").append(levels_[i]).append(": ") - .append(KllHelper.levelCapacity(k_, numLevels_, i, m_)) - .append(", ").append(currentLevelSize(i)).append(Util.LS); - } - sb.append("### End sketch levels").append(Util.LS); - } - - if (withData) { - sb.append("### KLL sketch data {index, item}:").append(Util.LS); - if (levels_[0] > 0) { - sb.append(" Garbage:" + Util.LS); - for (int i = 0; i < levels_[0]; i++) { - if (items_[i] == 0.0f) { continue; } - 
sb.append(" ").append(i + ", ").append(items_[i]).append(Util.LS); - } - } - int level = 0; - while (level < numLevels_) { - final int fromIndex = levels_[level]; - final int toIndex = levels_[level + 1]; // exclusive - if (fromIndex < toIndex) { - sb.append(" level[").append(level).append("]: offset: " + levels_[level] + " wt: " + (1 << level)); - sb.append(Util.LS); - } - for (int i = fromIndex; i < toIndex; i++) { - sb.append(" ").append(i + ", ").append(items_[i]).append(Util.LS); - } - level++; - } - sb.append(" level[" + level + "]: offset: " + levels_[level] + " (Exclusive)"); - sb.append(Util.LS); - sb.append("### End sketch data").append(Util.LS); - } - - return sb.toString(); + public void reset() { + final int k = getK(); + setN(0); + setMinK(k); + setNumLevels(1); + setLevelsArray(new int[] {k, k}); + setLevelZeroSorted(false); + floatItems_ = new float[k]; + minFloatValue_ = Float.NaN; + maxFloatValue_ = Float.NaN; } /** @@ -561,269 +302,55 @@ public String toString(final boolean withLevels, final boolean withData) { * @param value an item from a stream of items. NaNs are ignored. 
*/ public void update(final float value) { - if (Float.isNaN(value)) { return; } - if (isEmpty()) { - minValue_ = value; - maxValue_ = value; - } else { - if (value < minValue_) { minValue_ = value; } - if (value > maxValue_) { maxValue_ = value; } - } - if (levels_[0] == 0) { - compressWhileUpdating(); - } - n_++; - isLevelZeroSorted_ = false; - final int nextPos = levels_[0] - 1; - assert levels_[0] >= 0; - levels_[0] = nextPos; - items_[nextPos] = value; - } - - // Restricted Methods - - private KllFloatsQuantileCalculator getQuantileCalculator() { - sortLevelZero(); // sort in the sketch to reuse if possible - return new KllFloatsQuantileCalculator(items_, levels_, numLevels_, n_); - } - - private double[] getPmfOrCdf(final float[] splitPoints, final boolean isCdf) { - if (isEmpty()) { return null; } - KllFloatsHelper.validateFloatValues(splitPoints); - final double[] buckets = new double[splitPoints.length + 1]; - int level = 0; - int weight = 1; - while (level < numLevels_) { - final int fromIndex = levels_[level]; - final int toIndex = levels_[level + 1]; // exclusive - if (level == 0 && !isLevelZeroSorted_) { - incrementBucketsUnsortedLevel(fromIndex, toIndex, weight, splitPoints, buckets); - } else { - incrementBucketsSortedLevel(fromIndex, toIndex, weight, splitPoints, buckets); - } - level++; - weight *= 2; - } - // normalize and, if CDF, convert to cumulative - if (isCdf) { - double subtotal = 0; - for (int i = 0; i < buckets.length; i++) { - subtotal += buckets[i]; - buckets[i] = subtotal / n_; - } - } else { - for (int i = 0; i < buckets.length; i++) { - buckets[i] /= n_; - } - } - return buckets; - } - - private void incrementBucketsUnsortedLevel(final int fromIndex, final int toIndex, - final int weight, final float[] splitPoints, final double[] buckets) { - for (int i = fromIndex; i < toIndex; i++) { - int j; - for (j = 0; j < splitPoints.length; j++) { - if (items_[i] < splitPoints[j]) { - break; - } - } - buckets[j] += weight; - } - } - - private 
void incrementBucketsSortedLevel(final int fromIndex, final int toIndex, - final int weight, final float[] splitPoints, final double[] buckets) { - int i = fromIndex; - int j = 0; - while (i < toIndex && j < splitPoints.length) { - if (items_[i] < splitPoints[j]) { - buckets[j] += weight; // this sample goes into this bucket - i++; // move on to next sample and see whether it also goes into this bucket - } else { - j++; // no more samples for this bucket - } - } - // now either i == toIndex (we are out of samples), or - // j == numSplitPoints (we are out of buckets, but there are more samples remaining) - // we only need to do something in the latter case - if (j == splitPoints.length) { - buckets[j] += weight * (toIndex - i); - } + KllFloatsHelper.updateFloat(this, value); } - // The following code is only valid in the special case of exactly reaching capacity while updating. - // It cannot be used while merging, while reducing k, or anything else. - private void compressWhileUpdating() { - final int level = findLevelToCompact(); - - // It is important to do add the new top level right here. Be aware that this operation - // grows the buffer and shifts the data and also the boundaries of the data and grows the - // levels array and increments numLevels_ - if (level == numLevels_ - 1) { - addEmptyTopLevelToCompletelyFullSketch(); - } - - final int rawBeg = levels_[level]; - final int rawLim = levels_[level + 1]; - // +2 is OK because we already added a new top level if necessary - final int popAbove = levels_[level + 2] - rawLim; - final int rawPop = rawLim - rawBeg; - final boolean oddPop = isOdd(rawPop); - final int adjBeg = oddPop ? rawBeg + 1 : rawBeg; - final int adjPop = oddPop ? 
rawPop - 1 : rawPop; - final int halfAdjPop = adjPop / 2; - - // level zero might not be sorted, so we must sort it if we wish to compact it - if (level == 0) { - Arrays.sort(items_, adjBeg, adjBeg + adjPop); - } - if (popAbove == 0) { - KllFloatsHelper.randomlyHalveUpFloats(items_, adjBeg, adjPop, random); - } else { - KllFloatsHelper.randomlyHalveDownFloats(items_, adjBeg, adjPop, random); - KllFloatsHelper.mergeSortedFloatArrays( - items_, adjBeg, halfAdjPop, - items_, rawLim, popAbove, - items_, adjBeg + halfAdjPop); - } - levels_[level + 1] -= halfAdjPop; // adjust boundaries of the level above - if (oddPop) { - levels_[level] = levels_[level + 1] - 1; // the current level now contains one item - items_[levels_[level]] = items_[rawBeg]; // namely this leftover guy - } else { - levels_[level] = levels_[level + 1]; // the current level is now empty - } - - // verify that we freed up halfAdjPop array slots just below the current level - assert levels_[level] == rawBeg + halfAdjPop; - - // finally, we need to shift up the data in the levels below - // so that the freed-up space can be used by level zero - if (level > 0) { - final int amount = rawBeg - levels_[0]; - System.arraycopy(items_, levels_[0], items_, levels_[0] + halfAdjPop, amount); - for (int lvl = 0; lvl < level; lvl++) { - levels_[lvl] += halfAdjPop; - } - } - } - - private void addEmptyTopLevelToCompletelyFullSketch() { - final int curTotalCap = levels_[numLevels_]; - - // make sure that we are following a certain growth scheme - assert levels_[0] == 0; //definition of full - assert items_.length == curTotalCap; - - // note that merging MIGHT over-grow levels_, in which case we might not have to grow it here - if (levels_.length < numLevels_ + 2) { - levels_ = KllHelper.growIntArray(levels_, numLevels_ + 2); - } - - final int deltaCap = KllHelper.levelCapacity(k_, numLevels_ + 1, 0, m_); - final int newTotalCap = curTotalCap + deltaCap; + @Override //Dummy + double[] getDoubleItemsArray() { 
kllSketchThrow(MUST_NOT_CALL); return null; } - final float[] newBuf = new float[newTotalCap]; + @Override //Dummy + double getDoubleItemsArrayAt(final int index) { kllSketchThrow(MUST_NOT_CALL); return Double.NaN; } - // copy (and shift) the current data into the new buffer - System.arraycopy(items_, levels_[0], newBuf, levels_[0] + deltaCap, curTotalCap); - items_ = newBuf; + @Override //Used internally + float[] getFloatItemsArray() { return floatItems_; } - // this loop includes the old "extra" index at the top - for (int i = 0; i <= numLevels_; i++) { - levels_[i] += deltaCap; - } + @Override //Used internally + float getFloatItemsArrayAt(final int index) { return floatItems_[index]; } - assert levels_[numLevels_] == newTotalCap; + @Override //Dummy + double getMaxDoubleValue() { kllSketchThrow(MUST_NOT_CALL); return maxFloatValue_; } - numLevels_++; - levels_[numLevels_] = newTotalCap; // initialize the new "extra" index at the top - } - - private void sortLevelZero() { - if (!isLevelZeroSorted_) { - Arrays.sort(items_, levels_[0], levels_[1]); - isLevelZeroSorted_ = true; - } - } - - private void mergeHigherLevels(final KllFloatsSketch other, final long finalN) { - final int tmpSpaceNeeded = getNumRetained() + other.getNumRetainedAboveLevelZero(); - final float[] workbuf = new float[tmpSpaceNeeded]; - final int ub = KllHelper.ubOnNumLevels(finalN); - final int[] worklevels = new int[ub + 2]; // ub+1 does not work - final int[] outlevels = new int[ub + 2]; - - final int provisionalNumLevels = max(numLevels_, other.numLevels_); + @Override //Used internally + float getMaxFloatValue() { return maxFloatValue_; } - populateWorkArrays(other, workbuf, worklevels, provisionalNumLevels); + @Override //Dummy + double getMinDoubleValue() { kllSketchThrow(MUST_NOT_CALL); return minFloatValue_; } - // notice that workbuf is being used as both the input and output here - final int[] result = KllFloatsHelper.generalFloatsCompress(k_, m_, provisionalNumLevels, workbuf, - 
worklevels, workbuf, outlevels, isLevelZeroSorted_, random); - final int finalNumLevels = result[0]; - final int finalCapacity = result[1]; - final int finalPop = result[2]; + @Override //Used internally + float getMinFloatValue() { return minFloatValue_; } - assert finalNumLevels <= ub; // ub can sometimes be much bigger + @Override //Dummy + void setDoubleItemsArray(final double[] doubleItems) { kllSketchThrow(MUST_NOT_CALL); } - // now we need to transfer the results back into the "self" sketch - final float[] newbuf = finalCapacity == items_.length ? items_ : new float[finalCapacity]; - final int freeSpaceAtBottom = finalCapacity - finalPop; - System.arraycopy(workbuf, outlevels[0], newbuf, freeSpaceAtBottom, finalPop); - final int theShift = freeSpaceAtBottom - outlevels[0]; + @Override //Dummy + void setDoubleItemsArrayAt(final int index, final double value) { kllSketchThrow(MUST_NOT_CALL); } - if (levels_.length < finalNumLevels + 1) { - levels_ = new int[finalNumLevels + 1]; - } + @Override //Used internally + void setFloatItemsArray(final float[] floatItems) { floatItems_ = floatItems; } - for (int lvl = 0; lvl < finalNumLevels + 1; lvl++) { // includes the "extra" index - levels_[lvl] = outlevels[lvl] + theShift; - } + @Override //Used internally + void setFloatItemsArrayAt(final int index, final float value) { floatItems_[index] = value; } - items_ = newbuf; - numLevels_ = finalNumLevels; - } + @Override //Dummy + void setMaxDoubleValue(final double value) { kllSketchThrow(MUST_NOT_CALL); } - private void populateWorkArrays(final KllFloatsSketch other, final float[] workbuf, - final int[] worklevels, final int provisionalNumLevels) { - worklevels[0] = 0; - - // Note: the level zero data from "other" was already inserted into "self" - final int selfPopZero = currentLevelSize(0); - System.arraycopy(items_, levels_[0], workbuf, worklevels[0], selfPopZero); - worklevels[1] = worklevels[0] + selfPopZero; - - for (int lvl = 1; lvl < provisionalNumLevels; 
lvl++) { - final int selfPop = currentLevelSize(lvl); - final int otherPop = other.currentLevelSize(lvl); - worklevels[lvl + 1] = worklevels[lvl] + selfPop + otherPop; - - if (selfPop > 0 && otherPop == 0) { - System.arraycopy(items_, levels_[lvl], workbuf, worklevels[lvl], selfPop); - } else if (selfPop == 0 && otherPop > 0) { - System.arraycopy(other.items_, other.levels_[lvl], workbuf, worklevels[lvl], otherPop); - } else if (selfPop > 0 && otherPop > 0) { - KllFloatsHelper.mergeSortedFloatArrays(items_, levels_[lvl], selfPop, other.items_, - other.levels_[lvl], otherPop, workbuf, worklevels[lvl]); - } - } - } + @Override //Used internally + void setMaxFloatValue(final float value) { maxFloatValue_ = value; } - private static int getSerializedSizeBytes(final int numLevels, final int numRetained) { - if (numLevels == 1 && numRetained == 1) { - return DATA_START_SINGLE_ITEM + Float.BYTES; - } - // the last integer in levels_ is not serialized because it can be derived - // + 2 for min and max - return DATA_START_FLOAT + numLevels * Integer.BYTES + (numRetained + 2) * Float.BYTES; - } + @Override //Dummy + void setMinDoubleValue(final double value) { kllSketchThrow(MUST_NOT_CALL); } - // for testing - - float[] getItems() { - return items_; - } + @Override //Used internally + void setMinFloatValue(final float value) { minFloatValue_ = value; } } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java index 2642bd210..c2c7e4508 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketchIterator.java @@ -39,6 +39,26 @@ public class KllFloatsSketchIterator { isInitialized_ = false; } + /** + * Gets a value from the current entry in the sketch. + * Don't call this before calling next() for the first time + * or after getting false from next(). 
+ * @return value from the current entry + */ + public float getValue() { + return items_[i_]; + } + + /** + * Gets a weight for the value from the current entry in the sketch. + * Don't call this before calling next() for the first time + * or after getting false from next(). + * @return weight for the value from the current entry + */ + public long getWeight() { + return weight_; + } + /** * Advancing the iterator and checking existence of the next entry * is combined here for efficiency. This results in an undefined @@ -69,24 +89,4 @@ public boolean next() { return true; } - /** - * Gets a value from the current entry in the sketch. - * Don't call this before calling next() for the first time - * or after getting false from next(). - * @return value from the current entry - */ - public float getValue() { - return items_[i_]; - } - - /** - * Gets a weight for the value from the current entry in the sketch. - * Don't call this before calling next() for the first time - * or after getting false from next(). - * @return weight for the value from the current entry - */ - public long getWeight() { - return weight_; - } - } diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapSketch.java new file mode 100644 index 000000000..f50cc2132 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllHeapSketch.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class implements all the methods for the heap sketches that are independent + * of the sketch type (float or double). + * + * @author lrhodes + */ +abstract class KllHeapSketch extends KllSketch { + private final int k; // configured value of K. + private final int m; // configured value of M. + private long n_; // number of items input into this sketch. + private int minK_; // dynamic minK for error estimation after merging with different k. + private int numLevels_; // one-based number of current levels. + private int[] levels_; // array of index offsets into the items[]. Size = numLevels + 1. + private boolean isLevelZeroSorted_; + + /** + * Heap constructor. + * @param k user configured size of sketch. 
Range [m, 2^16] + * @param m user configured minimum level width + * @param sketchType either DOUBLE_SKETCH or FLOAT_SKETCH + */ + KllHeapSketch(final int k, final int m, final SketchType sketchType) { + super(sketchType, null, null); + KllHelper.checkM(m); + KllHelper.checkK(k, m); + this.k = k; + this.m = m; + n_ = 0; + minK_ = k; + numLevels_ = 1; + levels_ = new int[] {k, k}; + isLevelZeroSorted_ = false; + } + + @Override + public int getK() { + return k; + } + + @Override + public long getN() { + return n_; + } + + @Override + int[] getLevelsArray() { + return levels_; + } + + @Override + int getLevelsArrayAt(final int index) { return levels_[index]; } + + @Override + int getM() { + return m; + } + + @Override + int getMinK() { + return minK_; + } + + @Override + int getNumLevels() { + return numLevels_; + } + + @Override + void incN() { + n_++; + } + + @Override + void incNumLevels() { + numLevels_++; + } + + @Override + boolean isLevelZeroSorted() { + return isLevelZeroSorted_; + } + + @Override + void setItemsArrayUpdatable(final WritableMemory itemsMem) { } //dummy + + @Override + void setLevelsArray(final int[] levelsArr) { + levels_ = levelsArr; + } + + @Override + void setLevelsArrayAt(final int index, final int value) { levels_[index] = value; } + + @Override + void setLevelsArrayAtMinusEq(final int index, final int minusEq) { + levels_[index] -= minusEq; + } + + @Override + void setLevelsArrayAtPlusEq(final int index, final int plusEq) { + levels_[index] += plusEq; + } + + @Override + void setLevelsArrayUpdatable(final WritableMemory levelsMem) { } //dummy + + @Override + void setLevelZeroSorted(final boolean sorted) { + this.isLevelZeroSorted_ = sorted; + } + + @Override + void setMinK(final int minK) { + minK_ = minK; + } + + @Override + void setMinMaxArrayUpdatable(final WritableMemory minMaxMem) { } //dummy + + @Override + void setN(final long n) { + n_ = n; + } + + @Override + void setNumLevels(final int numLevels) { + numLevels_ = numLevels; + 
} + +} diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java index d59dfc606..93ad2ce82 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -19,32 +19,322 @@ package org.apache.datasketches.kll; +import static java.lang.Math.abs; +import static java.lang.Math.ceil; +import static java.lang.Math.exp; +import static java.lang.Math.log; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.Math.pow; +import static java.lang.Math.round; import static org.apache.datasketches.Util.floorPowerOf2; +import static org.apache.datasketches.Util.isOdd; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR_SINGLE_ITEM; +import static org.apache.datasketches.kll.KllPreambleUtil.PREAMBLE_INTS_EMPTY_SINGLE; +import static org.apache.datasketches.kll.KllPreambleUtil.PREAMBLE_INTS_FULL; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_EMPTY_FULL; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_SINGLE; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_UPDATABLE; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryDoubleSketchFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryEmptyFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryFamilyID; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryLevelZeroSortedFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryM; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryMinK; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryN; +import static 
org.apache.datasketches.kll.KllPreambleUtil.setMemoryNumLevels; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryPreInts; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemorySerVer; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemorySingleItemFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.setMemoryUpdatableFlag; +import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; -class KllHelper { +import java.util.Arrays; + +import org.apache.datasketches.Family; +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.Util; +import org.apache.datasketches.kll.KllSketch.SketchType; +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class provides some useful sketch analysis tools that are used internally. + * + * @author lrhodes + * + */ +final class KllHelper { + + static class GrowthStats { + SketchType sketchType; + int k; + int m; + long givenN; + long maxN; + int numLevels; + int maxItems; + int compactBytes; + int updatableBytes; + } + + static class LevelStats { + long n; + int numLevels; + int items; + + LevelStats(final long n, final int numLevels, final int items) { + this.n = n; + this.numLevels = numLevels; + this.items = items; + } + } + + static final double EPS_DELTA_THRESHOLD = 1E-6; + static final double MIN_EPS = 4.7634E-5; + static final double PMF_COEF = 2.446; + static final double PMF_EXP = 0.9433; + static final double CDF_COEF = 2.296; + static final double CDF_EXP = 0.9723; /** - * Copy the old array into a new larger array. - * The extra space is at the top. - * @param oldArr the given old array with data - * @param newLen the new length larger than the oldArr.length. 
- * @return the new array + * This is the exact powers of 3 from 3^0 to 3^30 where the exponent is the index */ - static int[] growIntArray(final int[] oldArr, final int newLen) { - final int oldLen = oldArr.length; - assert newLen > oldLen; - final int[] newArr = new int[newLen]; - System.arraycopy(oldArr, 0, newArr, 0, oldLen); - return newArr; + private static long[] powersOfThree = + new long[] {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, + 1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467, + 3486784401L, 10460353203L, 31381059609L, 94143178827L, 282429536481L, + 847288609443L, 2541865828329L, 7625597484987L, 22876792454961L, 68630377364883L, + 205891132094649L}; + + static void buildHeapKllSketchFromMemory(final KllSketch mine, final KllMemoryValidate memVal) { + final boolean doubleType = (mine.sketchType == DOUBLES_SKETCH); + mine.setLevelZeroSorted(memVal.level0Sorted); + mine.setN(memVal.n); + mine.setMinK(memVal.minK); + mine.setNumLevels(memVal.numLevels); + final int[] myLevelsArr = new int[mine.getNumLevels() + 1]; + + if (memVal.updatableMemory) { + memVal.levelsArrUpdatable.getIntArray(0, myLevelsArr, 0, mine.getNumLevels() + 1); + mine.setLevelsArray(myLevelsArr); + if (doubleType) { + mine.setMinDoubleValue(memVal.minMaxArrUpdatable.getDouble(0)); + mine.setMaxDoubleValue(memVal.minMaxArrUpdatable.getDouble(Double.BYTES)); + final int itemsCap = (int)memVal.itemsArrUpdatable.getCapacity() / Double.BYTES; + final double[] myItemsArr = new double[itemsCap]; + memVal.itemsArrUpdatable.getDoubleArray(0, myItemsArr, 0, itemsCap); + mine.setDoubleItemsArray(myItemsArr); + } else { //float + mine.setMinFloatValue(memVal.minMaxArrUpdatable.getFloat(0)); + mine.setMaxFloatValue(memVal.minMaxArrUpdatable.getFloat(Float.BYTES)); + final int itemsCap = (int)memVal.itemsArrUpdatable.getCapacity() / Float.BYTES; + final float[] myItemsArr = new float[itemsCap]; + memVal.itemsArrUpdatable.getFloatArray(0, myItemsArr, 
0, itemsCap); + mine.setFloatItemsArray(myItemsArr); + } + } else { //compact + memVal.levelsArrCompact.getIntArray(0, myLevelsArr, 0, mine.getNumLevels() + 1); + mine.setLevelsArray(myLevelsArr); + if (doubleType) { + mine.setMinDoubleValue(memVal.minMaxArrCompact.getDouble(0)); + mine.setMaxDoubleValue(memVal.minMaxArrCompact.getDouble(Double.BYTES)); + final int itemsCap = (int)memVal.itemsArrCompact.getCapacity() / Double.BYTES; + final double[] myItemsArr = new double[itemsCap]; + memVal.itemsArrCompact.getDoubleArray(0, myItemsArr, 0, itemsCap); + mine.setDoubleItemsArray(myItemsArr); + } else { //float + mine.setMinFloatValue(memVal.minMaxArrCompact.getFloat(0)); + mine.setMaxFloatValue(memVal.minMaxArrCompact.getFloat(Float.BYTES)); + final int itemsCap = (int)memVal.itemsArrCompact.getCapacity() / Float.BYTES; + final float[] myItemsArr = new float[itemsCap]; + memVal.itemsArrCompact.getFloatArray(0, myItemsArr, 0, itemsCap); + mine.setFloatItemsArray(myItemsArr); + } + } } /** - * Returns the upper bound of the number of levels based on n. - * @param n the length of the stream - * @return floor( log_2(n) ) + * Checks the validity of the given value k + * @param k must be greater than 7 and less than 65536. */ - static int ubOnNumLevels(final long n) { - return 1 + Long.numberOfTrailingZeros(floorPowerOf2(n)); + static void checkK(final int k, final int m) { + if (k < m || k > KllSketch.MAX_K) { + throw new SketchesArgumentException( + "K must be >= " + m + " and <= " + KllSketch.MAX_K + ": " + k); + } + } + + static void checkM(final int m) { + if (m < KllSketch.MIN_M || m > KllSketch.MAX_M || ((m & 1) == 1)) { + throw new SketchesArgumentException( + "M must be >= 2, <= 8 and even: " + m); + } + } + + /** + * The following code is only valid in the special case of exactly reaching capacity while updating. + * It cannot be used while merging, while reducing k, or anything else. 
+ * @param mine the current sketch + */ + static void compressWhileUpdatingSketch(final KllSketch mine) { + final int level = + findLevelToCompact(mine.getK(), mine.getM(), mine.getNumLevels(), mine.getLevelsArray()); + if (level == mine.getNumLevels() - 1) { + //The level to compact is the top level, thus we need to add a level. + //Be aware that this operation grows the items array, + //shifts the items data and the level boundaries of the data, + //and grows the levels array and increments numLevels_. + KllHelper.addEmptyTopLevelToCompletelyFullSketch(mine); + } + + final int[] myLevelsArr = mine.getLevelsArray(); + final int rawBeg = myLevelsArr[level]; + final int rawEnd = myLevelsArr[level + 1]; + // +2 is OK because we already added a new top level if necessary + final int popAbove = myLevelsArr[level + 2] - rawEnd; + final int rawPop = rawEnd - rawBeg; + final boolean oddPop = isOdd(rawPop); + final int adjBeg = oddPop ? rawBeg + 1 : rawBeg; + final int adjPop = oddPop ? rawPop - 1 : rawPop; + final int halfAdjPop = adjPop / 2; + + // level zero might not be sorted, so we must sort it if we wish to compact it + float[] myFloatItemsArr; + double[] myDoubleItemsArr; + + if (mine.sketchType == DOUBLES_SKETCH) { + myFloatItemsArr = null; + myDoubleItemsArr = mine.getDoubleItemsArray(); + if (level == 0) { + if (mine.updatablMemory) { + myDoubleItemsArr = mine.getDoubleItemsArray(); + Arrays.sort(myDoubleItemsArr, adjBeg, adjBeg + adjPop); + mine.setDoubleItemsArray(myDoubleItemsArr); + } else { + Arrays.sort(mine.getDoubleItemsArray(), adjBeg, adjBeg + adjPop); + } + } + if (popAbove == 0) { + if (mine.updatablMemory) { + myDoubleItemsArr = mine.getDoubleItemsArray(); + KllDoublesHelper.randomlyHalveUpDoubles(myDoubleItemsArr, adjBeg, adjPop, KllSketch.random); + mine.setDoubleItemsArray(myDoubleItemsArr); + } else { + KllDoublesHelper.randomlyHalveUpDoubles(mine.getDoubleItemsArray(), adjBeg, adjPop, KllSketch.random); + } + } else { + if (mine.updatablMemory) 
{ + myDoubleItemsArr = mine.getDoubleItemsArray(); + KllDoublesHelper.randomlyHalveDownDoubles(myDoubleItemsArr, adjBeg, adjPop, KllSketch.random); + mine.setDoubleItemsArray(myDoubleItemsArr); + } else { + KllDoublesHelper.randomlyHalveDownDoubles(mine.getDoubleItemsArray(), adjBeg, adjPop, KllSketch.random); + } + if (mine.updatablMemory ) { + myDoubleItemsArr = mine.getDoubleItemsArray(); + KllDoublesHelper.mergeSortedDoubleArrays( + myDoubleItemsArr, adjBeg, halfAdjPop, + myDoubleItemsArr, rawEnd, popAbove, + myDoubleItemsArr, adjBeg + halfAdjPop); + mine.setDoubleItemsArray(myDoubleItemsArr); + } else { + myDoubleItemsArr = mine.getDoubleItemsArray(); + KllDoublesHelper.mergeSortedDoubleArrays( + myDoubleItemsArr, adjBeg, halfAdjPop, + myDoubleItemsArr, rawEnd, popAbove, + myDoubleItemsArr, adjBeg + halfAdjPop); + } + } + } else { //Float sketch + myFloatItemsArr = mine.getFloatItemsArray(); + myDoubleItemsArr = null; + if (level == 0) { + if (mine.updatablMemory) { + myFloatItemsArr = mine.getFloatItemsArray(); + Arrays.sort(myFloatItemsArr, adjBeg, adjBeg + adjPop); + mine.setFloatItemsArray(myFloatItemsArr); + } else { + Arrays.sort(mine.getFloatItemsArray(), adjBeg, adjBeg + adjPop); + } + } + if (popAbove == 0) { + if (mine.updatablMemory) { + myFloatItemsArr = mine.getFloatItemsArray(); + KllFloatsHelper.randomlyHalveUpFloats(myFloatItemsArr, adjBeg, adjPop, KllSketch.random); + mine.setFloatItemsArray(myFloatItemsArr); + } else { + KllFloatsHelper.randomlyHalveUpFloats(mine.getFloatItemsArray(), adjBeg, adjPop, KllSketch.random); + } + } else { + if (mine.updatablMemory) { + myFloatItemsArr = mine.getFloatItemsArray(); + KllFloatsHelper.randomlyHalveDownFloats(myFloatItemsArr, adjBeg, adjPop, KllSketch.random); + mine.setFloatItemsArray(myFloatItemsArr); + } else { + KllFloatsHelper.randomlyHalveDownFloats(mine.getFloatItemsArray(), adjBeg, adjPop, KllSketch.random); + } + if (mine.updatablMemory ) { + myFloatItemsArr = mine.getFloatItemsArray(); + 
KllFloatsHelper.mergeSortedFloatArrays( + myFloatItemsArr, adjBeg, halfAdjPop, + myFloatItemsArr, rawEnd, popAbove, + myFloatItemsArr, adjBeg + halfAdjPop); + mine.setFloatItemsArray(myFloatItemsArr); + } else { + myFloatItemsArr = mine.getFloatItemsArray(); + KllFloatsHelper.mergeSortedFloatArrays( + myFloatItemsArr, adjBeg, halfAdjPop, + myFloatItemsArr, rawEnd, popAbove, + myFloatItemsArr, adjBeg + halfAdjPop); + } + } + } + mine.setLevelsArrayAtMinusEq(level + 1, halfAdjPop); // adjust boundaries of the level above + + if (oddPop) { + mine.setLevelsArrayAt(level, mine.getLevelsArrayAt(level + 1) - 1); // the current level now contains one item + if (mine.sketchType == DOUBLES_SKETCH) { + mine.setDoubleItemsArrayAt( + mine.getLevelsArrayAt(level), mine.getDoubleItemsArrayAt(rawBeg)); // namely this leftover guy + } else { + mine.setFloatItemsArrayAt( + mine.getLevelsArrayAt(level), mine.getFloatItemsArrayAt(rawBeg)); // namely this leftover guy + } + + } else { + mine.setLevelsArrayAt(level, mine.getLevelsArrayAt(level + 1)); // the current level is now empty + } + + // verify that we freed up halfAdjPop array slots just below the current level + assert mine.getLevelsArrayAt(level) == rawBeg + halfAdjPop; + + // finally, we need to shift up the data in the levels below + // so that the freed-up space can be used by level zero + if (level > 0) { + final int amount = rawBeg - mine.getLevelsArrayAt(0); + if (mine.sketchType == DOUBLES_SKETCH) { + if (mine.updatablMemory) { + myDoubleItemsArr = mine.getDoubleItemsArray(); + System.arraycopy(myDoubleItemsArr, myLevelsArr[0], myDoubleItemsArr, myLevelsArr[0] + halfAdjPop, amount); + mine.setDoubleItemsArray(myDoubleItemsArr); + } else { + System.arraycopy(myDoubleItemsArr, myLevelsArr[0], myDoubleItemsArr, myLevelsArr[0] + halfAdjPop, amount); + } + } else { + if (mine.updatablMemory) { + myFloatItemsArr = mine.getFloatItemsArray(); + System.arraycopy(myFloatItemsArr, myLevelsArr[0], myFloatItemsArr, myLevelsArr[0] + 
halfAdjPop, amount); + mine.setFloatItemsArray(myFloatItemsArr); + } else { + System.arraycopy(myFloatItemsArr, myLevelsArr[0], myFloatItemsArr, myLevelsArr[0] + halfAdjPop, amount); + } + } + for (int lvl = 0; lvl < level; lvl++) { + mine.setLevelsArrayAtPlusEq(lvl, halfAdjPop); + } + } } /** @@ -64,6 +354,153 @@ static int computeTotalItemCapacity(final int k, final int m, final int numLevel return (int) total; } + static int currentLevelSize(final int level, final int numLevels, final int[] levels) { + if (level >= numLevels) { return 0; } + return levels[level + 1] - levels[level]; + } + + /** + * Given k, m, and numLevels, this computes and optionally prints the structure of the sketch when the given + * number of levels are completely filled. + * @param k the given user configured sketch parameter + * @param m the given user configured sketch parameter + * @param numLevels the given number of levels of the sketch + * @param printSketchStructure if true will print the details of the sketch structure at the given numLevels. + * @return LevelStats with the final summary of the sketch's cumulative N, + * and cumulative items at the given numLevels. 
+ */ + static LevelStats getFinalSketchStatsAtNumLevels( + final int k, + final int m, + final int numLevels, + final boolean printSketchStructure) { + int cumItems = 0; + long cumN = 0; + if (printSketchStructure) { + println("SKETCH STRUCTURE:"); + println("Given K : " + k); + println("Given M : " + m); + println("Given NumLevels: " + numLevels); + printf("%6s %8s %12s %18s %18s\n", "Level", "Items", "CumItems", "N at Level", "CumN"); + } + for (int level = 0; level < numLevels; level++) { + final LevelStats lvlStats = getLevelCapacityItems(k, m, numLevels, level); + cumItems += lvlStats.items; + cumN += lvlStats.n; + if (printSketchStructure) { + printf("%6d %,8d %,12d %,18d %,18d\n", level, lvlStats.items, cumItems, lvlStats.n, cumN); + } + } + return new LevelStats(cumN, numLevels, cumItems); + } + + /** + * Given k, m, n, and the sketch type, this computes (and optionally prints) the growth scheme for a sketch as it + * grows large enough to accommodate a stream length of n items. + * @param k the given user configured sketch parameter + * @param m the given user configured sketch parameter + * @param n the desired stream length + * @param sketchType the given sketch type (DOUBLES_SKETCH or FLOATS_SKETCH) + * @param printGrowthScheme if true the entire growth scheme of the sketch will be printed. 
+ * @return GrowthStats with the final values of the growth scheme + */ + static GrowthStats getGrowthSchemeForGivenN( + final int k, + final int m, + final long n, + final SketchType sketchType, + final boolean printGrowthScheme) { + int numLevels = 0; + LevelStats lvlStats; + final GrowthStats gStats = new GrowthStats(); + gStats.k = k; + gStats.m = m; + gStats.givenN = n; + gStats.sketchType = sketchType; + if (printGrowthScheme) { + println("GROWTH SCHEME:"); + println("Given SketchType: " + sketchType.toString()); + println("Given K : " + k); + println("Given M : " + m); + println("Given N : " + n); + printf("%10s %10s %20s %13s %15s\n", "NumLevels", "MaxItems", "MaxN", "CompactBytes", "UpdatableBytes"); + } + int compactBytes; + int updatableBytes; + final int typeBytes = (sketchType == DOUBLES_SKETCH) ? Double.BYTES : Float.BYTES; + do { + numLevels++; + lvlStats = getFinalSketchStatsAtNumLevels(k, m, numLevels, false); + final int maxItems = lvlStats.items; + final long maxN = lvlStats.n; + compactBytes = maxItems * typeBytes + numLevels * Integer.BYTES + 2 * typeBytes + DATA_START_ADR; + updatableBytes = compactBytes + Integer.BYTES; + if (printGrowthScheme) { + printf("%10d %,10d %,20d %,13d %,15d\n", numLevels, maxItems, maxN, compactBytes, updatableBytes); + } + } while (lvlStats.n < n); + gStats.maxN = lvlStats.n; + gStats.numLevels = lvlStats.numLevels; + gStats.maxItems = lvlStats.items; + gStats.compactBytes = compactBytes; + gStats.updatableBytes = updatableBytes; + return gStats; + } + + // constants were derived as the best fit to 99 percentile empirically measured max error in + // thousands of trials + static int getKFromEpsilon(final double epsilon, final boolean pmf) { + //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false. + final double eps = max(epsilon, MIN_EPS); + final double kdbl = pmf + ? 
exp(log(PMF_COEF / eps) / PMF_EXP) + : exp(log(CDF_COEF / eps) / CDF_EXP); + final double krnd = round(kdbl); + final double del = abs(krnd - kdbl); + final int k = (int) (del < EPS_DELTA_THRESHOLD ? krnd : ceil(kdbl)); + return max(KllSketch.MIN_M, min(KllSketch.MAX_K, k)); + } + + /** + * Given k, m, numLevels, this computes the item capacity of a single level. + * @param k the given user sketch configuration parameter + * @param m the given user sketch configuration parameter + * @param numLevels the given number of levels of the sketch + * @param level the specific level to compute its item capacity + * @return LevelStats with the computed N and items for the given level. + */ + static LevelStats getLevelCapacityItems( + final int k, + final int m, + final int numLevels, + final int level) { + final int items = KllHelper.levelCapacity(k, numLevels, level, m); + final long n = (long)items << level; + return new LevelStats(n, numLevels, items); + } + + /** + * Gets the normalized rank error given k and pmf. + * Static method version of the getNormalizedRankError(boolean). + * @param k the configuration parameter + * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function. + * Otherwise, it is the "single-sided" normalized rank error for all the other queries. + * @return if pmf is true, the normalized rank error for the getPMF() function. + * Otherwise, it is the "single-sided" normalized rank error for all the other queries. + * @see KllDoublesSketch + */ + // constants were derived as the best fit to 99 percentile empirically measured max error in + // thousands of trials + static double getNormalizedRankError(final int k, final boolean pmf) { + return pmf + ? PMF_COEF / pow(k, PMF_EXP) + : CDF_COEF / pow(k, CDF_EXP); + } + + static int getNumRetainedAboveLevelZero(final int numLevels, final int[] levels) { + return levels[numLevels] - levels[1]; + } + /** * Returns the item capacity of a specific level. 
* @param k the accuracy parameter of the sketch. Because of the Java limits on array sizes, @@ -82,6 +519,402 @@ static int levelCapacity(final int k, final int numLevels, final int level, fina return (int) Math.max(m, intCapAux(k, depth)); } + /** + * This method is for direct Double and Float sketches only and does the following: + * + * The caller is responsible for filling these regions and updating the preamble. + * @param sketch The current sketch that needs to be expanded. + * @param newLevelsArrLen the element length of the new Levels array. + * @param newItemsArrLen the element length of the new Items array. + * @return the new expanded memory with preamble. + */ + static WritableMemory memorySpaceMgmt( + final KllSketch sketch, + final int newLevelsArrLen, + final int newItemsArrLen) { + final KllSketch.SketchType sketchType = sketch.sketchType; + final WritableMemory oldWmem = sketch.wmem; + final int startAdr = DATA_START_ADR; + final int typeBytes = (sketchType == DOUBLES_SKETCH) ? 
Double.BYTES : Float.BYTES; + + int requiredSketchBytes = startAdr; + requiredSketchBytes += newLevelsArrLen * Integer.BYTES; + requiredSketchBytes += 2 * typeBytes; + requiredSketchBytes += newItemsArrLen * typeBytes; + final WritableMemory newWmem; + + if (requiredSketchBytes > oldWmem.getCapacity()) { //Acquire new WritableMemory + newWmem = sketch.memReqSvr.request(oldWmem, requiredSketchBytes); + oldWmem.copyTo(0, newWmem, 0, startAdr); //copy preamble + } + else { //Expand or contract in current memory + newWmem = oldWmem; + } + + int offset = startAdr; + //LEVELS ARR + int lengthBytes = newLevelsArrLen * Integer.BYTES; + sketch.setLevelsArrayUpdatable(newWmem.writableRegion(offset, lengthBytes)); // + offset += lengthBytes; + //MIN MAX ARR + lengthBytes = 2 * typeBytes; + sketch.setMinMaxArrayUpdatable(newWmem.writableRegion(offset, lengthBytes)); + offset += lengthBytes; + //ITEMS ARR + lengthBytes = newItemsArrLen * typeBytes; + sketch.setItemsArrayUpdatable(newWmem.writableRegion(offset, lengthBytes)); + assert requiredSketchBytes <= newWmem.getCapacity(); + return newWmem; + } + + static String outputData(final boolean doubleType, final int numLevels, final int[] levelsArr, + final float[] floatItemsArr, final double[] doubleItemsArr) { + final StringBuilder sb = new StringBuilder(); + sb.append("### KLL items data {index, item}:").append(Util.LS); + if (levelsArr[0] > 0) { + sb.append(" Garbage:" + Util.LS); + if (doubleType) { + for (int i = 0; i < levelsArr[0]; i++) { + sb.append(" ").append(i + ", ").append(doubleItemsArr[i]).append(Util.LS); + } + } else { + for (int i = 0; i < levelsArr[0]; i++) { + sb.append(" ").append(i + ", ").append(floatItemsArr[i]).append(Util.LS); + } + } + } + int level = 0; + if (doubleType) { + while (level < numLevels) { + final int fromIndex = levelsArr[level]; + final int toIndex = levelsArr[level + 1]; // exclusive + if (fromIndex < toIndex) { + sb.append(" level[").append(level).append("]: offset: " + 
levelsArr[level] + " wt: " + (1 << level)); + sb.append(Util.LS); + } + + for (int i = fromIndex; i < toIndex; i++) { + sb.append(" ").append(i + ", ").append(doubleItemsArr[i]).append(Util.LS); + } + level++; + } + } + else { + while (level < numLevels) { + final int fromIndex = levelsArr[level]; + final int toIndex = levelsArr[level + 1]; // exclusive + if (fromIndex <= toIndex) { + sb.append(" level[").append(level).append("]: offset: " + levelsArr[level] + " wt: " + (1 << level)); + sb.append(Util.LS); + } + + for (int i = fromIndex; i < toIndex; i++) { + sb.append(" ").append(i + ", ").append(floatItemsArr[i]).append(Util.LS); + } + level++; + } + } + sb.append(" level[" + level + "]: offset: " + levelsArr[level] + " (Exclusive)"); + sb.append(Util.LS); + sb.append("### End items data").append(Util.LS); + + return sb.toString(); + } + + static String outputLevels(final int k, final int m, final int numLevels, final int[] levelsArr) { + final StringBuilder sb = new StringBuilder(); + sb.append("### KLL levels array:").append(Util.LS) + .append(" level, offset: nominal capacity, actual size").append(Util.LS); + int level = 0; + for ( ; level < numLevels; level++) { + sb.append(" ").append(level).append(", ").append(levelsArr[level]).append(": ") + .append(KllHelper.levelCapacity(k, numLevels, level, m)) + .append(", ").append(KllHelper.currentLevelSize(level, numLevels, levelsArr)).append(Util.LS); + } + sb.append(" ").append(level).append(", ").append(levelsArr[level]).append(": (Exclusive)") + .append(Util.LS); + sb.append("### End levels array").append(Util.LS); + return sb.toString(); + } + + static long sumTheSampleWeights(final int num_levels, final int[] levels) { + long total = 0; + long weight = 1; + for (int i = 0; i < num_levels; i++) { + total += weight * (levels[i + 1] - levels[i]); + weight *= 2; + } + return total; + } + + static byte[] toCompactByteArrayImpl(final KllSketch mine) { + final byte[] byteArr = new 
byte[mine.getCurrentCompactSerializedSizeBytes()]; + final WritableMemory wmem = WritableMemory.writableWrap(byteArr); + loadFirst8Bytes(mine, wmem, false); + if (mine.getN() == 0) { return byteArr; } //empty + final boolean doubleType = (mine.sketchType == DOUBLES_SKETCH); + + //load data + int offset = DATA_START_ADR_SINGLE_ITEM; + final int[] myLevelsArr = mine.getLevelsArray(); + if (mine.getN() == 1) { //single item + if (doubleType) { + wmem.putDouble(offset, mine.getDoubleItemsArray()[myLevelsArr[0]]); + } else { + wmem.putFloat(offset, mine.getFloatItemsArray()[myLevelsArr[0]]); + } + } else { // n > 1 + //remainder of preamble after first 8 bytes + setMemoryN(wmem, mine.getN()); + setMemoryMinK(wmem, mine.getMinK()); + setMemoryNumLevels(wmem, mine.getNumLevels()); + offset = DATA_START_ADR; + + //LOAD LEVELS ARR the last integer in levels_ is NOT serialized + final int len = myLevelsArr.length - 1; + wmem.putIntArray(offset, myLevelsArr, 0, len); + offset += len * Integer.BYTES; + + //LOAD MIN, MAX VALUES FOLLOWED BY ITEMS ARRAY + if (doubleType) { + wmem.putDouble(offset,mine. 
getMinDoubleValue()); + offset += Double.BYTES; + wmem.putDouble(offset, mine.getMaxDoubleValue()); + offset += Double.BYTES; + wmem.putDoubleArray(offset, mine.getDoubleItemsArray(), myLevelsArr[0], mine.getNumRetained()); + } else { + wmem.putFloat(offset, mine.getMinFloatValue()); + offset += Float.BYTES; + wmem.putFloat(offset, mine.getMaxFloatValue()); + offset += Float.BYTES; + wmem.putFloatArray(offset, mine.getFloatItemsArray(), myLevelsArr[0], mine.getNumRetained()); + } + } + return byteArr; + } + + @SuppressWarnings("null") + static String toStringImpl(final KllSketch mine, final boolean withLevels, final boolean withData) { + final boolean doubleType = (mine.sketchType == DOUBLES_SKETCH); + final int k = mine.getK(); + final int m = mine.getM(); + final String epsPct = String.format("%.3f%%", mine.getNormalizedRankError(false) * 100); + final String epsPMFPct = String.format("%.3f%%", mine.getNormalizedRankError(true) * 100); + final StringBuilder sb = new StringBuilder(); + final String skType = (mine.updatablMemory ? "Direct" : "") + (doubleType ? "Doubles" : "Floats"); + sb.append(Util.LS).append("### Kll").append(skType).append("Sketch Summary:").append(Util.LS); + sb.append(" K : ").append(k).append(Util.LS); + sb.append(" Dynamic min K : ").append(mine.getMinK()).append(Util.LS); + sb.append(" M : ").append(m).append(Util.LS); + sb.append(" N : ").append(mine.getN()).append(Util.LS); + sb.append(" Epsilon : ").append(epsPct).append(Util.LS); + sb.append(" Epsison PMF : ").append(epsPMFPct).append(Util.LS); + sb.append(" Empty : ").append(mine.isEmpty()).append(Util.LS); + sb.append(" Estimation Mode : ").append(mine.isEstimationMode()).append(Util.LS); + sb.append(" Levels : ").append(mine.getNumLevels()).append(Util.LS); + sb.append(" Level 0 Sorted : ").append(mine.isLevelZeroSorted()).append(Util.LS); + final int cap = (doubleType) ? 
mine.getDoubleItemsArray().length : mine.getFloatItemsArray().length; + sb.append(" Capacity Items : ").append(cap).append(Util.LS); + sb.append(" Retained Items : ").append(mine.getNumRetained()).append(Util.LS); + if (mine.updatablMemory) { + sb.append(" Updatable Storage Bytes: ").append(mine.getCurrentUpdatableSerializedSizeBytes()).append(Util.LS); + } else { + sb.append(" Compact Storage Bytes : ").append(mine.getCurrentCompactSerializedSizeBytes()).append(Util.LS); + } + + if (doubleType) { + sb.append(" Min Value : ").append(mine.getMinDoubleValue()).append(Util.LS); + sb.append(" Max Value : ").append(mine.getMaxDoubleValue()).append(Util.LS); + } else { + sb.append(" Min Value : ").append(mine.getMinFloatValue()).append(Util.LS); + sb.append(" Max Value : ").append(mine.getMaxFloatValue()).append(Util.LS); + } + sb.append("### End sketch summary").append(Util.LS); + + final int myNumLevels = mine.getNumLevels(); + final int[] myLevelsArr = mine.getLevelsArray(); + double[] myDoubleItemsArr = null; + float[] myFloatItemsArr = null; + if (doubleType) { + myDoubleItemsArr = mine.getDoubleItemsArray(); + } else { + myFloatItemsArr = mine.getFloatItemsArray(); + } + if (withLevels) { + sb.append(outputLevels(k, m, myNumLevels, myLevelsArr)); + } + if (withData) { + sb.append(outputData(doubleType, myNumLevels, myLevelsArr, myFloatItemsArr, myDoubleItemsArr)); + } + return sb.toString(); + } + + static byte[] toUpdatableByteArrayImpl(final KllSketch mine) { + final byte[] byteArr = new byte[mine.getCurrentUpdatableSerializedSizeBytes()]; + final WritableMemory wmem = WritableMemory.writableWrap(byteArr); + loadFirst8Bytes(mine, wmem, true); + //remainder of preamble after first 8 bytes + setMemoryN(wmem, mine.getN()); + setMemoryMinK(wmem, mine.getMinK()); + setMemoryNumLevels(wmem, mine.getNumLevels()); + + //load data + final boolean doubleType = (mine.sketchType == DOUBLES_SKETCH); + int offset = DATA_START_ADR; + + //LOAD LEVELS ARRAY the last integer in 
levels_ IS serialized + final int[] myLevelsArr = mine.getLevelsArray(); + final int len = myLevelsArr.length; + wmem.putIntArray(offset, myLevelsArr, 0, len); + offset += len * Integer.BYTES; + + //LOAD MIN, MAX VALUES FOLLOWED BY ITEMS ARRAY + if (doubleType) { + wmem.putDouble(offset, mine.getMinDoubleValue()); + offset += Double.BYTES; + wmem.putDouble(offset, mine.getMaxDoubleValue()); + offset += Double.BYTES; + final double[] doubleItemsArr = mine.getDoubleItemsArray(); + wmem.putDoubleArray(offset, doubleItemsArr, 0, doubleItemsArr.length); + } else { + wmem.putFloat(offset, mine.getMinFloatValue()); + offset += Float.BYTES; + wmem.putFloat(offset,mine.getMaxFloatValue()); + offset += Float.BYTES; + final float[] floatItemsArr = mine.getFloatItemsArray(); + wmem.putFloatArray(offset, floatItemsArr, 0, floatItemsArr.length); + } + return byteArr; + } + + /** + * Returns very conservative upper bound of the number of levels based on n. + * @param n the length of the stream + * @return floor( log_2(n) ) + */ + static int ubOnNumLevels(final long n) { + return 1 + Long.numberOfTrailingZeros(floorPowerOf2(n)); + } + + /** + * This grows the levels arr by 1 (if needed) and increases the capacity of the items array + * at the bottom. Only numLevels, the levels array and the items array are affected. 
   * @param mine the current sketch
   */
  @SuppressWarnings("null")
  private static void addEmptyTopLevelToCompletelyFullSketch(final KllSketch mine) {
    final int[] myCurLevelsArr = mine.getLevelsArray();
    final int myCurNumLevels = mine.getNumLevels();
    // the top boundary of the levels array is taken as the current total item capacity
    final int myCurTotalItemsCapacity = myCurLevelsArr[myCurNumLevels];
    // min/max are read up front and written back at the end, after the arrays have been replaced
    double minDouble = Double.NaN;
    double maxDouble = Double.NaN;
    float minFloat = Float.NaN;
    float maxFloat = Float.NaN;

    // only the pair of arrays matching the sketch type is ever assigned; the other pair stays null
    double[] myCurDoubleItemsArr = null;
    float[] myCurFloatItemsArr = null;

    final int myNewNumLevels;
    final int[] myNewLevelsArr;
    final int myNewTotalItemsCapacity;

    float[] myNewFloatItemsArr = null;
    double[] myNewDoubleItemsArr = null;

    if (mine.sketchType == DOUBLES_SKETCH) {
      minDouble = mine.getMinDoubleValue();
      maxDouble = mine.getMaxDoubleValue();
      myCurDoubleItemsArr = mine.getDoubleItemsArray();
      //assert we are following a certain growth scheme
      assert myCurDoubleItemsArr.length == myCurTotalItemsCapacity;
    } else { //FLOATS_SKETCH
      minFloat = mine.getMinFloatValue();
      maxFloat = mine.getMaxFloatValue();
      myCurFloatItemsArr = mine.getFloatItemsArray();
      assert myCurFloatItemsArr.length == myCurTotalItemsCapacity;
    }
    assert myCurLevelsArr[0] == 0; //definition of full is part of the growth scheme

    // extra room = capacity of level 0 as computed for a sketch with one more level
    final int deltaItemsCap = levelCapacity(mine.getK(), myCurNumLevels + 1, 0, mine.getM());
    myNewTotalItemsCapacity = myCurTotalItemsCapacity + deltaItemsCap;

    // Check if growing the levels arr is required.
    // Note that merging MIGHT over-grow levels_, in which case we might not have to grow it
    final boolean growLevelsArr = myCurLevelsArr.length < myCurNumLevels + 2;

    // GROW LEVELS ARRAY
    if (growLevelsArr) {
      //grow levels arr by one and copy the old data to the new array, extra space at the top.
      myNewLevelsArr = Arrays.copyOf(myCurLevelsArr, myCurNumLevels + 2);
      assert myNewLevelsArr.length == myCurLevelsArr.length + 1;
      myNewNumLevels = myCurNumLevels + 1;
      mine.incNumLevels(); //increment the class member
    } else {
      // NOTE(review): on this path the level count is left unchanged even though the method name says a
      // top level is added -- confirm this is intended when the levels array is already long enough.
      myNewLevelsArr = myCurLevelsArr;
      myNewNumLevels = myCurNumLevels;
    }
    // This loop updates all level indices EXCLUDING the "extra" index at the top
    for (int level = 0; level <= myNewNumLevels - 1; level++) {
      myNewLevelsArr[level] += deltaItemsCap;
    }
    myNewLevelsArr[myNewNumLevels] = myNewTotalItemsCapacity; // initialize the new "extra" index at the top

    // GROW ITEMS ARRAY
    // The old contents land at offset deltaItemsCap, leaving the first deltaItemsCap slots of the new array free.
    if (mine.sketchType == DOUBLES_SKETCH) {
      myNewDoubleItemsArr = new double[myNewTotalItemsCapacity];
      // copy and shift the current data into the new array
      System.arraycopy(myCurDoubleItemsArr, 0, myNewDoubleItemsArr, deltaItemsCap, myCurTotalItemsCapacity);
    } else {
      myNewFloatItemsArr = new float[myNewTotalItemsCapacity];
      // copy and shift the current items data into the new array
      System.arraycopy(myCurFloatItemsArr, 0, myNewFloatItemsArr, deltaItemsCap, myCurTotalItemsCapacity);
    }

    //MEMORY SPACE MANAGEMENT
    // Must run before the setters below: memorySpaceMgmt re-points the sketch's updatable regions.
    if (mine.updatablMemory) {
      mine.wmem = memorySpaceMgmt(mine, myNewLevelsArr.length, myNewTotalItemsCapacity);
    }
    //update our sketch with new expanded spaces
    mine.setNumLevels(myNewNumLevels);
    mine.setLevelsArray(myNewLevelsArr);
    if (mine.sketchType == DOUBLES_SKETCH) {
      mine.setMinDoubleValue(minDouble);
      mine.setMaxDoubleValue(maxDouble);
      mine.setDoubleItemsArray(myNewDoubleItemsArr);
    } else { //Float sketch
      mine.setMinFloatValue(minFloat);
      mine.setMaxFloatValue(maxFloat);
      mine.setFloatItemsArray(myNewFloatItemsArr);
    }
  }

  /**
   * Finds the first level starting with level 0 that exceeds its nominal capacity
   * @param k configured size of sketch. Range [m, 2^16]
   * @param m minimum level size. Default is 8.
   * @param numLevels one-based number of current levels
   * @param levels the levels (offsets) array; levels[level + 1] - levels[level] is used as the population
   * of a level. A level is selected as soon as its population is greater than or equal to its capacity.
   * @return level to compact
   */
  private static int findLevelToCompact(final int k, final int m, final int numLevels, final int[] levels) {
    int level = 0;
    while (true) {
      // Only this assertion bounds the scan: the loop has no other exit than finding a level at capacity.
      assert level < numLevels;
      final int pop = levels[level + 1] - levels[level];
      final int cap = KllHelper.levelCapacity(k, numLevels, level, m);
      if (pop >= cap) {
        return level;
      }
      level++;
    }
  }
 /** * Computes the actual item capacity of a given level given its depth index. * If the depth of levels exceeds 30, this uses a folding technique to accurately compute the @@ -113,25 +946,44 @@ private static long intCapAuxAux(final long k, final int depth) { return result; }

  /**
   * Writes the leading preamble fields of the given sketch into wmem: preamble ints, serial version,
   * family ID, the empty / level-zero-sorted / single-item / double-sketch / updatable flags, k and m.
   * @param sk the sketch being serialized
   * @param wmem the destination memory
   * @param updatable if true the full preamble size and the updatable serial version are always written;
   * otherwise both depend on whether the sketch is empty (n == 0) or holds a single item (n == 1)
   */
  private static void loadFirst8Bytes(final KllSketch sk, final WritableMemory wmem,
      final boolean updatable) {
    final boolean empty = sk.getN() == 0;
    final boolean lvlZeroSorted = sk.isLevelZeroSorted();
    final boolean singleItem = sk.getN() == 1;
    final boolean doubleType = (sk.sketchType == DOUBLES_SKETCH);
    // compact empty and compact single-item sketches use the short preamble; everything else the full one
    final int preInts = updatable
        ? PREAMBLE_INTS_FULL
        : (empty || singleItem) ? PREAMBLE_INTS_EMPTY_SINGLE : PREAMBLE_INTS_FULL;
    //load the preamble
    setMemoryPreInts(wmem, preInts);
    // "server" holds the serial version (serVer) that is written next
    final int server = updatable ? SERIAL_VERSION_UPDATABLE
        : (singleItem ? SERIAL_VERSION_SINGLE : SERIAL_VERSION_EMPTY_FULL);
    setMemorySerVer(wmem, server);
    setMemoryFamilyID(wmem, Family.KLL.getID());
    setMemoryEmptyFlag(wmem, empty);
    setMemoryLevelZeroSortedFlag(wmem, lvlZeroSorted);
    setMemorySingleItemFlag(wmem, singleItem);
    setMemoryDoubleSketchFlag(wmem, doubleType);
    setMemoryUpdatableFlag(wmem, updatable);
    setMemoryK(wmem, sk.getK());
    setMemoryM(wmem, sk.getM());
  }
 /** - * This is the exact powers of 3 from 3^0 to 3^30 where the exponent is the index + * @param fmt format + * @param args arguments */ - private static final long[] powersOfThree = - new long[] {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, - 1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467, - 3486784401L, 10460353203L, 31381059609L, 94143178827L, 282429536481L, - 847288609443L, 2541865828329L, 7625597484987L, 22876792454961L, 68630377364883L, - 205891132094649L}; + private static void printf(final String fmt, final Object ... args) { + System.out.printf(fmt, args); //Disable + } - static long sumTheSampleWeights(final int num_levels, final int[] levels) { - long total = 0; - long weight = 1; - for (int i = 0; i < num_levels; i++) { - total += weight * (levels[i + 1] - levels[i]); - weight *= 2; - } - return total; + /** + * Println Object o + * @param o object to print + */ + private static void println(final Object o) { + System.out.println(o.toString()); } } - diff --git a/src/main/java/org/apache/datasketches/kll/KllMemoryValidate.java b/src/main/java/org/apache/datasketches/kll/KllMemoryValidate.java new file mode 100644 index 000000000..e64a4a1c0 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllMemoryValidate.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.Family.idToFamily; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.SRC_NOT_KLL; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.EMPTYBIT_AND_PREINTS; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.UPDATABLEBIT_AND_SER_VER; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.EMPTYBIT_AND_SER_VER; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.EMPTYBIT_AND_SINGLEBIT; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.SINGLEBIT_AND_SER_VER; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.SINGLEBIT_AND_PREINTS; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.INVALID_PREINTS; +import static org.apache.datasketches.kll.KllMemoryValidate.MemoryInputError.memoryValidateThrow; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR_SINGLE_ITEM; +import static org.apache.datasketches.kll.KllPreambleUtil.PREAMBLE_INTS_EMPTY_SINGLE; +import static org.apache.datasketches.kll.KllPreambleUtil.PREAMBLE_INTS_FULL; +import static 
org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_EMPTY_FULL; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_SINGLE; +import static org.apache.datasketches.kll.KllPreambleUtil.SERIAL_VERSION_UPDATABLE; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryDoubleSketchFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryMinK; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryEmptyFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryFamilyID; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryFlags; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryK; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryLevelZeroSortedFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryM; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryN; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryNumLevels; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryPreInts; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemorySerVer; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemorySingleItemFlag; +import static org.apache.datasketches.kll.KllPreambleUtil.getMemoryUpdatableFlag; + +import org.apache.datasketches.Family; +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.kll.KllPreambleUtil.Layout; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableMemory; + +/** + * This class performs all the error checking of an incoming Memory object and extracts the key fields in the process. + * This is used by all sketches that read or import Memory objects. 
 *
 * @author lrhodes
 *
 */
final class KllMemoryValidate {
  // first 8 bytes
  final int preInts; // = extractPreInts(srcMem);
  final int serVer;
  final int familyID;
  final String famName;
  final int flags;
  // empty and singleItem are not final: updatableMemoryValidate overwrites them from n
  boolean empty;
  boolean singleItem;
  final boolean level0Sorted;
  final boolean doublesSketch;
  final boolean updatableMemory;
  final int k;
  final int m;
  final int memCapacity;

  Layout layout;
  // depending on the layout, the next 8-16 bytes of the preamble, may be filled with assumed values.
  // For example, if the layout is compact & empty, n = 0, if compact and single, n = 1, etc.
  long n;
  // next 4 bytes
  int minK;
  int numLevels;
  // derived
  int capacityItems; //capacity of Items array for exporting and for Updatable form
  int itemsRetained; //actual items retained in Compact form
  int itemsArrStart;
  int sketchBytes;
  // the *Compact fields are only assigned by compactMemoryValidate
  Memory levelsArrCompact; //if sk = empty or single, this is derived
  Memory minMaxArrCompact; //if sk = empty or single, this is derived
  Memory itemsArrCompact; //if sk = empty or single, this is derived
  // the *Updatable fields are only assigned by updatableMemoryValidate
  WritableMemory levelsArrUpdatable;
  WritableMemory minMaxArrUpdatable;
  WritableMemory itemsArrUpdatable;

  /**
   * Reads and checks the leading preamble fields of srcMem, then delegates to
   * updatableMemoryValidate or compactMemoryValidate depending on the updatable flag.
   * @param srcMem the memory image of a serialized sketch
   * @throws SketchesArgumentException (via memoryValidateThrow) if the family ID is not KLL or if the
   * serial version and the updatable flag disagree. KllHelper.checkM and KllHelper.checkK are also called.
   */
  KllMemoryValidate(final Memory srcMem) {
    memCapacity = (int) srcMem.getCapacity();
    preInts = getMemoryPreInts(srcMem);
    serVer = getMemorySerVer(srcMem);

    familyID = getMemoryFamilyID(srcMem);
    if (familyID != Family.KLL.getID()) { memoryValidateThrow(SRC_NOT_KLL, familyID); }
    famName = idToFamily(familyID).toString();
    flags = getMemoryFlags(srcMem);
    empty = getMemoryEmptyFlag(srcMem);
    level0Sorted = getMemoryLevelZeroSortedFlag(srcMem);
    singleItem = getMemorySingleItemFlag(srcMem);
    doublesSketch = getMemoryDoubleSketchFlag(srcMem);
    updatableMemory = getMemoryUpdatableFlag(srcMem);
    k = getMemoryK(srcMem);
    m = getMemoryM(srcMem);
    KllHelper.checkM(m);
    KllHelper.checkK(k, m);
    // the updatable serial version and the updatable flag must either both be present or both be absent
    if ((serVer == SERIAL_VERSION_UPDATABLE) ^ updatableMemory) { memoryValidateThrow(UPDATABLEBIT_AND_SER_VER, 1); }

    // NOTE(review): the cast assumes that memory flagged updatable is actually a WritableMemory -- confirm
    if (updatableMemory) { updatableMemoryValidate((WritableMemory) srcMem); }
    else { compactMemoryValidate(srcMem); }
  }

  /**
   * Validates a compact image and derives n, minK, numLevels, the item counts and the levels, min/max and
   * items Memory objects. Three layouts are handled: full, empty and single-item.
   * @param srcMem the compact memory image
   */
  void compactMemoryValidate(final Memory srcMem) {
    if (empty && singleItem) { memoryValidateThrow(EMPTYBIT_AND_SINGLEBIT, flags); }
    final int typeBytes = doublesSketch ? Double.BYTES : Float.BYTES;
    // 0 = neither flag, 1 = empty, 4 = single item; both flags together was rejected above
    final int sw = (empty ? 1 : 0) | (singleItem ? 4 : 0);
    switch (sw) {
      case 0: { //FULL_COMPACT
        if (preInts != PREAMBLE_INTS_FULL) { memoryValidateThrow(INVALID_PREINTS, preInts); }
        // NOTE(review): the empty flag is not set in this case, yet the "Empty Bit: 1 -> SerVer" message is used
        if (serVer != SERIAL_VERSION_EMPTY_FULL) { memoryValidateThrow(EMPTYBIT_AND_SER_VER, serVer); }
        layout = doublesSketch ? Layout.DOUBLE_FULL_COMPACT : Layout.FLOAT_FULL_COMPACT;
        n = getMemoryN(srcMem);
        minK = getMemoryMinK(srcMem);
        numLevels = getMemoryNumLevels(srcMem);
        int offset = DATA_START_ADR;

        // LEVELS MEM
        final int[] myLevelsArr = new int[numLevels + 1];
        srcMem.getIntArray(offset, myLevelsArr, 0, numLevels); //copies all except the last one
        myLevelsArr[numLevels] = KllHelper.computeTotalItemCapacity(k, m, numLevels); //load the last one
        levelsArrCompact = Memory.wrap(myLevelsArr); //separate from srcMem,
        offset += (int)levelsArrCompact.getCapacity() - Integer.BYTES; // but one larger than srcMem

        minMaxArrCompact = srcMem.region(offset, 2L * typeBytes); // MIN/MAX MEM
        offset += (int)minMaxArrCompact.getCapacity();

        // ITEMS MEM
        itemsArrStart = offset;
        capacityItems = myLevelsArr[numLevels];
        itemsRetained = capacityItems - myLevelsArr[0];
        // the retained items are copied to the top of a full-capacity array, starting at index levels[0]
        if (doublesSketch) {
          final double[] myItemsArr = new double[capacityItems];
          srcMem.getDoubleArray(offset, myItemsArr, myLevelsArr[0], itemsRetained);
          itemsArrCompact = Memory.wrap(myItemsArr);
        } else {
          final float[] myItemsArr = new float[capacityItems];
          srcMem.getFloatArray(offset, myItemsArr, myLevelsArr[0], itemsRetained);
          itemsArrCompact = Memory.wrap(myItemsArr);
        }
        sketchBytes = offset + itemsRetained * typeBytes;
        break;
      }
      case 1: { //EMPTY_COMPACT
        if (preInts != PREAMBLE_INTS_EMPTY_SINGLE) { memoryValidateThrow(EMPTYBIT_AND_PREINTS, preInts); }
        if (serVer != SERIAL_VERSION_EMPTY_FULL) { memoryValidateThrow(EMPTYBIT_AND_SER_VER, serVer); }
        layout = doublesSketch ? Layout.DOUBLE_EMPTY_COMPACT : Layout.FLOAT_EMPTY_COMPACT;
        n = 0; //assumed
        minK = k; //assumed
        numLevels = 1; //assumed
        capacityItems = k;
        itemsRetained = 0;

        // nothing is read from srcMem here: one empty level of capacity k, NaN min/max, k zeroed item slots
        levelsArrCompact = Memory.wrap(new int[] {k, k}); // LEVELS MEM
        if (doublesSketch) {
          minMaxArrCompact = Memory.wrap(new double[] {Double.NaN, Double.NaN}); // MIN/MAX MEM
          itemsArrCompact = Memory.wrap(new double[k]); // ITEMS MEM
        } else { //Floats Sketch
          minMaxArrCompact = Memory.wrap(new float[] {Float.NaN, Float.NaN}); // MIN/MAX MEM
          itemsArrCompact = Memory.wrap(new float[k]); // ITEMS MEM
        }
        sketchBytes = DATA_START_ADR_SINGLE_ITEM; //used for empty and single item
        itemsArrStart = DATA_START_ADR_SINGLE_ITEM;
        break;
      }
      case 4: { //SINGLE_COMPACT
        if (preInts != PREAMBLE_INTS_EMPTY_SINGLE) { memoryValidateThrow(SINGLEBIT_AND_PREINTS, preInts); }
        if (serVer != SERIAL_VERSION_SINGLE) { memoryValidateThrow(SINGLEBIT_AND_SER_VER, serVer); }
        layout = doublesSketch ? Layout.DOUBLE_SINGLE_COMPACT : Layout.FLOAT_SINGLE_COMPACT;
        n = 1;
        minK = k;
        numLevels = 1;
        capacityItems = k;
        itemsRetained = 1;

        // the single stored item serves as min, max and the last (index k - 1) slot of the items array
        levelsArrCompact = Memory.wrap(new int[] {k - 1, k}); // LEVELS MEM
        if (doublesSketch) {
          final double minMax = srcMem.getDouble(DATA_START_ADR_SINGLE_ITEM);
          minMaxArrCompact = Memory.wrap(new double[] {minMax, minMax}); // MIN/MAX MEM
          final double[] myDoubleItems = new double[k]; // ITEMS MEM
          myDoubleItems[k - 1] = minMax;
          itemsArrCompact = Memory.wrap(myDoubleItems);
        } else {
          final float minMax = srcMem.getFloat(DATA_START_ADR_SINGLE_ITEM);
          minMaxArrCompact = Memory.wrap(new float[] {minMax, minMax}); // MIN/MAX MEM
          final float[] myFloatItems = new float[k]; // ITEMS MEM
          myFloatItems[k - 1] = minMax;
          itemsArrCompact = Memory.wrap(myFloatItems);
        }
        sketchBytes = DATA_START_ADR_SINGLE_ITEM + typeBytes;
        itemsArrStart = DATA_START_ADR_SINGLE_ITEM;
        break;
      }
      default: //can not happen
    }
  }

  /**
   * Validates an updatable image and maps the levels, min/max and items arrays as writable regions of
   * wSrcMem itself (no copies are made).
   * @param wSrcMem the updatable memory image
   */
  void updatableMemoryValidate(final WritableMemory wSrcMem) {
    final int typeBytes = doublesSketch ? Double.BYTES : Float.BYTES;
    if (preInts != PREAMBLE_INTS_FULL) { memoryValidateThrow(INVALID_PREINTS, preInts); }
    layout = doublesSketch ? Layout.DOUBLE_UPDATABLE : Layout.FLOAT_UPDATABLE;

    n = getMemoryN(wSrcMem);
    empty = n == 0; //empty & singleItem are set for convenience
    singleItem = n == 1; // there is no error checking on these bits
    minK = getMemoryMinK(wSrcMem);
    numLevels = getMemoryNumLevels(wSrcMem);

    int offset = DATA_START_ADR;

    // unlike the compact form, all numLevels + 1 level offsets are present in the image
    levelsArrUpdatable = wSrcMem.writableRegion(offset, (numLevels + 1L) * Integer.BYTES); //LEVELS
    offset += (int)levelsArrUpdatable.getCapacity();

    minMaxArrUpdatable = wSrcMem.writableRegion(offset, 2L * typeBytes); //MIN/MAX
    offset += (int)minMaxArrUpdatable.getCapacity();

    // the last level offset is used as the item capacity
    capacityItems = levelsArrUpdatable.getInt((long)numLevels * Integer.BYTES); //ITEMS
    final int itemsArrBytes = capacityItems * typeBytes;
    itemsArrStart = offset;
    itemsArrUpdatable = wSrcMem.writableRegion(offset, itemsArrBytes);
    sketchBytes = offset + itemsArrBytes;
  }

  /**
   * The validation failures reported by this class. Each constant carries a message prefix to which the
   * offending value is appended by memoryValidateThrow.
   */
  enum MemoryInputError {
    SRC_NOT_KLL("FamilyID Field must be: " + Family.KLL.getID() + ", NOT: "),
    EMPTYBIT_AND_PREINTS("Empty Bit: 1 -> PreInts: " + PREAMBLE_INTS_EMPTY_SINGLE + ", NOT: "),
    EMPTYBIT_AND_SER_VER("Empty Bit: 1 -> SerVer: " + SERIAL_VERSION_EMPTY_FULL + ", NOT: "),
    SINGLEBIT_AND_SER_VER("Single Item Bit: 1 -> SerVer: " + SERIAL_VERSION_SINGLE + ", NOT: "),
    SINGLEBIT_AND_PREINTS("Single Item Bit: 1 -> PreInts: " + PREAMBLE_INTS_EMPTY_SINGLE + ", NOT: "),
    INVALID_PREINTS("PreInts Must Be: " + PREAMBLE_INTS_FULL + ", NOT: "),
    UPDATABLEBIT_AND_SER_VER("((SerVer == 3) ^ (Updatable Bit)) must = 0, NOT: "),
    EMPTYBIT_AND_SINGLEBIT("Empty flag bit and SingleItem flag bit cannot both be set. Flags: ");

    // message prefix; set once in the constructor
    private String msg;

    private MemoryInputError(final String msg) {
      this.msg = msg;
    }

    private String getMessage() {
      return msg;
    }

    /**
     * Always throws.
     * @param errType selects the message prefix
     * @param value the offending value, appended to the message
     * @throws SketchesArgumentException always
     */
    final static void memoryValidateThrow(final MemoryInputError errType, final int value) {
      throw new SketchesArgumentException(errType.getMessage() + value);
    }

  }

}
 diff --git a/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java b/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java new file mode 100644 index 000000000..0c5f7d273 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllPreambleUtil.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.Util.zeroPad; + +import org.apache.datasketches.Util; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableMemory; + +//@formatter:off + +/** + * This class defines the serialized data structure and provides access methods for the key fields. + * + *

The intent of the design of this class was to isolate the detailed knowledge of the bit and + * byte layout of the serialized form of the sketches derived from the base sketch classes into one place. + * This allows the possibility of the introduction of different serialization + * schemes with minimal impact on the rest of the library.

+ * + *

+ * LAYOUT: The low significance bytes of this long based data structure are on the right. + * The multi-byte primitives are stored in native byte order. + * The single byte fields are treated as unsigned.

+ * + *

An empty sketch requires only 8 bytes, which is only preamble. + * A serialized, non-empty KllDoublesSketch requires at least 16 bytes: 8 bytes of preamble plus one 8-byte item. + * A serialized, non-empty KllFloatsSketch requires at least 12 bytes: 8 bytes of preamble plus one 4-byte item.

+ * + *
{@code
+ * Serialized float sketch layout, more than one item:
+ *  Adr:
+ *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
+ *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
+ *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
+ *  1   ||---------------------------------N_LONG---------------------------------------|
+ *      ||         |       |        |   20   |   19   |    18   |   17   |      16      |
+ *  2   ||<-------Levels Arr Start----------]| unused |NumLevels|------Min K------------|
+ *      ||         |       |        |        |        |         |        |              |
+ *  ?   ||<-------Min/Max Arr Start---------]|[<----------Levels Arr End----------------|
+ *      ||         |       |        |        |        |         |        |              |
+ *  ?   ||<-----Float Items Arr Start-------]|[<---------Min/Max Arr End----------------|
+ *      ||         |       |        |        |        |         |        |              |
+ *  ?   ||         |       |        |        |[<-------Float Items Arr End--------------|
+ *
+ * Serialized float sketch layout, Empty (8 bytes) and Single Item (12 bytes):
+ *  Adr:
+ *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
+ *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
+ *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
+ *  1   ||                                   |-------------Single Item------------------|
+ *
+ *
+ *
+ * Serialized double sketch layout, more than one item:
+ *  Adr:
+ *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
+ *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
+ *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
+ *  1   ||---------------------------------N_LONG---------------------------------------|
+ *      ||   23    |   22  |   21   |   20   |   19   |    18   |   17   |      16      |
+ *  2   ||<-------Levels Arr Start----------]| unused |NumLevels|------Min K------------|
+ *      ||         |       |        |        |        |         |        |              |
+ *  ?   ||<-------Min/Max Arr Start---------]|[<----------Levels Arr End----------------|
+ *      ||         |       |        |        |        |         |        |              |
+ *  ?   ||<----Double Items Arr Start-------]|[<---------Min/Max Arr End----------------|
+ *      ||         |       |        |        |        |         |        |              |
+ *  ?   ||         |       |        |        |[<------Double Items Arr End--------------|
+ *
+ * Serialized double sketch layout, Empty (8 bytes) and Single Item (16 bytes):
+ *  Adr:
+ *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
+ *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
+ *      ||                                                               |      8       |
+ *  1   ||------------------------------Single Item-------------------------------------|
+ *
+ * The structure of the data block depends on Layout:
+ *
+ *   For FLOAT_SINGLE_COMPACT or DOUBLE_SINGLE_COMPACT:
+ *     The single data item is at offset DATA_START_ADR_SINGLE_ITEM = 8
+ *
+ *   For FLOAT_FULL_COMPACT:
+ *     The int[] levels array starts at offset DATA_START_ADR = 20 with a length of numLevels integers;
+ *     Followed by Float Min_Value, then Float Max_Value
+ *     Followed by an array of Floats of length retainedItems()
+ *
+ *   For DOUBLE_FULL_COMPACT
+ *     The int[] levels array starts at offset DATA_START_ADR = 20 with a length of numLevels integers;
+ *     Followed by Double Min_Value, then Double Max_Value
+ *     Followed by an array of Doubles of length retainedItems()
+ *
+ *   For FLOAT_UPDATABLE
+ *     The int[] levels array starts at offset DATA_START_ADR = 20 with a length of (numLevels + 1) integers;
+ *     Followed by Float Min_Value, then Float Max_Value
+ *     Followed by an array of Floats of length KllHelper.computeTotalItemCapacity(...).
+ *
+ *   For DOUBLE_UPDATABLE
+ *     The int[] levels array starts at offset DATA_START_ADR = 20 with a length of (numLevels + 1) integers;
+ *     Followed by Double Min_Value, then Double Max_Value
+ *     Followed by an array of Doubles of length KllHelper.computeTotalItemCapacity(...).
+ *
+ * }
+ * + * @author Lee Rhodes + */ +final class KllPreambleUtil { + + private KllPreambleUtil() {} + + static final String LS = System.getProperty("line.separator"); + + // Preamble byte addresses + static final int PREAMBLE_INTS_BYTE_ADR = 0; + static final int SER_VER_BYTE_ADR = 1; + static final int FAMILY_BYTE_ADR = 2; + static final int FLAGS_BYTE_ADR = 3; + static final int K_SHORT_ADR = 4; // to 5 + static final int M_BYTE_ADR = 6; + // 7 is reserved for future use + // SINGLE ITEM ONLY + static final int DATA_START_ADR_SINGLE_ITEM = 8; + + // MULTI-ITEM + static final int N_LONG_ADR = 8; // to 15 + static final int MIN_K_SHORT_ADR = 16; // to 17 + static final int NUM_LEVELS_BYTE_ADR = 18; + + // 19 is reserved for future use + static final int DATA_START_ADR = 20; // Full Sketch, not single item + + // Other static values + static final byte SERIAL_VERSION_EMPTY_FULL = 1; // Empty or full preamble, NOT single item format + static final byte SERIAL_VERSION_SINGLE = 2; // only single-item format + static final byte SERIAL_VERSION_UPDATABLE = 3; // + static final int PREAMBLE_INTS_EMPTY_SINGLE = 2; // for empty or single item + static final int PREAMBLE_INTS_FULL = 5; // Full preamble, not empty nor single item + + // Flag bit masks + static final int EMPTY_BIT_MASK = 1; + static final int LEVEL_ZERO_SORTED_BIT_MASK = 2; + static final int SINGLE_ITEM_BIT_MASK = 4; + static final int DOUBLES_SKETCH_BIT_MASK = 8; + static final int UPDATABLE_BIT_MASK = 16; + + enum Layout { + FLOAT_FULL_COMPACT, FLOAT_EMPTY_COMPACT, FLOAT_SINGLE_COMPACT, + DOUBLE_FULL_COMPACT, DOUBLE_EMPTY_COMPACT, DOUBLE_SINGLE_COMPACT, + FLOAT_UPDATABLE, DOUBLE_UPDATABLE } + + /** + * Returns a human readable string summary of the internal state of the given byte array. + * Used primarily in testing. + * + * @param byteArr the given byte array. + * @return the summary string. 
+ */ + static String toString(final byte[] byteArr) { + final Memory mem = Memory.wrap(byteArr); + return toString(mem); + } + + /** + * Returns a human readable string summary of the internal state of the given Memory. + * Used primarily in testing. + * + * @param mem the given Memory + * @return the summary string. + */ + static String toString(final Memory mem) { + return memoryToString(mem); + } + + static String memoryToString(final Memory mem) { + final KllMemoryValidate memChk = new KllMemoryValidate(mem); + final int flags = memChk.flags & 0XFF; + final String flagsStr = (flags) + ", 0x" + (Integer.toHexString(flags)) + ", " + + zeroPad(Integer.toBinaryString(flags), 8); + final int preInts = memChk.preInts; + final StringBuilder sb = new StringBuilder(); + sb.append(Util.LS).append("### KLL SKETCH MEMORY SUMMARY:").append(LS); + sb.append("Byte 0 : Preamble Ints : ").append(preInts).append(LS); + sb.append("Byte 1 : SerVer : ").append(memChk.serVer).append(LS); + sb.append("Byte 2 : FamilyID : ").append(memChk.familyID).append(LS); + sb.append(" FamilyName : ").append(memChk.famName).append(LS); + sb.append("Byte 3 : Flags Field : ").append(flagsStr).append(LS); + sb.append(" Bit Flag Name").append(LS); + sb.append(" 0 EMPTY COMPACT : ").append(memChk.empty).append(LS); + sb.append(" 1 LEVEL_ZERO_SORTED : ").append(memChk.level0Sorted).append(LS); + sb.append(" 2 SINGLE_ITEM COMPACT: ").append(memChk.singleItem).append(LS); + sb.append(" 3 DOUBLES_SKETCH : ").append(memChk.doublesSketch).append(LS); + sb.append(" 4 UPDATABLE : ").append(memChk.updatableMemory).append(LS); + sb.append("Bytes 4-5 : K : ").append(memChk.k).append(LS); + sb.append("Byte 6 : Min Level Cap, M : ").append(memChk.m).append(LS); + sb.append("Byte 7 : (Reserved) : ").append(LS); + + switch (memChk.layout) { + case DOUBLE_FULL_COMPACT: + case FLOAT_FULL_COMPACT: + case FLOAT_UPDATABLE: + case DOUBLE_UPDATABLE: + { + sb.append("Bytes 8-15: N : ").append(memChk.n).append(LS); + 
sb.append("Bytes 16-17: MinK : ").append(memChk.minK).append(LS); + sb.append("Byte 18 : NumLevels : ").append(memChk.numLevels).append(LS); + break; + } + case FLOAT_EMPTY_COMPACT: + case FLOAT_SINGLE_COMPACT: + case DOUBLE_EMPTY_COMPACT: + case DOUBLE_SINGLE_COMPACT: + { + sb.append("Assumed : N : ").append(memChk.n).append(LS); + sb.append("Assumed : MinK : ").append(memChk.minK).append(LS); + sb.append("Assumed : NumLevels : ").append(memChk.numLevels).append(LS); + break; + } + default: break; //can never happen + } + sb.append("PreambleBytes : ").append(preInts * 4).append(LS); + sb.append("Sketch Bytes : ").append(memChk.sketchBytes).append(LS); + sb.append("Memory Capacity Bytes : ").append(mem.getCapacity()).append(LS); + sb.append("### END KLL Sketch Memory Summary").append(LS); + return sb.toString(); + } + + static int getMemoryPreInts(final Memory mem) { + return mem.getByte(PREAMBLE_INTS_BYTE_ADR) & 0XFF; + } + + static int getMemorySerVer(final Memory mem) { + return mem.getByte(SER_VER_BYTE_ADR) & 0XFF; + } + + static int getMemoryFamilyID(final Memory mem) { + return mem.getByte(FAMILY_BYTE_ADR) & 0XFF; + } + + static int getMemoryFlags(final Memory mem) { + return mem.getByte(FLAGS_BYTE_ADR) & 0XFF; + } + + static boolean getMemoryEmptyFlag(final Memory mem) { + return (getMemoryFlags(mem) & EMPTY_BIT_MASK) != 0; + } + + static boolean getMemoryLevelZeroSortedFlag(final Memory mem) { + return (getMemoryFlags(mem) & LEVEL_ZERO_SORTED_BIT_MASK) != 0; + } + + static boolean getMemorySingleItemFlag(final Memory mem) { + return (getMemoryFlags(mem) & SINGLE_ITEM_BIT_MASK) != 0; + } + + static boolean getMemoryDoubleSketchFlag(final Memory mem) { + return (getMemoryFlags(mem) & DOUBLES_SKETCH_BIT_MASK) != 0; + } + + static boolean getMemoryUpdatableFlag(final Memory mem) { + return (getMemoryFlags(mem) & UPDATABLE_BIT_MASK) != 0; + } + + static int getMemoryK(final Memory mem) { + return mem.getShort(K_SHORT_ADR) & 0XFFFF; + } + + static int 
getMemoryM(final Memory mem) { + return mem.getByte(M_BYTE_ADR) & 0XFF; + } + + static long getMemoryN(final Memory mem) { + return mem.getLong(N_LONG_ADR); + } + + static int getMemoryMinK(final Memory mem) { + return mem.getShort(MIN_K_SHORT_ADR) & 0XFFFF; + } + + static int getMemoryNumLevels(final Memory mem) { + return mem.getByte(NUM_LEVELS_BYTE_ADR) & 0XFF; + } + + static void setMemoryPreInts(final WritableMemory wmem, final int value) { + wmem.putByte(PREAMBLE_INTS_BYTE_ADR, (byte) value); + } + + static void setMemorySerVer(final WritableMemory wmem, final int value) { + wmem.putByte(SER_VER_BYTE_ADR, (byte) value); + } + + static void setMemoryFamilyID(final WritableMemory wmem, final int value) { + wmem.putByte(FAMILY_BYTE_ADR, (byte) value); + } + + static void setMemoryFlags(final WritableMemory wmem, final int value) { + wmem.putByte(FLAGS_BYTE_ADR, (byte) value); + } + + static void setMemoryEmptyFlag(final WritableMemory wmem, final boolean empty) { + final int flags = getMemoryFlags(wmem); + setMemoryFlags(wmem, empty ? flags | EMPTY_BIT_MASK : flags & ~EMPTY_BIT_MASK); + } + + static void setMemoryLevelZeroSortedFlag(final WritableMemory wmem, final boolean levelZeroSorted) { + final int flags = getMemoryFlags(wmem); + setMemoryFlags(wmem, levelZeroSorted ? flags | LEVEL_ZERO_SORTED_BIT_MASK : flags & ~LEVEL_ZERO_SORTED_BIT_MASK); + } + + static void setMemorySingleItemFlag(final WritableMemory wmem, final boolean singleItem) { + final int flags = getMemoryFlags(wmem); + setMemoryFlags(wmem, singleItem ? flags | SINGLE_ITEM_BIT_MASK : flags & ~SINGLE_ITEM_BIT_MASK); + } + + static void setMemoryDoubleSketchFlag(final WritableMemory wmem, final boolean doubleSketch) { + final int flags = getMemoryFlags(wmem); + setMemoryFlags(wmem, doubleSketch ? 
flags | DOUBLES_SKETCH_BIT_MASK : flags & ~DOUBLES_SKETCH_BIT_MASK); + } + + static void setMemoryUpdatableFlag(final WritableMemory wmem, final boolean updatable) { + final int flags = getMemoryFlags(wmem); + setMemoryFlags(wmem, updatable ? flags | UPDATABLE_BIT_MASK : flags & ~UPDATABLE_BIT_MASK); + } + + static void setMemoryK(final WritableMemory wmem, final int value) { + wmem.putShort(K_SHORT_ADR, (short) value); + } + + static void setMemoryM(final WritableMemory wmem, final int value) { + wmem.putByte(M_BYTE_ADR, (byte) value); + } + + static void setMemoryN(final WritableMemory wmem, final long value) { + wmem.putLong(N_LONG_ADR, value); + } + + static void setMemoryMinK(final WritableMemory wmem, final int value) { + wmem.putShort(MIN_K_SHORT_ADR, (short) value); + } + + static void setMemoryNumLevels(final WritableMemory wmem, final int value) { + wmem.putByte(NUM_LEVELS_BYTE_ADR, (byte) value); + } + +} + diff --git a/src/main/java/org/apache/datasketches/kll/KllSketch.java b/src/main/java/org/apache/datasketches/kll/KllSketch.java new file mode 100644 index 000000000..faa5d1081 --- /dev/null +++ b/src/main/java/org/apache/datasketches/kll/KllSketch.java @@ -0,0 +1,457 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR; +import static org.apache.datasketches.kll.KllPreambleUtil.DATA_START_ADR_SINGLE_ITEM; +import static org.apache.datasketches.kll.KllPreambleUtil.N_LONG_ADR; +import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; +import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; + +import java.util.Random; + +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.MemoryRequestServer; +import org.apache.datasketches.memory.WritableMemory; + +/* + * Sampled stream data (floats or doubles) is stored as an array or as part of a Memory object. + * This array is partitioned into sections called levels and the indices into the array of items + * are tracked by a small integer array called levels or levels array. + * The data for level i lies in positions levelsArray[i] through levelsArray[i + 1] - 1 inclusive. + * Hence, the levelsArray must contain (numLevels + 1) indices. + * The valid portion of items array is completely packed and sorted, except for level 0, + * which is filled from the top down. Any items below the index levelsArray[0] are garbage and will be + * overwritten by subsequent updates. + * + * Invariants: + * 1) After a compaction, or an update, or a merge, every level is sorted except for level zero. + * 2) After a compaction, (sum of capacities) - (sum of items) >= 1, + * so there is room for at least 1 more item in level zero. + * 3) There are no gaps except at the bottom, so if levels_[0] = 0, + * the sketch is exactly filled to capacity and must be compacted or the itemsArray and levelsArray + * must be expanded to include more levels. + * 4) Sum of weights of all retained items == N.
+ * 5) Current total item capacity = itemsArray.length = levelsArray[numLevels]. + */ + +/** + * This class is the root of the KLL sketch class hierarchy. It includes the public API that is independent + * of either sketch type (float or double) and independent of whether the sketch is targeted for use on the + * heap or Direct (off-heap). + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ * + * @author Lee Rhodes, Kevin Lang + */ +public abstract class KllSketch { + + public enum SketchType { FLOATS_SKETCH, DOUBLES_SKETCH } + + enum Error { + TGT_IS_IMMUTABLE("Given sketch Memory is immutable, cannot write."), + SRC_MUST_BE_DIRECT("Given sketch must be of type Direct."), + SRC_MUST_BE_DOUBLE("Given sketch must be of type Double."), + SRC_MUST_BE_FLOAT("Given sketch must be of type Float."), + SRC_CANNOT_BE_DIRECT("Given sketch cannot be of type Direct."), + MUST_NOT_CALL("This is an artifact of inheritance and should never be called."); + + private String msg; + + private Error(final String msg) { + this.msg = msg; + } + + final static void kllSketchThrow(final Error errType) { + throw new SketchesArgumentException(errType.getMessage()); + } + + private String getMessage() { + return msg; + } + } + + /** + * The default value of K + */ + public static final int DEFAULT_K = 200; + + /** + * The maximum value of K + */ + public static final int MAX_K = (1 << 16) - 1; // serialized as an unsigned short + + /** + * The default value of M. The parameter m is the minimum level size in number of items. + * Currently, the public default is 8, but this can be overridden using Package Private methods to + * 2, 4, 6 or 8, and the sketch works just fine. The value 8 was chosen as a compromise between speed and size. + * Choosing smaller values of m less than 8 will make the sketch slower. + */ + static final int DEFAULT_M = 8; + + /** + * The maximum value of M. + * @see #DEFAULT_M + */ + static final int MAX_M = 8; + + /** + * The minimum value of M. 
+ * @see #DEFAULT_M + */ + static final int MIN_M = 2; + + static final Random random = new Random(); + final SketchType sketchType; + final MemoryRequestServer memReqSvr; + final boolean updatablMemory; + WritableMemory wmem; + + /** + * Constructor. Stores the sketch type and the given WritableMemory. + * When wmem is non-null, updatablMemory is true only if memReqSvr is also non-null. + * When wmem is null, updatablMemory is false and memReqSvr is stored as null + * regardless of the argument passed. + * @param sketchType either DOUBLES_SKETCH or FLOATS_SKETCH + * @param wmem the current WritableMemory or null + * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory; + * ignored when wmem is null. + */ + KllSketch(final SketchType sketchType, final WritableMemory wmem, final MemoryRequestServer memReqSvr) { + this.sketchType = sketchType; + this.wmem = wmem; + if (wmem != null) { + this.updatablMemory = memReqSvr != null; + this.memReqSvr = memReqSvr; + } else { + this.updatablMemory = false; + this.memReqSvr = null; + } + } + + /** + * Gets the approximate value of k to use given epsilon, the normalized rank error. + * @param epsilon the normalized rank error between zero and one. + * @param pmf if true, this function returns the value of k assuming the input epsilon + * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function + * returns the value of k assuming the input epsilon is the desired "single-sided" + * epsilon for all the other queries. + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ * @return the value of k given a value of epsilon. + */ + public static int getKFromEpsilon(final double epsilon, final boolean pmf) { + return KllHelper.getKFromEpsilon(epsilon, pmf); + } + + /** + * Returns upper bound on the compact serialized size of a FloatsSketch given a parameter + * k and stream length. This method can be used if allocation of storage + * is necessary beforehand. + * @param k parameter that controls size of the sketch and accuracy of estimates + * @param n stream length + * @return upper bound on the compact serialized size + * @deprecated use {@link #getMaxSerializedSizeBytes(int, long, SketchType, boolean)} instead. + * Version 3.2.0 + */ + @Deprecated + public static int getMaxSerializedSizeBytes(final int k, final long n) { + final KllHelper.GrowthStats gStats = + KllHelper.getGrowthSchemeForGivenN(k, KllSketch.DEFAULT_M, n, FLOATS_SKETCH, false); + return gStats.compactBytes; + } + + /** + * Returns upper bound on the serialized size of a KllSketch given the following parameters. + * @param k parameter that controls size of the sketch and accuracy of estimates + * @param n stream length + * @param sketchType either DOUBLES_SKETCH or FLOATS_SKETCH + * @param updatableMemory true if updatableMemory form, otherwise the standard compact form. + * @return upper bound on the serialized size of a KllSketch. + */ + public static int getMaxSerializedSizeBytes(final int k, final long n, + final SketchType sketchType, final boolean updatableMemory) { + final KllHelper.GrowthStats gStats = + KllHelper.getGrowthSchemeForGivenN(k, KllSketch.DEFAULT_M, n, sketchType, false); + return updatableMemory ? gStats.updatableBytes : gStats.compactBytes; + } + + /** + * Gets the normalized rank error given k and pmf. + * Static method version of the getNormalizedRankError(boolean). + * @param k the configuration parameter + * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function. 
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries. + * @return if pmf is true, the normalized rank error for the getPMF() function. + * Otherwise, it is the "single-sided" normalized rank error for all the other queries. + */ + public static double getNormalizedRankError(final int k, final boolean pmf) { + return KllHelper.getNormalizedRankError(k, pmf); + } + + //numItems can be either numRetained, or current max capacity at given K and numLevels. + static int getCurrentSerializedSizeBytes(final int numLevels, final int numItems, + final SketchType sketchType, final boolean updatableMemory) { + final int typeBytes = (sketchType == DOUBLES_SKETCH) ? Double.BYTES : Float.BYTES; + int levelsBytes = 0; + if (updatableMemory) { + levelsBytes = (numLevels + 1) * Integer.BYTES; + } else { + if (numItems == 0) { return N_LONG_ADR; } + if (numItems == 1) { return DATA_START_ADR_SINGLE_ITEM + typeBytes; } + levelsBytes = numLevels * Integer.BYTES; + } + return DATA_START_ADR + levelsBytes + (numItems + 2) * typeBytes; //+2 is for min & max + } + + /** + * Returns the current compact number of bytes this sketch would require to store. + * @return the current compact number of bytes this sketch would require to store. + */ + public final int getCurrentCompactSerializedSizeBytes() { + return KllSketch.getCurrentSerializedSizeBytes(getNumLevels(), getNumRetained(), sketchType, false); + } + + /** + * Returns the current updatableMemory number of bytes this sketch would require to store. + * @return the current updatableMemory number of bytes this sketch would require to store. 
+ */ + public final int getCurrentUpdatableSerializedSizeBytes() { + final int itemCap = KllHelper.computeTotalItemCapacity(getK(), getM(), getNumLevels()); + return KllSketch.getCurrentSerializedSizeBytes(getNumLevels(), itemCap, sketchType, true); + } + + /** + * Returns the user configured parameter k + * @return the user configured parameter k + */ + public abstract int getK(); + + /** + * Returns the length of the input stream in items. + * @return stream length + */ + public abstract long getN(); + + /** + * Gets the approximate rank error of this sketch normalized as a fraction between zero and one. + * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function. + * Otherwise, it is the "single-sided" normalized rank error for all the other queries. + * The epsilon value returned is a best fit to 99 percentile empirically measured max error in + * thousands of trials + * @return if pmf is true, returns the normalized rank error for the getPMF() function. + * Otherwise, it is the "single-sided" normalized rank error for all the other queries. + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ */ + public final double getNormalizedRankError(final boolean pmf) { + return getNormalizedRankError(getMinK(), pmf); + } + + /** + * Returns the number of retained items (samples) in the sketch. + * @return the number of retained items (samples) in the sketch + */ + public final int getNumRetained() { + return getLevelsArray()[getNumLevels()] - getLevelsArray()[0]; + } + + /** + * Returns the current number of bytes this Sketch would require if serialized. + * @return the number of bytes this sketch would require if serialized. + */ + public int getSerializedSizeBytes() { + return (updatablMemory) + ? getCurrentUpdatableSerializedSizeBytes() + : getCurrentCompactSerializedSizeBytes(); + } + + /** + * This returns the WritableMemory for Direct type sketches, + * otherwise returns null. + * @return the WritableMemory for Direct type sketches, otherwise null. + */ + public WritableMemory getWritableMemory() { + return wmem; + } + + /** + * Returns true if this sketch is empty. + * @return empty flag + */ + public final boolean isEmpty() { + return getN() == 0; + } + + /** + * Returns true if this sketch is in estimation mode. + * @return estimation mode flag + */ + public final boolean isEstimationMode() { + return getNumLevels() > 1; + } + + /** + * Returns the updatable Memory flag that was fixed when this sketch was constructed. + * @return the updatable Memory flag + */ + public final boolean isUpdatableMemory() { + return updatablMemory; + } + + /** + * Returns true if the backing resource of this is identical with the backing resource + * of that. The capacities must be the same. If this is a region, + * the region offset must also be the same. + * A sketch that holds no WritableMemory (its WritableMemory is null) has no backing + * resource, so this method returns false instead of throwing a NullPointerException. + * @param that A different non-null object + * @return true if the backing resource of this is the same as the backing resource + * of that; false if this sketch has no WritableMemory. + */ + public final boolean isSameResource(final Memory that) { + return (wmem != null) && wmem.isSameResource(that); + } + + /** + * This resets the current sketch back to zero entries. + * It retains key parameters such as k and + * SketchType (double or float).
+ */ + public abstract void reset(); + + /** + * Returns serialized sketch in a compact byte array form. + * @return serialized sketch in a compact byte array form. + */ + public byte[] toByteArray() { + return KllHelper.toCompactByteArrayImpl(this); + } + + @Override + public final String toString() { + return toString(false, false); + } + + /** + * Returns a summary of the sketch as a string. + * @param withLevels if true include information about levels + * @param withData if true include sketch data + * @return string representation of sketch summary + */ + public String toString(final boolean withLevels, final boolean withData) { + return KllHelper.toStringImpl(this, withLevels, withData); + } + + /** + * Returns serialized sketch in an updatableMemory byte array form. + * @return serialized sketch in an updatableMemory byte array form. + */ + public byte[] toUpdatableByteArray() { + return KllHelper.toUpdatableByteArrayImpl(this); + } + + /** + * @return full size of internal items array including garbage. + */ + abstract double[] getDoubleItemsArray(); + + abstract double getDoubleItemsArrayAt(int index); + + /** + * @return full size of internal items array including garbage. + */ + abstract float[] getFloatItemsArray(); + + abstract float getFloatItemsArrayAt(int index); + + abstract int[] getLevelsArray(); + + abstract int getLevelsArrayAt(int index); + + /** + * Returns the configured parameter m, which is the minimum level size in number of items. + * Currently, the public default is 8, but this can be overridden using Package Private methods to + * 2, 4, 6 or 8, and the sketch works just fine. The value 8 was chosen as a compromise between speed and size. + * Choosing smaller values of m will make the sketch much slower. 
+ * @return the configured parameter m + */ + abstract int getM(); + + abstract double getMaxDoubleValue(); + + abstract float getMaxFloatValue(); + + abstract double getMinDoubleValue(); + + abstract float getMinFloatValue(); + + /** + * MinK is the value of K that results from a merge with a sketch configured with a value of K lower than + * the k of this sketch. This value is then used in computing the estimated upper and lower bounds of error. + * @return The minimum K as a result of merging with lower values of k. + */ + abstract int getMinK(); + + abstract int getNumLevels(); + + abstract void incN(); + + abstract void incNumLevels(); + + boolean isDoublesSketch() { return sketchType == DOUBLES_SKETCH; } + + boolean isFloatsSketch() { return sketchType == FLOATS_SKETCH; } + + abstract boolean isLevelZeroSorted(); + + abstract void setDoubleItemsArray(double[] floatItems); + + abstract void setDoubleItemsArrayAt(int index, double value); + + abstract void setFloatItemsArray(float[] floatItems); + + abstract void setFloatItemsArrayAt(int index, float value); + + abstract void setItemsArrayUpdatable(WritableMemory itemsMem); + + abstract void setLevelsArray(int[] levelsArr); + + abstract void setLevelsArrayAt(int index, int value); + + abstract void setLevelsArrayAtMinusEq(int index, int minusEq); + + abstract void setLevelsArrayAtPlusEq(int index, int plusEq); + + abstract void setLevelsArrayUpdatable(WritableMemory levelsMem); + + abstract void setLevelZeroSorted(boolean sorted); + + abstract void setMaxDoubleValue(double value); + + abstract void setMaxFloatValue(float value); + + abstract void setMinDoubleValue(double value); + + abstract void setMinFloatValue(float value); + + abstract void setMinK(int minK); + + abstract void setMinMaxArrayUpdatable(WritableMemory minMaxMem); + + abstract void setN(long n); + + abstract void setNumLevels(int numLevels); + +} diff --git a/src/main/java/org/apache/datasketches/kll/package-info.java 
b/src/main/java/org/apache/datasketches/kll/package-info.java index 141c25f8a..3071c9766 100644 --- a/src/main/java/org/apache/datasketches/kll/package-info.java +++ b/src/main/java/org/apache/datasketches/kll/package-info.java @@ -35,16 +35,17 @@ * *

The normalized rank (rank) of any specific value is defined as its * absolute rank divided by N. - * Thus, the normalized rank is a value between zero and one. + * Thus, the normalized rank is a value in the interval [0.0, 1.0). * In the documentation and Javadocs for this sketch absolute rank is never used so any * reference to just rank should be interpreted to mean normalized rank. * *

This sketch is configured with a parameter k, which affects the size of the sketch * and its estimation error. * - *

The estimation error is commonly called epsilon (or eps) and is a fraction - * between zero and one. Larger values of k result in smaller values of epsilon. - * Epsilon is always with respect to the rank and cannot be applied to the + *

In the research literature, the estimation error is commonly called epsilon + * (or eps) and is a fraction between zero and one. + * Larger values of k result in smaller values of epsilon. + * The epsilon error is always with respect to the rank and cannot be applied to the * corresponding values. * *

The relationship between the normalized rank and the corresponding values can be viewed @@ -147,6 +148,25 @@ *

  • Then vlo ≤ v ≤ vhi, with 99% confidence.
  • * * + *

    The current implementations of the KLL sketch in the DataSketches Java library component include:

    + * + * + * + *

    Please visit our website: DataSketches Home Page for more + * information.

    + * * @author Kevin Lang * @author Alexander Saydakov * @author Lee Rhodes diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java b/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java index 40d4d3501..307917d12 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java @@ -70,12 +70,12 @@ final class DoublesAuxiliary { /** * Get the estimated quantile given a fractional rank. - * @param fRank the fractional rank where: 0 ≤ fRank ≤ 1.0. + * @param rank the normalized rank where: 0 ≤ rank ≤ 1.0. * @return the estimated quantile */ - double getQuantile(final double fRank) { - checkFractionalRankBounds(fRank); - final long pos = QuantilesHelper.posOfPhi(fRank, auxN_); + double getQuantile(final double rank) { + checkFractionalRankBounds(rank); + final long pos = QuantilesHelper.posOfRank(rank, auxN_); return approximatelyAnswerPositionalQuery(pos); } diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java b/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java index 9a617d431..4905fc221 100644 --- a/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java +++ b/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java @@ -79,13 +79,13 @@ final class ItemsAuxiliary { /** * Get the estimated quantile given a fractional rank. - * @param fRank the fractional rank where: 0 ≤ fRank ≤ 1.0. + * @param rank the normalized rank where: 0 ≤ rank ≤ 1.0. 
* @return the estimated quantile */ - T getQuantile(final double fRank) { - checkFractionalRankBounds(fRank); + T getQuantile(final double rank) { + checkFractionalRankBounds(rank); if (auxN_ <= 0) { return null; } - final long pos = QuantilesHelper.posOfPhi(fRank, auxN_); + final long pos = QuantilesHelper.posOfRank(rank, auxN_); return approximatelyAnswerPositionalQuery(pos); } diff --git a/src/main/java/org/apache/datasketches/quantiles/PreambleUtil.java b/src/main/java/org/apache/datasketches/quantiles/PreambleUtil.java index 45990bca9..2b3f53952 100644 --- a/src/main/java/org/apache/datasketches/quantiles/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/quantiles/PreambleUtil.java @@ -31,23 +31,23 @@ //@formatter:off /** - * This class defines the preamble data structure and provides basic utilities for some of the key - * fields. + * This class defines the serialized data structure and provides access methods for the key fields. + * *

    The intent of the design of this class was to isolate the detailed knowledge of the bit and - * byte layout of the serialized form of the sketches derived from the Sketch class into one place. + * byte layout of the serialized form of the sketches derived from the base sketch classes into one place. * This allows the possibility of the introduction of different serialization * schemes with minimal impact on the rest of the library.

    * *

    - * MAP: Low significance bytes of this long data structure are on the right. However, the - * multi-byte integers (int and long) are stored in native byte order. The - * byte values are treated as unsigned.

    + * LAYOUT: The low significance bytes of this long-based data structure are on the right. + * The multi-byte primitives are stored in native byte order. + * The single-byte fields are treated as unsigned.

    * *

    An empty ItemsSketch, on-heap DoublesSketch or compact off-heap DoublesSketch only require 8 - * bytes. An off-heap UpdateDoublesSketch and all non-empty skethces require at least 16 bytes of + * bytes. An off-heap UpdateDoublesSketch and all non-empty sketches require at least 16 bytes of * preamble.

    * - *
    + * 
    {@code
      * Long || Start Byte Adr: Common for both DoublesSketch and ItemsSketch
      * Adr:
      *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0          |
    @@ -66,7 +66,7 @@
      *
      *      ||   39   |   38   |   37   |   36   |   35   |   34   |   33   |    32          |
      *  4   ||---------------------------START OF COMBINED BUfFER----------------------------|
    - *  
    + * }
    * * @author Lee Rhodes */ diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchIteratorTest.java new file mode 100644 index 000000000..4c7033342 --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchIteratorTest.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + package org.apache.datasketches.kll; + + import org.apache.datasketches.memory.DefaultMemoryRequestServer; + import org.apache.datasketches.memory.WritableMemory; + import org.testng.Assert; + import org.testng.annotations.Test; + + public class KllDirectDoublesSketchIteratorTest { + private static final DefaultMemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + + @Test + public void emptySketch() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + KllDoublesSketchIterator it = sketch.iterator(); + Assert.assertFalse(it.next()); //an empty sketch yields no entries + } + + @Test + public void oneItemSketch() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(0); + KllDoublesSketchIterator it = sketch.iterator(); + Assert.assertTrue(it.next()); + Assert.assertEquals(it.getValue(), 0.0); //double literal, matching the doubles sketch under test + Assert.assertEquals(it.getWeight(), 1); + Assert.assertFalse(it.next()); //exactly one entry expected + } + + @Test + public void bigSketches() { + for (int n = 1000; n < 100000; n += 2000) { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + for (int i = 0; i < n; i++) { + sketch.update(i); + } + KllDoublesSketchIterator it = sketch.iterator(); + int count = 0; + long weight = 0; //long accumulator so that += never narrows a wider getWeight() result + while (it.next()) { + count++; + weight += it.getWeight(); + } + Assert.assertEquals(count, sketch.getNumRetained()); + Assert.assertEquals(weight, n); //the retained weights must add up to the number of updates + } + } + + private static KllDirectDoublesSketch getDDSketch(final int k, final int n) { + KllDoublesSketch sk = new KllDoublesSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } //preload the values 1..n + byte[] byteArr = sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + + KllDirectDoublesSketch ddsk = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + return ddsk; + } + + } + + diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java new file mode 100644 index 000000000..e07d7d3c7 --- /dev/null +++
b/src/test/java/org/apache/datasketches/kll/KllDirectDoublesSketchTest.java @@ -0,0 +1,643 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.memory.DefaultMemoryRequestServer; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +@SuppressWarnings("javadoc") +public class KllDirectDoublesSketchTest { + + private static final double PMF_EPS_FOR_K_8 = 0.35; // PMF rank error (epsilon) for k=8 + private static final double PMF_EPS_FOR_K_128 = 0.025; // PMF rank error (epsilon) for k=128 + private static final double PMF_EPS_FOR_K_256 = 0.013; // PMF rank error (epsilon) for k=256 + private static final double NUMERIC_NOISE_TOLERANCE = 1E-6; + private static final DefaultMemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + + @Test + public void 
empty() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(Double.NaN); // this must not change anything + assertTrue(sketch.isEmpty()); + assertEquals(sketch.getN(), 0); + assertEquals(sketch.getNumRetained(), 0); + assertTrue(Double.isNaN(sketch.getRank(0))); + assertTrue(Double.isNaN(sketch.getMinValue())); + assertTrue(Double.isNaN(sketch.getMaxValue())); + assertTrue(Double.isNaN(sketch.getQuantile(0.5))); + assertNull(sketch.getQuantiles(new double[] {0})); + assertNull(sketch.getPMF(new double[] {0})); + assertNotNull(sketch.toString(true, true)); + assertNotNull(sketch.toString()); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void getQuantileInvalidArg() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(1); + sketch.getQuantile(-1.0); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void getQuantilesInvalidArg() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(1); + sketch.getQuantiles(new double[] {2.0}); + } + + @Test + public void oneItem() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(1); + assertFalse(sketch.isEmpty()); + assertEquals(sketch.getN(), 1); + assertEquals(sketch.getNumRetained(), 1); + assertEquals(sketch.getRank(1), 0.0); + assertEquals(sketch.getRank(2), 1.0); + assertEquals(sketch.getMinValue(), 1.0); + assertEquals(sketch.getMaxValue(), 1.0); + assertEquals(sketch.getQuantile(0.5), 1.0); + } + + @Test + public void manyItemsEstimationMode() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + final int n = 1_000_000; + + for (int i = 0; i < n; i++) { + sketch.update(i); + } + assertEquals(sketch.getN(), n); + + // test getRank + for (int i = 0; i < n; i++) { + final double trueRank = (double) i / n; + assertEquals(sketch.getRank(i), trueRank, PMF_EPS_FOR_K_256, "for value " + i); + } + + // test getPMF + final double[] pmf = sketch.getPMF(new 
double[] {n / 2.0}); // split at median + assertEquals(pmf.length, 2); + assertEquals(pmf[0], 0.5, PMF_EPS_FOR_K_256); + assertEquals(pmf[1], 0.5, PMF_EPS_FOR_K_256); + + assertEquals(sketch.getMinValue(), 0f); // min value is exact + assertEquals(sketch.getQuantile(0), 0f); // min value is exact + assertEquals(sketch.getMaxValue(), n - 1f); // max value is exact + assertEquals(sketch.getQuantile(1), n - 1f); // max value is exact + + // check at every 0.1 percentage point + final double[] fractions = new double[1001]; + final double[] reverseFractions = new double[1001]; // check that ordering doesn't matter + for (int i = 0; i <= 1000; i++) { + fractions[i] = (double) i / 1000; + reverseFractions[1000 - i] = fractions[i]; + } + final double[] quantiles = sketch.getQuantiles(fractions); + final double[] reverseQuantiles = sketch.getQuantiles(reverseFractions); + double previousQuantile = 0; + for (int i = 0; i <= 1000; i++) { + final double quantile = sketch.getQuantile(fractions[i]); + assertEquals(quantile, quantiles[i]); + assertEquals(quantile, reverseQuantiles[1000 - i]); + assertTrue(previousQuantile <= quantile); + previousQuantile = quantile; + } + } + + @Test + public void getRankGetCdfGetPmfConsistency() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + final int n = 1000; + final double[] values = new double[n]; + for (int i = 0; i < n; i++) { + sketch.update(i); + values[i] = i; + } + final double[] ranks = sketch.getCDF(values); + final double[] pmf = sketch.getPMF(values); + double sumPmf = 0; + for (int i = 0; i < n; i++) { + assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE, + "rank vs CDF for value " + i); + sumPmf += pmf[i]; + assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i); + } + sumPmf += pmf[n]; + assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE); + assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE); + } + + @Test + public void merge() { + final 
KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final KllDirectDoublesSketch sketch2 = getDDSketch(200, 0); + final int n = 10_000; + for (int i = 0; i < n; i++) { + sketch1.update(i * 1.0); + sketch2.update((2 * n - i - 1) * 1.0); + } + + assertEquals(sketch1.getMinValue(), 0.0); + assertEquals(sketch1.getMaxValue(), (n - 1) * 1.0); + + assertEquals(sketch2.getMinValue(), n * 1.0); + assertEquals(sketch2.getMaxValue(), (2 * n - 1) * 1.0); + + sketch1.merge(sketch2); + + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), 2L * n); + assertEquals(sketch1.getMinValue(), 0.0); + assertEquals(sketch1.getMaxValue(), (2 * n - 1) * 1.0); + assertEquals(sketch1.getQuantile(0.5), n * 1.0, n * PMF_EPS_FOR_K_256); + } + + @Test + public void mergeLowerK() { + final KllDirectDoublesSketch sketch1 = getDDSketch(256, 0); + final KllDirectDoublesSketch sketch2 = getDDSketch(128, 0); + final int n = 10_000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + sketch2.update(2 * n - i - 1); + } + + assertEquals(sketch1.getMinValue(), 0.0f); + assertEquals(sketch1.getMaxValue(), n - 1f); + + assertEquals(sketch2.getMinValue(), n); + assertEquals(sketch2.getMaxValue(), 2f * n - 1f); + + assertTrue(sketch1.getNormalizedRankError(false) < sketch2.getNormalizedRankError(false)); + assertTrue(sketch1.getNormalizedRankError(true) < sketch2.getNormalizedRankError(true)); + sketch1.merge(sketch2); + + // sketch1 must get "contaminated" by the lower K in sketch2 + assertEquals(sketch1.getNormalizedRankError(false), sketch2.getNormalizedRankError(false)); + assertEquals(sketch1.getNormalizedRankError(true), sketch2.getNormalizedRankError(true)); + + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), 2 * n); + assertEquals(sketch1.getMinValue(), 0); + assertEquals(sketch1.getMaxValue(), 2.0 * n - 1.0); + assertEquals(sketch1.getQuantile(0.5), n, n * PMF_EPS_FOR_K_128); + } + + @Test + public void mergeEmptyLowerK() { + final KllDirectDoublesSketch sketch1 = 
getDDSketch(256, 0); + final KllDirectDoublesSketch sketch2 = getDDSketch(128, 0); + final int n = 10000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + + // rank error should not be affected by a merge with an empty sketch with lower K + final double rankErrorBeforeMerge = sketch1.getNormalizedRankError(true); + sketch1.merge(sketch2); + assertEquals(sketch1.getNormalizedRankError(true), rankErrorBeforeMerge); + + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), n); + assertEquals(sketch1.getMinValue(), 0); + assertEquals(sketch1.getMaxValue(), n - 1.0); + assertEquals(sketch1.getQuantile(0.5), n / 2.0, n / 2 * PMF_EPS_FOR_K_256); + + //merge the other way + sketch2.merge(sketch1); + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), n); + assertEquals(sketch1.getMinValue(), 0); + assertEquals(sketch1.getMaxValue(), n - 1.0); + assertEquals(sketch1.getQuantile(0.5), n / 2.0, n / 2 * PMF_EPS_FOR_K_256); + } + + @Test + public void mergeExactModeLowerK() { + final KllDirectDoublesSketch sketch1 = getDDSketch(256, 0); + final KllDirectDoublesSketch sketch2 = getDDSketch(128, 0); + final int n = 10000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + sketch2.update(1); + + // rank error should not be affected by a merge with a sketch in exact mode with lower K + final double rankErrorBeforeMerge = sketch1.getNormalizedRankError(true); + sketch1.merge(sketch2); + assertEquals(sketch1.getNormalizedRankError(true), rankErrorBeforeMerge); + } + + @Test + public void mergeMinMinValueFromOther() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final KllDirectDoublesSketch sketch2 = getDDSketch(200, 0); + sketch1.update(1); + sketch2.update(2); + sketch2.merge(sketch1); + assertEquals(sketch2.getMinValue(), 1.0); + } + + @Test + public void mergeMinAndMaxFromOther() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final KllDirectDoublesSketch sketch2 = getDDSketch(200, 0); + for (int i = 1; i 
<= 1_000_000; i++) { + sketch1.update(i); + } + sketch2.merge(sketch1); + assertEquals(sketch2.getMinValue(), 1); + assertEquals(sketch2.getMaxValue(), 1_000_000); + } + + @SuppressWarnings("unused") + @Test(expectedExceptions = SketchesArgumentException.class) + public void kTooSmall() { + final KllDirectDoublesSketch sketch1 = getDDSketch(KllSketch.DEFAULT_M - 1, 0); + } + + @SuppressWarnings("unused") + @Test(expectedExceptions = SketchesArgumentException.class) + public void kTooLarge() { + final KllDirectDoublesSketch sketch1 = getDDSketch(KllSketch.MAX_K + 1, 0); + } + + @Test + public void minK() { + final KllDirectDoublesSketch sketch = getDDSketch(KllSketch.DEFAULT_M, 0); + for (int i = 0; i < 1000; i++) { + sketch.update(i); + } + assertEquals(sketch.getK(), KllSketch.DEFAULT_M); + assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_8); + } + + @Test + public void maxK() { + final KllDirectDoublesSketch sketch = getDDSketch(KllSketch.MAX_K, 0); + for (int i = 0; i < 1000; i++) { + sketch.update(i); + } + assertEquals(sketch.getK(), KllSketch.MAX_K); + assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_256); + } + + @Test + public void serializeDeserializeEmptyViaCompactHeapify() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final byte[] bytes = sketch1.toByteArray(); + final KllDoublesSketch sketch2 = KllDoublesSketch.heapify(Memory.wrap(bytes)); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); + assertTrue(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertTrue(Double.isNaN(sketch2.getMinValue())); + assertTrue(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test + public void 
serializeDeserializeEmptyViaUpdatableWritableWrap() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final byte[] bytes = sketch1.toUpdatableByteArray(); + final KllDirectDoublesSketch sketch2 = + KllDirectDoublesSketch.writableWrap(WritableMemory.writableWrap(bytes),memReqSvr); + assertEquals(bytes.length, sketch1.getCurrentUpdatableSerializedSizeBytes()); + assertTrue(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertTrue(Double.isNaN(sketch2.getMinValue())); + assertTrue(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test + public void serializeDeserializeOneItemViaCompactHeapify() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + sketch1.update(1); + final byte[] bytes = sketch1.toByteArray(); + final KllDoublesSketch sketch2 = KllDoublesSketch.heapify(Memory.wrap(bytes)); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), 1); + assertEquals(sketch2.getN(), 1); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertFalse(Double.isNaN(sketch2.getMinValue())); + assertFalse(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), 8 + Double.BYTES); + } + + @Test + public void serializeDeserializeOneItemViaUpdatableWritableWrap() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + sketch1.update(1); + final byte[] bytes = sketch1.toUpdatableByteArray(); + final KllDirectDoublesSketch sketch2 = + KllDirectDoublesSketch.writableWrap(WritableMemory.writableWrap(bytes),memReqSvr); + assertEquals(bytes.length, 
sketch1.getCurrentUpdatableSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), 1); + assertEquals(sketch2.getN(), 1); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertFalse(Double.isNaN(sketch2.getMinValue())); + assertFalse(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), 8 + Double.BYTES); + } + + @Test + public void serializeDeserializeFullViaCompactHeapify() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final int n = 1000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + final byte[] bytes = sketch1.toByteArray(); + final KllDoublesSketch sketch2 = KllDoublesSketch.heapify(Memory.wrap(bytes)); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertEquals(sketch2.getMinValue(), sketch1.getMinValue()); + assertEquals(sketch2.getMaxValue(), sketch1.getMaxValue()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test + public void serializeDeserializeFullViaUpdatableWritableWrap() { + final KllDirectDoublesSketch sketch1 = getDDSketch(200, 0); + final int n = 1000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + final byte[] bytes = sketch1.toUpdatableByteArray(); + final KllDirectDoublesSketch sketch2 = + KllDirectDoublesSketch.writableWrap(WritableMemory.writableWrap(bytes),memReqSvr); + assertEquals(bytes.length, sketch1.getCurrentUpdatableSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + 
assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertEquals(sketch2.getMinValue(), sketch1.getMinValue()); + assertEquals(sketch2.getMaxValue(), sketch1.getMaxValue()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void outOfOrderSplitPoints() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(0); + sketch.getCDF(new double[] {1, 0}); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void nanSplitPoint() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(0); + sketch.getCDF(new double[] {Double.NaN}); + } + + @Test + public void getQuantiles() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 0); + sketch.update(1); + sketch.update(2); + sketch.update(3); + final double[] quantiles1 = sketch.getQuantiles(new double[] {0, 0.5, 1}); + final double[] quantiles2 = sketch.getQuantiles(3); + assertEquals(quantiles1, quantiles2); + assertEquals(quantiles1[0], 1.0); + assertEquals(quantiles1[1], 2.0); + assertEquals(quantiles1[2], 3.0); + } + + @Test + public void checkSimpleMergeDirect() { //used for troubleshooting + int k = 20; + int n1 = 21; + int n2 = 43; + KllDoublesSketch sk1 = new KllDoublesSketch(k); + KllDoublesSketch sk2 = new KllDoublesSketch(k); + for (int i = 1; i <= n1; i++) { + sk1.update(i); + } + for (int i = 1; i <= n2; i++) { + sk2.update(i + 100); + } + println("SK1:"); + println(sk1.toString(true, true)); + println("SK2:"); + println(sk2.toString(true, true)); + WritableMemory wmem1 = WritableMemory.writableWrap(sk1.toUpdatableByteArray()); + WritableMemory wmem2 = WritableMemory.writableWrap(sk2.toUpdatableByteArray()); + KllDirectDoublesSketch dsk1 = KllDirectDoublesSketch.writableWrap(wmem1, memReqSvr); + KllDirectDoublesSketch dsk2 = 
KllDirectDoublesSketch.writableWrap(wmem2, memReqSvr); + println("BEFORE MERGE"); + println(dsk1.toString(true, true)); + dsk1.merge(dsk2); + println("AFTER MERGE"); + println(dsk1.toString(true, true)); + } + + @Test + public void checkSketchInitializeDirectDoubleUpdatableMem() { + int k = 20; //don't change this + KllDirectDoublesSketch sk; + KllDoublesSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + println("#### CASE: DOUBLE FULL DIRECT FROM UPDATABLE"); + sk2 = new KllDoublesSketch(k); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk2.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE EMPTY HEAPIFIED FROM UPDATABLE"); + sk2 = new KllDoublesSketch(k); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); 
+ assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE SINGLE HEAPIFIED FROM UPDATABLE"); + sk2 = new KllDoublesSketch(k); + sk2.update(1); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkGetWritableMemory() { + final KllDirectDoublesSketch sketch = getDDSketch(200, 200); + assertEquals(sketch.getK(), 200); + assertEquals(sketch.getN(), 200); + assertFalse(sketch.isEmpty()); + assertTrue(sketch.isUpdatableMemory()); + assertFalse(sketch.isEstimationMode()); + assertTrue(sketch.isDoublesSketch()); + assertFalse(sketch.isLevelZeroSorted()); + assertFalse(sketch.isFloatsSketch()); + + final WritableMemory wmem = sketch.getWritableMemory(); + final KllDoublesSketch sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), 200); + assertEquals(sk.getN(), 200); + assertFalse(sk.isEmpty()); + assertFalse(sk.isUpdatableMemory()); + assertFalse(sk.isEstimationMode()); + assertTrue(sk.isDoublesSketch()); + assertFalse(sk.isLevelZeroSorted()); + assertFalse(sk.isFloatsSketch()); + } + + @Test + public void checkReset() { + WritableMemory dstMem = WritableMemory.allocate(6000); + KllDirectDoublesSketch sk = KllDirectDoublesSketch.newInstance(20, dstMem, memReqSvr); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n1 = sk.getN(); + double 
min1 = sk.getMinValue(); + double max1 = sk.getMaxValue(); + sk.reset(); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n2 = sk.getN(); + double min2 = sk.getMinValue(); + double max2 = sk.getMaxValue(); + assertEquals(n2, n1); + assertEquals(min2, min1); + assertEquals(max2, max1); + } + + @Test + public void checkHeapify() { + WritableMemory dstMem = WritableMemory.allocate(6000); + KllDirectDoublesSketch sk = KllDirectDoublesSketch.newInstance(20, dstMem, memReqSvr); + for (int i = 1; i <= 100; i++) { sk.update(i); } + KllDoublesSketch sk2 = KllDirectDoublesSketch.heapify(dstMem); + assertEquals(sk2.getMinValue(), 1.0); + assertEquals(sk2.getMaxValue(), 100.0); + } + + @Test + public void checkMergeKllDoublesSketch() { + WritableMemory dstMem = WritableMemory.allocate(6000); + KllDirectDoublesSketch sk = KllDirectDoublesSketch.newInstance(20, dstMem, memReqSvr); + for (int i = 1; i <= 21; i++) { sk.update(i); } + KllDoublesSketch sk2 = new KllDoublesSketch(20); + for (int i = 1; i <= 21; i++ ) { sk2.update(i + 100); } + sk.merge(sk2); + } + + @Test + public void checkReverseMergeKllDoubleSketch() { + WritableMemory dstMem = WritableMemory.allocate(6000); + KllDirectDoublesSketch sk = KllDirectDoublesSketch.newInstance(20, dstMem, memReqSvr); + for (int i = 1; i <= 21; i++) { sk.update(i); } + KllDoublesSketch sk2 = new KllDoublesSketch(20); + for (int i = 1; i <= 21; i++ ) { sk2.update(i + 100); } + sk2.merge(sk); + } + +// @Test +// public void checkWrapKllDoubleSketch() { +// KllDoublesSketch sk = new KllDoublesSketch(20); +// for (int i = 1; i <= 21; i++ ) { sk.update(i); } +// Memory srcMem = Memory.wrap(sk.toByteArray()); +// KllDirectDoublesSketch sk2 = KllDirectDoublesSketch.writableWrap(srcMem, memReqSvr); +// } + + private static KllDirectDoublesSketch getDDSketch(final int k, final int n) { + KllDoublesSketch sk = new KllDoublesSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } + byte[] byteArr = sk.toUpdatableByteArray(); + 
WritableMemory wmem = WritableMemory.writableWrap(byteArr); + KllDirectDoublesSketch ddsk = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + return ddsk; + } + + @Test + public void printlnTest() { + println("PRINTING: " + this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(final String s) { + //System.out.println(s); //disable here + } + +} diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchIteratorTest.java new file mode 100644 index 000000000..9b54a7a2a --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchIteratorTest.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.kll; + +import org.apache.datasketches.memory.DefaultMemoryRequestServer; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class KllDirectFloatsSketchIteratorTest { + private static final DefaultMemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + + @Test + public void emptySketch() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + KllFloatsSketchIterator it = sketch.iterator(); + Assert.assertFalse(it.next()); + } + + @Test + public void oneItemSketch() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(0); + KllFloatsSketchIterator it = sketch.iterator(); + Assert.assertTrue(it.next()); + Assert.assertEquals(it.getValue(), 0f); + Assert.assertEquals(it.getWeight(), 1); + Assert.assertFalse(it.next()); + } + + @Test + public void bigSketches() { + for (int n = 1000; n < 100000; n += 2000) { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + for (int i = 0; i < n; i++) { + sketch.update(i); + } + KllFloatsSketchIterator it = sketch.iterator(); + int count = 0; + int weight = 0; + while (it.next()) { + count++; + weight += it.getWeight(); + } + Assert.assertEquals(count, sketch.getNumRetained()); + Assert.assertEquals(weight, n); + } + } + + private static KllDirectFloatsSketch getDFSketch(final int k, final int n) { + KllFloatsSketch sk = new KllFloatsSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } + byte[] byteArr = sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + + KllDirectFloatsSketch dfsk = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + return dfsk; + } + +} + diff --git a/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java new file mode 100644 index 000000000..025004380 --- /dev/null +++ 
b/src/test/java/org/apache/datasketches/kll/KllDirectFloatsSketchTest.java @@ -0,0 +1,614 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.memory.DefaultMemoryRequestServer; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +public class KllDirectFloatsSketchTest { + + private static final double PMF_EPS_FOR_K_8 = 0.35; // PMF rank error (epsilon) for k=8 + private static final double PMF_EPS_FOR_K_128 = 0.025; // PMF rank error (epsilon) for k=128 + private static final double PMF_EPS_FOR_K_256 = 0.013; // PMF rank error (epsilon) for k=256 + private static final double NUMERIC_NOISE_TOLERANCE = 1E-6; + private static final DefaultMemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + + @Test + public void empty() { + final 
KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(Float.NaN); // this must not change anything + assertTrue(sketch.isEmpty()); + assertEquals(sketch.getN(), 0); + assertEquals(sketch.getNumRetained(), 0); + assertTrue(Double.isNaN(sketch.getRank(0))); + assertTrue(Float.isNaN(sketch.getMinValue())); + assertTrue(Float.isNaN(sketch.getMaxValue())); + assertTrue(Float.isNaN(sketch.getQuantile(0.5))); + assertNull(sketch.getQuantiles(new double[] {0})); + assertNull(sketch.getPMF(new float[] {0})); + assertNotNull(sketch.toString(true, true)); + assertNotNull(sketch.toString()); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void getQuantileInvalidArg() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(1); + sketch.getQuantile(-1.0); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void getQuantilesInvalidArg() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(1); + sketch.getQuantiles(new double[] {2.0}); + } + + @Test + public void oneItem() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(1); + assertFalse(sketch.isEmpty()); + assertEquals(sketch.getN(), 1); + assertEquals(sketch.getNumRetained(), 1); + assertEquals(sketch.getRank(1), 0.0); + assertEquals(sketch.getRank(2), 1.0); + assertEquals(sketch.getMinValue(), 1f); + assertEquals(sketch.getMaxValue(), 1f); + assertEquals(sketch.getQuantile(0.5), 1f); + } + + @Test + public void manyItemsEstimationMode() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + final int n = 1_000_000; + + for (int i = 0; i < n; i++) { + sketch.update(i); + } + assertEquals(sketch.getN(), n); + + // test getRank + for (int i = 0; i < n; i++) { + final double trueRank = (double) i / n; + assertEquals(sketch.getRank(i), trueRank, PMF_EPS_FOR_K_256, "for value " + i); + } + + // test getPMF + final double[] pmf = sketch.getPMF(new float[] {n / 2}); // split at 
median + assertEquals(pmf.length, 2); + assertEquals(pmf[0], 0.5, PMF_EPS_FOR_K_256); + assertEquals(pmf[1], 0.5, PMF_EPS_FOR_K_256); + + assertEquals(sketch.getMinValue(), 0f); // min value is exact + assertEquals(sketch.getQuantile(0), 0f); // min value is exact + assertEquals(sketch.getMaxValue(), n - 1f); // max value is exact + assertEquals(sketch.getQuantile(1), n - 1f); // max value is exact + + // check at every 0.1 percentage point + final double[] fractions = new double[1001]; + final double[] reverseFractions = new double[1001]; // check that ordering doesn't matter + for (int i = 0; i <= 1000; i++) { + fractions[i] = (double) i / 1000; + reverseFractions[1000 - i] = fractions[i]; + } + final float[] quantiles = sketch.getQuantiles(fractions); + final float[] reverseQuantiles = sketch.getQuantiles(reverseFractions); + double previousQuantile = 0; + for (int i = 0; i <= 1000; i++) { + final double quantile = sketch.getQuantile(fractions[i]); + assertEquals(quantile, quantiles[i]); + assertEquals(quantile, reverseQuantiles[1000 - i]); + assertTrue(previousQuantile <= quantile); + previousQuantile = quantile; + } + } + + @Test + public void getRankGetCdfGetPmfConsistency() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + final int n = 1000; + final float[] values = new float[n]; + for (int i = 0; i < n; i++) { + sketch.update(i); + values[i] = i; + } + final double[] ranks = sketch.getCDF(values); + final double[] pmf = sketch.getPMF(values); + double sumPmf = 0; + for (int i = 0; i < n; i++) { + assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE, + "rank vs CDF for value " + i); + sumPmf += pmf[i]; + assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i); + } + sumPmf += pmf[n]; + assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE); + assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE); + } + + @Test + public void merge() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + 
final KllDirectFloatsSketch sketch2 = getDFSketch(200, 0); + final int n = 10_000; + for (int i = 0; i < n; i++) { + sketch1.update(i * 1.0F); + sketch2.update((2 * n - i - 1) * 1.0F); + } + + assertEquals(sketch1.getMinValue(), 0.0); + assertEquals(sketch1.getMaxValue(), (n - 1) * 1.0); + + assertEquals(sketch2.getMinValue(), n * 1.0); + assertEquals(sketch2.getMaxValue(), (2 * n - 1) * 1.0); + + sketch1.merge(sketch2); + + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), 2L * n); + assertEquals(sketch1.getMinValue(), 0.0); + assertEquals(sketch1.getMaxValue(), (2 * n - 1) * 1.0F); + assertEquals(sketch1.getQuantile(0.5), n * 1.0F, n * PMF_EPS_FOR_K_256); + } + + @Test + public void mergeLowerK() { + final KllDirectFloatsSketch sketch1 = getDFSketch(256, 0); + final KllDirectFloatsSketch sketch2 = getDFSketch(128, 0); + final int n = 10_000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + sketch2.update(2 * n - i - 1); + } + + assertEquals(sketch1.getMinValue(), 0.0f); + assertEquals(sketch1.getMaxValue(), n - 1f); + + assertEquals(sketch2.getMinValue(), n); + assertEquals(sketch2.getMaxValue(), 2f * n - 1f); + + assertTrue(sketch1.getNormalizedRankError(false) < sketch2.getNormalizedRankError(false)); + assertTrue(sketch1.getNormalizedRankError(true) < sketch2.getNormalizedRankError(true)); + sketch1.merge(sketch2); + + // sketch1 must get "contaminated" by the lower K in sketch2 + assertEquals(sketch1.getNormalizedRankError(false), sketch2.getNormalizedRankError(false)); + assertEquals(sketch1.getNormalizedRankError(true), sketch2.getNormalizedRankError(true)); + + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), 2 * n); + assertEquals(sketch1.getMinValue(), 0f); + assertEquals(sketch1.getMaxValue(), 2f * n - 1f); + assertEquals(sketch1.getQuantile(0.5), n, n * PMF_EPS_FOR_K_128); + } + + @Test + public void mergeEmptyLowerK() { + final KllDirectFloatsSketch sketch1 = getDFSketch(256, 0); + final KllDirectFloatsSketch sketch2 = 
getDFSketch(128, 0); + final int n = 10000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + + // rank error should not be affected by a merge with an empty sketch with lower K + final double rankErrorBeforeMerge = sketch1.getNormalizedRankError(true); + sketch1.merge(sketch2); + assertEquals(sketch1.getNormalizedRankError(true), rankErrorBeforeMerge); + + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), n); + assertEquals(sketch1.getMinValue(), 0f); + assertEquals(sketch1.getMaxValue(), n - 1f); + assertEquals(sketch1.getQuantile(0.5), n / 2f, n / 2 * PMF_EPS_FOR_K_256); + + //merge the other way + sketch2.merge(sketch1); + assertFalse(sketch1.isEmpty()); + assertEquals(sketch1.getN(), n); + assertEquals(sketch1.getMinValue(), 0f); + assertEquals(sketch1.getMaxValue(), n - 1f); + assertEquals(sketch1.getQuantile(0.5), n / 2f, n / 2 * PMF_EPS_FOR_K_256); + } + + @Test + public void mergeExactModeLowerK() { + final KllDirectFloatsSketch sketch1 = getDFSketch(256, 0); + final KllDirectFloatsSketch sketch2 = getDFSketch(128, 0); + final int n = 10000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + sketch2.update(1); + + // rank error should not be affected by a merge with a sketch in exact mode with lower K + final double rankErrorBeforeMerge = sketch1.getNormalizedRankError(true); + sketch1.merge(sketch2); + assertEquals(sketch1.getNormalizedRankError(true), rankErrorBeforeMerge); + } + + @Test + public void mergeMinMinValueFromOther() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + final KllDirectFloatsSketch sketch2 = getDFSketch(200, 0); + sketch1.update(1); + sketch2.update(2); + sketch2.merge(sketch1); + assertEquals(sketch2.getMinValue(), 1.0F); + } + + @Test + public void mergeMinAndMaxFromOther() { + final KllDirectFloatsSketch sketch1 = getDFSketch(8, 0); //was 200 + final KllDirectFloatsSketch sketch2 = getDFSketch(8, 0); //was 200 + for (int i = 1; i <= 9; i++) { //was 1_000_000 + sketch1.update(i); + } 
+ //System.out.println(sketch1.toString(true, true)); + sketch2.merge(sketch1); + assertEquals(sketch2.getMinValue(), 1F); + assertEquals(sketch2.getMaxValue(), 9F); //was 1_000_000 + } + + @SuppressWarnings("unused") + @Test(expectedExceptions = SketchesArgumentException.class) + public void kTooSmall() { + final KllDirectFloatsSketch sketch1 = getDFSketch(KllSketch.DEFAULT_M - 1, 0); + } + + @SuppressWarnings("unused") + @Test(expectedExceptions = SketchesArgumentException.class) + public void kTooLarge() { + final KllDirectFloatsSketch sketch1 = getDFSketch(KllSketch.MAX_K + 1, 0); + } + + @Test + public void minK() { + final KllDirectFloatsSketch sketch = getDFSketch(KllSketch.DEFAULT_M, 0); + for (int i = 0; i < 1000; i++) { + sketch.update(i); + } + assertEquals(sketch.getK(), KllSketch.DEFAULT_M); + assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_8); + } + + @Test + public void maxK() { + final KllDirectFloatsSketch sketch = getDFSketch(KllSketch.MAX_K, 0); + for (int i = 0; i < 1000; i++) { + sketch.update(i); + } + assertEquals(sketch.getK(), KllSketch.MAX_K); + assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_256); + } + + @Test + public void serializeDeserializeEmptyViaCompactHeapify() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + final byte[] bytes = sketch1.toByteArray(); + final KllFloatsSketch sketch2 = KllFloatsSketch.heapify(Memory.wrap(bytes)); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); + assertTrue(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertTrue(Double.isNaN(sketch2.getMinValue())); + assertTrue(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test + public void 
serializeDeserializeEmptyViaUpdatableWritableWrap() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + final byte[] bytes = sketch1.toUpdatableByteArray(); + final KllDirectFloatsSketch sketch2 = + KllDirectFloatsSketch.writableWrap(WritableMemory.writableWrap(bytes),memReqSvr); + assertEquals(bytes.length, sketch1.getCurrentUpdatableSerializedSizeBytes()); + assertTrue(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertTrue(Double.isNaN(sketch2.getMinValue())); + assertTrue(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test + public void serializeDeserializeOneItemViaCompactHeapify() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + sketch1.update(1); + final byte[] bytes = sketch1.toByteArray(); + final KllFloatsSketch sketch2 = KllFloatsSketch.heapify(Memory.wrap(bytes)); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), 1); + assertEquals(sketch2.getN(), 1); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertFalse(Double.isNaN(sketch2.getMinValue())); + assertFalse(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), 8 + Float.BYTES); + } + + @Test + public void serializeDeserializeOneItemViaUpdatableWritableWrap() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + sketch1.update(1); + final byte[] bytes = sketch1.toUpdatableByteArray(); + final KllDirectFloatsSketch sketch2 = + KllDirectFloatsSketch.writableWrap(WritableMemory.writableWrap(bytes),memReqSvr); + assertEquals(bytes.length, 
sketch1.getCurrentUpdatableSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), 1); + assertEquals(sketch2.getN(), 1); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertFalse(Double.isNaN(sketch2.getMinValue())); + assertFalse(Double.isNaN(sketch2.getMaxValue())); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), 8 + Float.BYTES); + } + + @Test + public void serializeDeserializeFullViaCompactHeapify() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + final int n = 1000; + for (int i = 0; i < n; i++) { sketch1.update(i); } + final byte[] bytes = sketch1.toByteArray(); + final KllFloatsSketch sketch2 = KllFloatsSketch.heapify(Memory.wrap(bytes)); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertEquals(sketch2.getMinValue(), sketch1.getMinValue()); + assertEquals(sketch2.getMaxValue(), sketch1.getMaxValue()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test + public void serializeDeserializeFullViaUpdatableWritableWrap() { + final KllDirectFloatsSketch sketch1 = getDFSketch(200, 0); + final int n = 1000; + for (int i = 0; i < n; i++) { + sketch1.update(i); + } + final byte[] bytes = sketch1.toUpdatableByteArray(); + final KllDirectFloatsSketch sketch2 = + KllDirectFloatsSketch.writableWrap(WritableMemory.writableWrap(bytes),memReqSvr); + assertEquals(bytes.length, sketch1.getCurrentUpdatableSerializedSizeBytes()); + assertFalse(sketch2.isEmpty()); + assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); + assertEquals(sketch2.getN(), sketch1.getN()); + 
assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); + assertEquals(sketch2.getMinValue(), sketch1.getMinValue()); + assertEquals(sketch2.getMaxValue(), sketch1.getMaxValue()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void outOfOrderSplitPoints() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(0); + sketch.getCDF(new float[] {1, 0}); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void nanSplitPoint() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(0); + sketch.getCDF(new float[] {Float.NaN}); + } + + @Test + public void getQuantiles() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 0); + sketch.update(1); + sketch.update(2); + sketch.update(3); + final float[] quantiles1 = sketch.getQuantiles(new double[] {0, 0.5, 1}); + final float[] quantiles2 = sketch.getQuantiles(3); + assertEquals(quantiles1, quantiles2); + assertEquals(quantiles1[0], 1f); + assertEquals(quantiles1[1], 2f); + assertEquals(quantiles1[2], 3f); + } + + @Test + public void checkSimpleMergeDirect() { //used for troubleshooting + int k = 20; + int n1 = 21; + int n2 = 43; + KllFloatsSketch sk1 = new KllFloatsSketch(k); + KllFloatsSketch sk2 = new KllFloatsSketch(k); + for (int i = 1; i <= n1; i++) { + sk1.update(i); + } + for (int i = 1; i <= n2; i++) { + sk2.update(i + 100); + } + println("SK1:"); + println(sk1.toString(true, true)); + println("SK2:"); + println(sk2.toString(true, true)); + WritableMemory wmem1 = WritableMemory.writableWrap(sk1.toUpdatableByteArray()); + WritableMemory wmem2 = WritableMemory.writableWrap(sk2.toUpdatableByteArray()); + KllDirectFloatsSketch dsk1 = KllDirectFloatsSketch.writableWrap(wmem1, memReqSvr); + KllDirectFloatsSketch dsk2 = KllDirectFloatsSketch.writableWrap(wmem2, memReqSvr); + 
println("BEFORE MERGE"); + println(dsk1.toString(true, true)); + dsk1.merge(dsk2); + println("AFTER MERGE"); + println(dsk1.toString(true, true)); + } + + @Test + public void checkSketchInitializeDirectDoubleUpdatableMem() { + int k = 20; //don't change this + KllDirectFloatsSketch sk; + KllFloatsSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + println("#### CASE: DOUBLE FULL DIRECT FROM UPDATABLE"); + sk2 = new KllFloatsSketch(k); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk2.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0); + assertEquals(sk.getMinFloatValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE EMPTY HEAPIFIED FROM UPDATABLE"); + sk2 = new KllFloatsSketch(k); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Double.NaN); + assertEquals(sk.getMinFloatValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + 
assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE SINGLE HEAPIFIED FROM UPDATABLE"); + sk2 = new KllFloatsSketch(k); + sk2.update(1); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0); + assertEquals(sk.getMinFloatValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkGetWritableMemory() { + final KllDirectFloatsSketch sketch = getDFSketch(200, 200); + assertEquals(sketch.getK(), 200); + assertEquals(sketch.getN(), 200); + assertFalse(sketch.isEmpty()); + assertTrue(sketch.isUpdatableMemory()); + assertFalse(sketch.isEstimationMode()); + assertTrue(sketch.isFloatsSketch()); + assertFalse(sketch.isLevelZeroSorted()); + assertFalse(sketch.isDoublesSketch()); + + final WritableMemory wmem = sketch.getWritableMemory(); + final KllFloatsSketch sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), 200); + assertEquals(sk.getN(), 200); + assertFalse(sk.isEmpty()); + assertFalse(sk.isUpdatableMemory()); + assertFalse(sk.isEstimationMode()); + assertTrue(sk.isFloatsSketch()); + assertFalse(sk.isLevelZeroSorted()); + assertFalse(sk.isDoublesSketch()); + } + + @Test + public void checkReset() { + WritableMemory dstMem = WritableMemory.allocate(3000); + KllDirectFloatsSketch sk = KllDirectFloatsSketch.newInstance(20, dstMem, memReqSvr); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n1 = sk.getN(); + float min1 = sk.getMinValue(); + float max1 = 
sk.getMaxValue(); + sk.reset(); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n2 = sk.getN(); + float min2 = sk.getMinValue(); + float max2 = sk.getMaxValue(); + assertEquals(n2, n1); + assertEquals(min2, min1); + assertEquals(max2, max1); + } + + @Test + public void checkHeapify() { + WritableMemory dstMem = WritableMemory.allocate(6000); + KllDirectFloatsSketch sk = KllDirectFloatsSketch.newInstance(20, dstMem, memReqSvr); + for (int i = 1; i <= 100; i++) { sk.update(i); } + KllFloatsSketch sk2 = KllDirectFloatsSketch.heapify(dstMem); + assertEquals(sk2.getMinValue(), 1.0); + assertEquals(sk2.getMaxValue(), 100.0); + } + + private static KllDirectFloatsSketch getDFSketch(final int k, final int n) { + KllFloatsSketch sk = new KllFloatsSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } + byte[] byteArr = sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + + KllDirectFloatsSketch dfsk = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + return dfsk; + } + + @Test + public void printlnTest() { + println("PRINTING: " + this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(final String s) { + //System.out.println(s); //disable here + } + +} diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java index 391052a64..64a995038 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchIteratorTest.java @@ -61,6 +61,4 @@ public void bigSketches() { } } - } - diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java index 7eeea733d..b149d3c1c 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java +++ 
b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java @@ -77,19 +77,20 @@ public void oneItem() { assertEquals(sketch.getNumRetained(), 1); assertEquals(sketch.getRank(1), 0.0); assertEquals(sketch.getRank(2), 1.0); - assertEquals(sketch.getMinValue(), 1f); - assertEquals(sketch.getMaxValue(), 1f); - assertEquals(sketch.getQuantile(0.5), 1f); + assertEquals(sketch.getMinValue(), 1.0); + assertEquals(sketch.getMaxValue(), 1.0); + assertEquals(sketch.getQuantile(0.5), 1.0); } @Test public void manyItemsEstimationMode() { final KllDoublesSketch sketch = new KllDoublesSketch(); - final int n = 1000000; + final int n = 1_000_000; + for (int i = 0; i < n; i++) { sketch.update(i); - assertEquals(sketch.getN(), i + 1); } + assertEquals(sketch.getN(), n); // test getRank for (int i = 0; i < n; i++) { @@ -125,7 +126,7 @@ public void manyItemsEstimationMode() { assertTrue(previousQuantile <= quantile); previousQuantile = quantile; } -} + } @Test public void getRankGetCdfGetPmfConsistency() { @@ -156,23 +157,23 @@ public void merge() { final KllDoublesSketch sketch2 = new KllDoublesSketch(); final int n = 10000; for (int i = 0; i < n; i++) { - sketch1.update(i); - sketch2.update(2 * n - i - 1); + sketch1.update(i * 1.0); + sketch2.update((2 * n - i - 1) * 1.0); } - assertEquals(sketch1.getMinValue(), 0.0f); - assertEquals(sketch1.getMaxValue(), n - 1f); + assertEquals(sketch1.getMinValue(), 0.0); + assertEquals(sketch1.getMaxValue(), (n - 1) * 1.0); - assertEquals(sketch2.getMinValue(), n); - assertEquals(sketch2.getMaxValue(), 2f * n - 1f); + assertEquals(sketch2.getMinValue(), n * 1.0); + assertEquals(sketch2.getMaxValue(), (2 * n - 1) * 1.0); sketch1.merge(sketch2); assertFalse(sketch1.isEmpty()); assertEquals(sketch1.getN(), 2L * n); - assertEquals(sketch1.getMinValue(), 0f); - assertEquals(sketch1.getMaxValue(), 2f * n - 1); - assertEquals(sketch1.getQuantile(0.5), n, n * PMF_EPS_FOR_K_256); + assertEquals(sketch1.getMinValue(), 0.0); + 
assertEquals(sketch1.getMaxValue(), (2 * n - 1) * 1.0); + assertEquals(sketch1.getQuantile(0.5), n * 1.0, n * PMF_EPS_FOR_K_256); } @Test @@ -189,7 +190,7 @@ public void mergeLowerK() { assertEquals(sketch1.getMaxValue(), n - 1f); assertEquals(sketch2.getMinValue(), n); - assertEquals(sketch2.getMaxValue(), 2f * n - 1f); + assertEquals(sketch2.getMaxValue(), 2f * n - 1.0); assertTrue(sketch1.getNormalizedRankError(false) < sketch2.getNormalizedRankError(false)); assertTrue(sketch1.getNormalizedRankError(true) < sketch2.getNormalizedRankError(true)); @@ -201,8 +202,8 @@ public void mergeLowerK() { assertFalse(sketch1.isEmpty()); assertEquals(sketch1.getN(), 2 * n); - assertEquals(sketch1.getMinValue(), 0f); - assertEquals(sketch1.getMaxValue(), 2f * n - 1f); + assertEquals(sketch1.getMinValue(), 0); + assertEquals(sketch1.getMaxValue(), 2f * n - 1.0); assertEquals(sketch1.getQuantile(0.5), n, n * PMF_EPS_FOR_K_128); } @@ -222,17 +223,17 @@ public void mergeEmptyLowerK() { assertFalse(sketch1.isEmpty()); assertEquals(sketch1.getN(), n); - assertEquals(sketch1.getMinValue(), 0f); - assertEquals(sketch1.getMaxValue(), n - 1f); - assertEquals(sketch1.getQuantile(0.5), n / 2f, n / 2 * PMF_EPS_FOR_K_256); + assertEquals(sketch1.getMinValue(), 0); + assertEquals(sketch1.getMaxValue(), n - 1.0); + assertEquals(sketch1.getQuantile(0.5), n / 2.0, n / 2 * PMF_EPS_FOR_K_256); //merge the other way sketch2.merge(sketch1); assertFalse(sketch1.isEmpty()); assertEquals(sketch1.getN(), n); assertEquals(sketch1.getMinValue(), 0f); - assertEquals(sketch1.getMaxValue(), n - 1f); - assertEquals(sketch1.getQuantile(0.5), n / 2f, n / 2 * PMF_EPS_FOR_K_256); + assertEquals(sketch1.getMaxValue(), n - 1.0); + assertEquals(sketch1.getQuantile(0.5), n / 2.0, n / 2 * PMF_EPS_FOR_K_256); } @Test @@ -258,50 +259,50 @@ public void mergeMinMinValueFromOther() { sketch1.update(1); sketch2.update(2); sketch2.merge(sketch1); - assertEquals(sketch2.getMinValue(), 1.0F); + 
assertEquals(sketch2.getMinValue(), 1.0); } @Test public void mergeMinAndMaxFromOther() { final KllDoublesSketch sketch1 = new KllDoublesSketch(); - for (int i = 0; i < 1000000; i++) { + for (int i = 1; i <= 1_000_000; i++) { sketch1.update(i); } final KllDoublesSketch sketch2 = new KllDoublesSketch(); sketch2.merge(sketch1); - assertEquals(sketch2.getMinValue(), 0F); - assertEquals(sketch2.getMaxValue(), 999999F); + assertEquals(sketch2.getMinValue(), 1); + assertEquals(sketch2.getMaxValue(), 1_000_000); } @SuppressWarnings("unused") @Test(expectedExceptions = SketchesArgumentException.class) public void kTooSmall() { - new KllDoublesSketch(BaseKllSketch.MIN_K - 1); + new KllDoublesSketch(KllSketch.DEFAULT_M - 1); } @SuppressWarnings("unused") @Test(expectedExceptions = SketchesArgumentException.class) public void kTooLarge() { - new KllDoublesSketch(BaseKllSketch.MAX_K + 1); + new KllDoublesSketch(KllSketch.MAX_K + 1); } @Test public void minK() { - final KllDoublesSketch sketch = new KllDoublesSketch(BaseKllSketch.MIN_K); + final KllDoublesSketch sketch = new KllDoublesSketch(KllSketch.DEFAULT_M); for (int i = 0; i < 1000; i++) { sketch.update(i); } - assertEquals(sketch.getK(), BaseKllSketch.MIN_K); + assertEquals(sketch.getK(), KllSketch.DEFAULT_M); assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_8); } @Test public void maxK() { - final KllDoublesSketch sketch = new KllDoublesSketch(BaseKllSketch.MAX_K); + final KllDoublesSketch sketch = new KllDoublesSketch(KllSketch.MAX_K); for (int i = 0; i < 1000; i++) { sketch.update(i); } - assertEquals(sketch.getK(), BaseKllSketch.MAX_K); + assertEquals(sketch.getK(), KllSketch.MAX_K); assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_256); } @@ -310,14 +311,14 @@ public void serializeDeserializeEmpty() { final KllDoublesSketch sketch1 = new KllDoublesSketch(); final byte[] bytes = sketch1.toByteArray(); final KllDoublesSketch sketch2 = KllDoublesSketch.heapify(Memory.wrap(bytes)); - 
assertEquals(bytes.length, sketch1.getSerializedSizeBytes()); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); assertTrue(sketch2.isEmpty()); assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); assertEquals(sketch2.getN(), sketch1.getN()); assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); assertTrue(Double.isNaN(sketch2.getMinValue())); assertTrue(Double.isNaN(sketch2.getMaxValue())); - assertEquals(sketch2.getSerializedSizeBytes(), sketch1.getSerializedSizeBytes()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); } @Test @@ -326,16 +327,26 @@ public void serializeDeserializeOneItem() { sketch1.update(1); final byte[] bytes = sketch1.toByteArray(); final KllDoublesSketch sketch2 = KllDoublesSketch.heapify(Memory.wrap(bytes)); - assertEquals(bytes.length, sketch1.getSerializedSizeBytes()); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); assertFalse(sketch2.isEmpty()); assertEquals(sketch2.getNumRetained(), 1); assertEquals(sketch2.getN(), 1); assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); assertFalse(Double.isNaN(sketch2.getMinValue())); assertFalse(Double.isNaN(sketch2.getMaxValue())); - assertEquals(sketch2.getSerializedSizeBytes(), 8 + Double.BYTES); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), 8 + Double.BYTES); } + //@Test //not implemented from C++ yet + //public void deserializeOneItemV1() throws Exception { + // final byte[] bytes = getResourceBytes("kll_sketch_float_one_item_v1.sk"); + // final KllFloatsSketch sketch = KllFloatsSketch.heapify(Memory.wrap(bytes)); + // assertFalse(sketch.isEmpty()); + // assertFalse(sketch.isEstimationMode()); + // assertEquals(sketch.getN(), 1); + // assertEquals(sketch.getNumRetained(), 1); + //} + @Test public void serializeDeserialize() { final KllDoublesSketch sketch1 = new 
KllDoublesSketch(); @@ -345,14 +356,14 @@ public void serializeDeserialize() { } final byte[] bytes = sketch1.toByteArray(); final KllDoublesSketch sketch2 = KllDoublesSketch.heapify(Memory.wrap(bytes)); - assertEquals(bytes.length, sketch1.getSerializedSizeBytes()); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); assertFalse(sketch2.isEmpty()); assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); assertEquals(sketch2.getN(), sketch1.getN()); assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); assertEquals(sketch2.getMinValue(), sketch1.getMinValue()); assertEquals(sketch2.getMaxValue(), sketch1.getMaxValue()); - assertEquals(sketch2.getSerializedSizeBytes(), sketch1.getSerializedSizeBytes()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); } @Test(expectedExceptions = SketchesArgumentException.class) @@ -369,33 +380,6 @@ public void nanSplitPoint() { sketch.getCDF(new double[] {Double.NaN}); } - @Test - public void getMaxSerializedSizeBytes() { - final int sizeBytes = - KllDoublesSketch.getMaxSerializedSizeBytes(BaseKllSketch.DEFAULT_K, 1_000_000_000); - assertEquals(sizeBytes, 6184); - } - - @Test - public void checkUbOnNumLevels() { - assertEquals(KllHelper.ubOnNumLevels(0), 1); - } - - @Test - public void checkIntCapAux() { - int lvlCap = KllHelper.levelCapacity(10, 61, 0, 8); - assertEquals(lvlCap, 8); - lvlCap = KllHelper.levelCapacity(10, 61, 60, 8); - assertEquals(lvlCap, 10); - } - - @Test - public void checkSuperLargeKandLevels() { - //This is beyond what the sketch can be configured for. 
- final int size = KllHelper.computeTotalItemCapacity(1 << 29, 8, 61); - assertEquals(size, 1_610_612_846); - } - @Test public void getQuantiles() { final KllDoublesSketch sketch = new KllDoublesSketch(); @@ -405,9 +389,26 @@ public void getQuantiles() { final double[] quantiles1 = sketch.getQuantiles(new double[] {0, 0.5, 1}); final double[] quantiles2 = sketch.getQuantiles(3); assertEquals(quantiles1, quantiles2); - assertEquals(quantiles1[0], 1f); - assertEquals(quantiles1[1], 2f); - assertEquals(quantiles1[2], 3f); + assertEquals(quantiles1[0], 1.0); + assertEquals(quantiles1[1], 2.0); + assertEquals(quantiles1[2], 3.0); + } + + @Test + public void checkReset() { + KllDoublesSketch sk = new KllDoublesSketch(20); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n1 = sk.getN(); + double min1 = sk.getMinValue(); + double max1 = sk.getMaxValue(); + sk.reset(); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n2 = sk.getN(); + double min2 = sk.getMinValue(); + double max2 = sk.getMaxValue(); + assertEquals(n2, n1); + assertEquals(min2, min1); + assertEquals(max2, max1); } } diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java index ec1087d70..61d33f44b 100644 --- a/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllDoublesValidationTest.java @@ -217,8 +217,8 @@ public void checkTestResults() { } int numLevels = sketch.getNumLevels(); int numSamples = sketch.getNumRetained(); - int[] levels = sketch.getLevels(); - long hashedSamples = simpleHashOfSubArray(sketch.getItems(), levels[0], numSamples); + int[] levels = sketch.getLevelsArray(); + long hashedSamples = simpleHashOfSubArray(sketch.getDoubleItemsArray(), levels[0], numSamples); System.out.print(testI); assert correctResultsWithReset[(7 * testI) + 4] == numLevels; assert correctResultsWithReset[(7 * testI) + 5] == 
numSamples; diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java index 7732efce8..33d829fcc 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchIteratorTest.java @@ -22,7 +22,6 @@ import org.testng.Assert; import org.testng.annotations.Test; -@SuppressWarnings("javadoc") public class KllFloatsSketchIteratorTest { @Test diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java index fd2313b03..e1a35f584 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java @@ -86,7 +86,8 @@ public void oneItem() { @Test public void manyItemsEstimationMode() { final KllFloatsSketch sketch = new KllFloatsSketch(); - final int n = 1000000; + final int n = 1_000_000; + for (int i = 0; i < n; i++) { sketch.update(i); assertEquals(sketch.getN(), i + 1); @@ -99,7 +100,7 @@ public void manyItemsEstimationMode() { } // test getPMF - final double[] pmf = sketch.getPMF(new float[] {n / 2}); // split at median + final double[] pmf = sketch.getPMF(new float[] {n / 2.0F}); // split at median assertEquals(pmf.length, 2); assertEquals(pmf[0], 0.5, PMF_EPS_FOR_K_256); assertEquals(pmf[1], 0.5, PMF_EPS_FOR_K_256); @@ -157,23 +158,23 @@ public void merge() { final KllFloatsSketch sketch2 = new KllFloatsSketch(); final int n = 10000; for (int i = 0; i < n; i++) { - sketch1.update(i); - sketch2.update(2 * n - i - 1); + sketch1.update(i * 1.0f); + sketch2.update((2 * n - i - 1) * 1.0f); } assertEquals(sketch1.getMinValue(), 0.0f); - assertEquals(sketch1.getMaxValue(), n - 1f); + assertEquals(sketch1.getMaxValue(), (n - 1) * 1.0f); - assertEquals(sketch2.getMinValue(), n); - assertEquals(sketch2.getMaxValue(), 
2f * n - 1f); + assertEquals(sketch2.getMinValue(), n * 1.0f); + assertEquals(sketch2.getMaxValue(), (2 * n - 1) * 1.0f); sketch1.merge(sketch2); assertFalse(sketch1.isEmpty()); assertEquals(sketch1.getN(), 2L * n); - assertEquals(sketch1.getMinValue(), 0f); - assertEquals(sketch1.getMaxValue(), 2f * n - 1); - assertEquals(sketch1.getQuantile(0.5), n, n * PMF_EPS_FOR_K_256); + assertEquals(sketch1.getMinValue(), 0.0f); + assertEquals(sketch1.getMaxValue(), (2 * n - 1) * 1.0f); + assertEquals(sketch1.getQuantile(0.5), n * 1.0f, n * PMF_EPS_FOR_K_256); } @Test @@ -265,44 +266,44 @@ public void mergeMinMinValueFromOther() { @Test public void mergeMinAndMaxFromOther() { final KllFloatsSketch sketch1 = new KllFloatsSketch(); - for (int i = 0; i < 1000000; i++) { + for (int i = 1; i <= 1_000_000; i++) { sketch1.update(i); } - final KllFloatsSketch sketch2 = new KllFloatsSketch(); + final KllFloatsSketch sketch2 = new KllFloatsSketch(10); sketch2.merge(sketch1); - assertEquals(sketch2.getMinValue(), 0F); - assertEquals(sketch2.getMaxValue(), 999999F); + assertEquals(sketch2.getMinValue(), 1F); + assertEquals(sketch2.getMaxValue(), 1_000_000F); } @SuppressWarnings("unused") @Test(expectedExceptions = SketchesArgumentException.class) public void kTooSmall() { - new KllFloatsSketch(BaseKllSketch.MIN_K - 1); + new KllFloatsSketch(KllSketch.DEFAULT_M - 1); } @SuppressWarnings("unused") @Test(expectedExceptions = SketchesArgumentException.class) public void kTooLarge() { - new KllFloatsSketch(BaseKllSketch.MAX_K + 1); + new KllFloatsSketch(KllSketch.MAX_K + 1); } @Test public void minK() { - final KllFloatsSketch sketch = new KllFloatsSketch(BaseKllSketch.MIN_K); + final KllFloatsSketch sketch = new KllFloatsSketch(KllSketch.DEFAULT_M); for (int i = 0; i < 1000; i++) { sketch.update(i); } - assertEquals(sketch.getK(), BaseKllSketch.MIN_K); + assertEquals(sketch.getK(), KllSketch.DEFAULT_M); assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_8); } @Test public void 
maxK() { - final KllFloatsSketch sketch = new KllFloatsSketch(BaseKllSketch.MAX_K); + final KllFloatsSketch sketch = new KllFloatsSketch(KllSketch.MAX_K); for (int i = 0; i < 1000; i++) { sketch.update(i); } - assertEquals(sketch.getK(), BaseKllSketch.MAX_K); + assertEquals(sketch.getK(), KllSketch.MAX_K); assertEquals(sketch.getQuantile(0.5), 500, 500 * PMF_EPS_FOR_K_256); } @@ -311,14 +312,14 @@ public void serializeDeserializeEmpty() { final KllFloatsSketch sketch1 = new KllFloatsSketch(); final byte[] bytes = sketch1.toByteArray(); final KllFloatsSketch sketch2 = KllFloatsSketch.heapify(Memory.wrap(bytes)); - assertEquals(bytes.length, sketch1.getSerializedSizeBytes()); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); assertTrue(sketch2.isEmpty()); assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); assertEquals(sketch2.getN(), sketch1.getN()); assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); assertTrue(Float.isNaN(sketch2.getMinValue())); assertTrue(Float.isNaN(sketch2.getMaxValue())); - assertEquals(sketch2.getSerializedSizeBytes(), sketch1.getSerializedSizeBytes()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); } @Test @@ -327,14 +328,14 @@ public void serializeDeserializeOneItem() { sketch1.update(1); final byte[] bytes = sketch1.toByteArray(); final KllFloatsSketch sketch2 = KllFloatsSketch.heapify(Memory.wrap(bytes)); - assertEquals(bytes.length, sketch1.getSerializedSizeBytes()); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); assertFalse(sketch2.isEmpty()); assertEquals(sketch2.getNumRetained(), 1); assertEquals(sketch2.getN(), 1); assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); assertFalse(Float.isNaN(sketch2.getMinValue())); assertFalse(Float.isNaN(sketch2.getMaxValue())); - assertEquals(sketch2.getSerializedSizeBytes(), 8 + 
Float.BYTES); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), 8 + Float.BYTES); } @Test @@ -356,14 +357,14 @@ public void serializeDeserialize() { } final byte[] bytes = sketch1.toByteArray(); final KllFloatsSketch sketch2 = KllFloatsSketch.heapify(Memory.wrap(bytes)); - assertEquals(bytes.length, sketch1.getSerializedSizeBytes()); + assertEquals(bytes.length, sketch1.getCurrentCompactSerializedSizeBytes()); assertFalse(sketch2.isEmpty()); assertEquals(sketch2.getNumRetained(), sketch1.getNumRetained()); assertEquals(sketch2.getN(), sketch1.getN()); assertEquals(sketch2.getNormalizedRankError(false), sketch1.getNormalizedRankError(false)); assertEquals(sketch2.getMinValue(), sketch1.getMinValue()); assertEquals(sketch2.getMaxValue(), sketch1.getMaxValue()); - assertEquals(sketch2.getSerializedSizeBytes(), sketch1.getSerializedSizeBytes()); + assertEquals(sketch2.getCurrentCompactSerializedSizeBytes(), sketch1.getCurrentCompactSerializedSizeBytes()); } @Test(expectedExceptions = SketchesArgumentException.class) @@ -380,33 +381,6 @@ public void nanSplitPoint() { sketch.getCDF(new float[] {Float.NaN}); } - @Test - public void getMaxSerializedSizeBytes() { - final int sizeBytes = - KllFloatsSketch.getMaxSerializedSizeBytes(BaseKllSketch.DEFAULT_K, 1_000_000_000); - assertEquals(sizeBytes, 3160); - } - - @Test - public void checkUbOnNumLevels() { - assertEquals(KllHelper.ubOnNumLevels(0), 1); - } - - @Test - public void checkIntCapAux() { - int lvlCap = KllHelper.levelCapacity(10, 61, 0, 8); - assertEquals(lvlCap, 8); - lvlCap = KllHelper.levelCapacity(10, 61, 60, 8); - assertEquals(lvlCap, 10); - } - - @Test - public void checkSuperLargeKandLevels() { - //This is beyond what the sketch can be configured for. 
- final int size = KllHelper.computeTotalItemCapacity(1 << 29, 8, 61); - assertEquals(size, 1_610_612_846); - } - @Test public void getQuantiles() { final KllFloatsSketch sketch = new KllFloatsSketch(); @@ -421,4 +395,36 @@ public void getQuantiles() { assertEquals(quantiles1[2], 3f); } + @SuppressWarnings("deprecation") + @Test + public void checkDeprecatedMethods() { + final int k = 200; + final int n = 200; + int bytes = KllSketch.getMaxSerializedSizeBytes(k, n); //assumed float before + assertEquals(bytes, 832); + KllFloatsSketch sk = new KllFloatsSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } + final byte[] byteArr = sk.toByteArray(); + assertEquals(byteArr.length, 832); + bytes = sk.getSerializedSizeBytes(); + assertEquals(bytes, 832); + } + + @Test + public void checkReset() { + KllFloatsSketch sk = new KllFloatsSketch(20); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n1 = sk.getN(); + float min1 = sk.getMinValue(); + float max1 = sk.getMaxValue(); + sk.reset(); + for (int i = 1; i <= 100; i++) { sk.update(i); } + long n2 = sk.getN(); + float min2 = sk.getMinValue(); + float max2 = sk.getMaxValue(); + assertEquals(n2, n1); + assertEquals(min2, min1); + assertEquals(max2, max1); + } + } diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java index 71de641ed..9d3227c5b 100644 --- a/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllFloatsValidationTest.java @@ -218,8 +218,8 @@ public void checkTestResults() { } int numLevels = sketch.getNumLevels(); int numSamples = sketch.getNumRetained(); - int[] levels = sketch.getLevels(); - long hashedSamples = simpleHashOfSubArray(sketch.getItems(), levels[0], numSamples); + int[] levels = sketch.getLevelsArray(); + long hashedSamples = simpleHashOfSubArray(sketch.getFloatItemsArray(), levels[0], numSamples); 
System.out.print(testI); assert correctResultsWithReset[(7 * testI) + 4] == numLevels; assert correctResultsWithReset[(7 * testI) + 5] == numSamples; diff --git a/src/test/java/org/apache/datasketches/kll/KllHelperTest.java b/src/test/java/org/apache/datasketches/kll/KllHelperTest.java new file mode 100644 index 000000000..791bdd5c5 --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/KllHelperTest.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.kll.KllHelper.checkM; +import static org.apache.datasketches.kll.KllSketch.SketchType.DOUBLES_SKETCH; +import static org.apache.datasketches.kll.KllSketch.SketchType.FLOATS_SKETCH; +import static org.testng.Assert.assertEquals; + +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.kll.KllSketch.SketchType; +import org.apache.datasketches.memory.Memory; +import org.testng.annotations.Test; + +@SuppressWarnings("unused") +public class KllHelperTest { + + /** + * Println Object o + * @param o object to print + */ + static void println(Object o) { + //System.out.println(o.toString()); + } + + @Test + public void checkCheckM() { + try { + checkM(0); + } catch (SketchesArgumentException e) {} + try { + checkM(3); + } catch (SketchesArgumentException e) {} + try { + checkM(10); + } catch (SketchesArgumentException e) {} + } + + @Test + public void checkGetKFromEps() { + final int k = KllSketch.DEFAULT_K; + final double eps = KllHelper.getNormalizedRankError(k, false); + final double epsPmf = KllHelper.getNormalizedRankError(k, true); + final int kEps = KllSketch.getKFromEpsilon(eps, false); + final int kEpsPmf = KllSketch.getKFromEpsilon(epsPmf, true); + assertEquals(kEps, k); + assertEquals(kEpsPmf, k); + } + + @Test + public void checkIntCapAux() { + int lvlCap = KllHelper.levelCapacity(10, 61, 0, 8); + assertEquals(lvlCap, 8); + lvlCap = KllHelper.levelCapacity(10, 61, 60, 8); + assertEquals(lvlCap, 10); + } + + @Test + public void checkSuperLargeKandLevels() { + //This is beyond what the sketch can be configured for. 
+ final int size = KllHelper.computeTotalItemCapacity(1 << 29, 8, 61); + assertEquals(size, 1_610_612_846); + } + + @Test + public void checkUbOnNumLevels() { + assertEquals(KllHelper.ubOnNumLevels(0), 1); + } + + @Test + public void checkUpdatableSerDe() { + KllDoublesSketch sk = new KllDoublesSketch(200); + for (int i = 1; i <= 533; i++) { sk.update(i); } + int retained = sk.getNumRetained(); + int numLevels = ((KllHeapSketch)sk).getNumLevels(); + println("NumLevels: " + numLevels); + println("NumRetained: " + retained); + + byte[] compByteArr1 = sk.toByteArray(); + int compBytes1 = compByteArr1.length; + println("compBytes1: " + compBytes1); + + byte[] upByteArr1 = sk.toUpdatableByteArray(); + int upBytes1 = upByteArr1.length; + println("upBytes1: " + upBytes1); + + Memory mem; + KllDoublesSketch sk2; + + mem = Memory.wrap(compByteArr1); + sk2 = KllDoublesSketch.heapify(mem); + byte[] compByteArr2 = sk2.toByteArray(); + int compBytes2 = compByteArr2.length; + println("compBytes2: " + compBytes2); + assertEquals(compBytes1, compBytes2); + assertEquals(sk2.getNumRetained(), retained); + + mem = Memory.wrap(compByteArr2); + sk2 = KllDoublesSketch.heapify(mem); + byte[] upByteArr2 = sk2.toUpdatableByteArray(); + int upBytes2 = upByteArr2.length; + println("upBytes2: " + upBytes2); + assertEquals(upBytes1, upBytes2); + assertEquals(sk2.getNumRetained(), retained); + } + + + @Test + public void getMaxCompactDoublesSerializedSizeBytes() { + final int sizeBytes = KllSketch.getMaxSerializedSizeBytes(KllSketch.DEFAULT_K, 1L << 30, DOUBLES_SKETCH, false); + assertEquals(sizeBytes, 5704); + } + + @Test + public void getMaxCompactFloatsSerializedSizeBytes() { + final int sizeBytes = KllSketch.getMaxSerializedSizeBytes(KllSketch.DEFAULT_K, 1L << 30, FLOATS_SKETCH, false); + assertEquals(sizeBytes, 2908); + } + + @Test + public void getMaxUpdatableDoubleSerializedSizeBytes() { + final int sizeBytes = KllSketch.getMaxSerializedSizeBytes(KllSketch.DEFAULT_K, 1L << 30, 
DOUBLES_SKETCH, true); + assertEquals(sizeBytes, 5708); + } + + @Test + public void getMaxUpdatableFloatsSerializedSizeBytes() { + final int sizeBytes = KllSketch.getMaxSerializedSizeBytes(KllSketch.DEFAULT_K, 1L << 30, FLOATS_SKETCH, true); + assertEquals(sizeBytes, 2912); + } + + @Test + public void getStatsAtNumLevels() { + int k = 200; + int m = 8; + int numLevels = 23; + KllHelper.LevelStats lvlStats = KllHelper.getFinalSketchStatsAtNumLevels(k, m, numLevels, false); + assertEquals(lvlStats.items, 697); + assertEquals(lvlStats.n, 1257766904); + } + + @Test + public void getStatsAtNumLevels2() { + int k = 20; + int m = 8; + int numLevels = 2; + KllHelper.LevelStats lvlStats = KllHelper.getFinalSketchStatsAtNumLevels(k, KllSketch.DEFAULT_M, numLevels, false); + assertEquals(lvlStats.numLevels, 2); + assertEquals(lvlStats.items, 33); + } + + @Test + public void testGetAllLevelStats() { + long n = 1L << 30; + int k = 200; + int m = 8; + KllHelper.GrowthStats gStats = KllHelper.getGrowthSchemeForGivenN(k, m, n, DOUBLES_SKETCH, false); + assertEquals(gStats.compactBytes, 5704); + } + + @Test + public void testGetAllLevelStats2() { + long n = 533; + int k = 200; + int m = 8; + KllHelper.GrowthStats gStats = KllHelper.getGrowthSchemeForGivenN(k, KllSketch.DEFAULT_M, n, DOUBLES_SKETCH, false); + assertEquals(gStats.numLevels, 2); + assertEquals(gStats.maxItems, 333); + + } +} diff --git a/src/test/java/org/apache/datasketches/kll/KllMemoryValidateTest.java b/src/test/java/org/apache/datasketches/kll/KllMemoryValidateTest.java new file mode 100644 index 000000000..324954156 --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/KllMemoryValidateTest.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.apache.datasketches.kll.KllPreambleUtil.*; + +import org.apache.datasketches.Family; +import org.apache.datasketches.SketchesArgumentException; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +@SuppressWarnings("unused") +public class KllMemoryValidateTest { + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidFamily() { + KllFloatsSketch sk = new KllFloatsSketch(); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryFamilyID(wmem, Family.KLL.getID() - 1); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidSerVer() { + KllFloatsSketch sk = new KllFloatsSketch(); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemorySerVer(wmem, SERIAL_VERSION_EMPTY_FULL - 1); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidEmptyAndSingle() { + KllFloatsSketch sk = new KllFloatsSketch(); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryFlags(wmem, EMPTY_BIT_MASK | 
SINGLE_ITEM_BIT_MASK); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidUpdatableAndSerVer() { + KllFloatsSketch sk = new KllFloatsSketch(); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryFlags(wmem, UPDATABLE_BIT_MASK); + setMemorySerVer(wmem, SERIAL_VERSION_EMPTY_FULL); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidSingleAndPreInts() { + KllFloatsSketch sk = new KllFloatsSketch(); + sk.update(1); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryPreInts(wmem, PREAMBLE_INTS_FULL); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidSingleAndSerVer() { + KllFloatsSketch sk = new KllFloatsSketch(); + sk.update(1); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemorySerVer(wmem, SERIAL_VERSION_EMPTY_FULL); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidEmptyDoublesAndPreIntsFull() { + KllDoublesSketch sk = new KllDoublesSketch(); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryPreInts(wmem, PREAMBLE_INTS_FULL); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidSingleDoubleCompactAndSerVer() { + KllDoublesSketch sk = new KllDoublesSketch(); + sk.update(1); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemorySerVer(wmem, SERIAL_VERSION_EMPTY_FULL); + KllMemoryValidate 
memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidDoubleUpdatableAndPreInts() { + KllDoublesSketch sk = new KllDoublesSketch(); + byte[] byteArr = sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryPreInts(wmem, PREAMBLE_INTS_EMPTY_SINGLE); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidFloatFullAndPreInts() { + KllFloatsSketch sk = new KllFloatsSketch(); + sk.update(1); sk.update(2); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryPreInts(wmem, PREAMBLE_INTS_EMPTY_SINGLE); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidFloatUpdatableFullAndPreInts() { + KllFloatsSketch sk = new KllFloatsSketch(); + sk.update(1); sk.update(2); + byte[] byteArr = sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryPreInts(wmem, PREAMBLE_INTS_EMPTY_SINGLE); + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkInvalidDoubleCompactSingleAndPreInts() { + KllDoublesSketch sk = new KllDoublesSketch(); + sk.update(1); + byte[] byteArr = sk.toByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + setMemoryPreInts(wmem, PREAMBLE_INTS_FULL);//should be 2, single + KllMemoryValidate memVal = new KllMemoryValidate(wmem); + } + +} + diff --git a/src/test/java/org/apache/datasketches/kll/MiscDirectDoublesTest.java b/src/test/java/org/apache/datasketches/kll/MiscDirectDoublesTest.java new file mode 100644 index 000000000..eedf5e44e --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/MiscDirectDoublesTest.java @@ -0,0 +1,440 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.kll; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import java.util.Objects; + +import org.apache.datasketches.memory.DefaultMemoryRequestServer; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +public class MiscDirectDoublesTest { + static final String LS = System.getProperty("line.separator"); + private static final DefaultMemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + + @Test + public void checkBounds() { + final KllDirectDoublesSketch sk = getDDSketch(200, 0); + for (int i = 0; i < 1000; i++) { + sk.update(i); + } + final double eps = sk.getNormalizedRankError(false); + final double est = sk.getQuantile(0.5); + final double ub = sk.getQuantileUpperBound(0.5); + final double lb = sk.getQuantileLowerBound(0.5); + assertEquals(ub, sk.getQuantile(.5 + eps)); + assertEquals(lb, sk.getQuantile(0.5 - eps)); + println("Ext : " + est); + println("UB : " + ub); + println("LB : " + lb); + } + + @Test + public void checkMisc() { + final KllDirectDoublesSketch sk = getDDSketch(8, 
0); + assertTrue(Objects.isNull(sk.getQuantiles(10))); + //sk.toString(true, true); + for (int i = 0; i < 20; i++) { sk.update(i); } + //sk.toString(true, true); + //sk.toByteArray(); + final double[] items = sk.getDoubleItemsArray(); + assertEquals(items.length, 16); + final int[] levels = sk.getLevelsArray(); + assertEquals(levels.length, 3); + assertEquals(sk.getNumLevels(), 2); + } + + //@Test //enable static println(..) for visual checking + public void visualCheckToString() { + final KllDirectDoublesSketch sk = getDDSketch(20, 0); + for (int i = 0; i < 10; i++) { sk.update(i + 1); } + println(sk.toString(true, true)); + + final KllDirectDoublesSketch sk2 = getDDSketch(20, 0); + for (int i = 0; i < 400; i++) { sk2.update(i + 1); } + println("\n" + sk2.toString(true, true)); + + sk2.merge(sk); + final String s2 = sk2.toString(true, true); + println(LS + s2); + } + + //@Test + public void viewCompactions() { + final KllDirectDoublesSketch sk = getDDSketch(20, 0); + show(sk, 20); + show(sk, 21); //compaction 1 + show(sk, 43); + show(sk, 44); //compaction 2 + show(sk, 54); + show(sk, 55); //compaction 3 + show(sk, 73); + show(sk, 74); //compaction 4 + show(sk, 88); + show(sk, 89); //compaction 5 + show(sk, 96); + show(sk, 97); //compaction 6 + show(sk, 108); + } + + private static void show(final KllDirectDoublesSketch sk, int limit) { + int i = (int) sk.getN(); + for ( ; i < limit; i++) { sk.update(i + 1); } + println(sk.toString(true, true)); + } + + @Test + public void checkSketchInitializeDoubleHeap() { + int k = 20; //don't change this + KllDirectDoublesSketch sk; + + //println("#### CASE: DOUBLE FULL HEAP"); + sk = getDDSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + 
assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: DOUBLE HEAP EMPTY"); + sk = getDDSketch(k, 0); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: DOUBLE HEAP SINGLE"); + sk = getDDSketch(k, 0); + sk.update(1); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkSketchInitializeDoubleHeapifyCompactMem() { + int k = 20; //don't change this + KllDoublesSketch sk; + KllDirectDoublesSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + //println("#### CASE: DOUBLE FULL HEAPIFIED FROM COMPACT"); + sk2 = getDDSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = 
KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: DOUBLE EMPTY HEAPIFIED FROM COMPACT"); + sk2 = getDDSketch(k, 0); + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: DOUBLE SINGLE HEAPIFIED FROM COMPACT"); + sk2 = getDDSketch(k, 0); + sk2.update(1); + //println(sk2.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + 
assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkSketchInitializeDoubleHeapifyUpdatableMem() { + int k = 20; //don't change this + KllDoublesSketch sk; + KllDirectDoublesSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + //println("#### CASE: DOUBLE FULL HEAPIFIED FROM UPDATABLE"); + sk2 = getDDSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk2.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + // println("#### CASE: DOUBLE EMPTY HEAPIFIED FROM UPDATABLE"); + sk2 = getDDSketch(k, 0); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: DOUBLE SINGLE HEAPIFIED FROM UPDATABLE"); + sk2 = getDDSketch(k, 0); 
+ sk2.update(1); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkMemoryToStringDoubleUpdatable() { + int k = 20; //don't change this + KllDirectDoublesSketch sk; + KllDirectDoublesSketch sk2; + byte[] upBytes; + byte[] upBytes2; + WritableMemory wmem; + String s; + + println("#### CASE: DOUBLE FULL UPDATABLE"); + sk = getDDSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: DOUBLE EMPTY UPDATABLE"); + sk = getDDSketch(k, 0); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: DOUBLE SINGLE UPDATABL"); + sk = getDDSketch(k, 0); + sk.update(1); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + } + + @Test + public void checkSimpleMerge() { + int k = 20; + int n1 = 21; + int n2 = 21; + KllDirectDoublesSketch sk1 = getDDSketch(k, 0); + KllDirectDoublesSketch sk2 = getDDSketch(k, 0); + for (int i = 1; i <= n1; i++) { + sk1.update(i); + } + for (int i = 1; i <= n2; i++) { + sk2.update(i + 100); + } + println(sk1.toString(true, true)); + println(sk2.toString(true, true)); + sk1.merge(sk2); + println(sk1.toString(true, true)); + assertEquals(sk1.getMaxValue(), 121.0); + assertEquals(sk1.getMinValue(), 1.0); + } + + @Test + public void checkSizes() { + KllDirectDoublesSketch sk = getDDSketch(20, 0); + for (int i = 1; i <= 21; i++) { sk.update(i); } + //println(sk.toString(true, true)); + byte[] byteArr1 = sk.toUpdatableByteArray(); + int size1 = sk.getCurrentUpdatableSerializedSizeBytes(); + assertEquals(size1, byteArr1.length); + byte[] byteArr2 = sk.toByteArray(); + int size2 = sk.getCurrentCompactSerializedSizeBytes(); + assertEquals(size2, byteArr2.length); + } + + @Test + public void checkNewInstance() { + int k = 200; + WritableMemory dstMem = WritableMemory.allocate(6000); + KllDirectDoublesSketch sk = KllDirectDoublesSketch.newInstance(k, dstMem, memReqSvr); + for (int i = 1; i <= 10_000; i++) {sk.update(i); } + assertEquals(sk.getMinValue(), 1.0); + assertEquals(sk.getMaxValue(), 10000.0); + //println(sk.toString(true, true)); + } + + @Test + public void checkDifferentM() { + int k = 20; + int m = 4; + WritableMemory dstMem = WritableMemory.allocate(1000); + KllDirectDoublesSketch sk = KllDirectDoublesSketch.newInstance(k, m, dstMem, memReqSvr); + for (int i = 1; i <= 200; i++) {sk.update(i); } + assertEquals(sk.getMinValue(), 1.0); + assertEquals(sk.getMaxValue(), 200.0); + } + + private static KllDirectDoublesSketch getDDSketch(final int k, final int n) { + KllDoublesSketch sk = new KllDoublesSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } + byte[] byteArr = 
sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + KllDirectDoublesSketch ddsk = KllDirectDoublesSketch.writableWrap(wmem, memReqSvr); + return ddsk; + } + + @Test + public void printlnTest() { + println("PRINTING: " + this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(final String s) { + //System.out.println(s); //disable here + } + +} diff --git a/src/test/java/org/apache/datasketches/kll/MiscDirectFloatsTest.java b/src/test/java/org/apache/datasketches/kll/MiscDirectFloatsTest.java new file mode 100644 index 000000000..597ebe5dc --- /dev/null +++ b/src/test/java/org/apache/datasketches/kll/MiscDirectFloatsTest.java @@ -0,0 +1,440 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.datasketches.kll; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import java.util.Objects; + +import org.apache.datasketches.memory.DefaultMemoryRequestServer; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +public class MiscDirectFloatsTest { + static final String LS = System.getProperty("line.separator"); + private static final DefaultMemoryRequestServer memReqSvr = new DefaultMemoryRequestServer(); + + @Test + public void checkBounds() { + final KllDirectFloatsSketch sk = getDFSketch(200, 0); + for (int i = 0; i < 1000; i++) { + sk.update(i); + } + final double eps = sk.getNormalizedRankError(false); + final float est = sk.getQuantile(0.5); + final float ub = sk.getQuantileUpperBound(0.5); + final float lb = sk.getQuantileLowerBound(0.5); + assertEquals(ub, sk.getQuantile(.5 + eps)); + assertEquals(lb, sk.getQuantile(0.5 - eps)); + println("Ext : " + est); + println("UB : " + ub); + println("LB : " + lb); + } + + @Test + public void checkMisc() { + final KllDirectFloatsSketch sk = getDFSketch(8, 0); + assertTrue(Objects.isNull(sk.getQuantiles(10))); + //sk.toString(true, true); + for (int i = 0; i < 20; i++) { sk.update(i); } + //sk.toString(true, true); + //sk.toByteArray(); + final float[] items = sk.getFloatItemsArray(); + assertEquals(items.length, 16); + final int[] levels = sk.getLevelsArray(); + assertEquals(levels.length, 3); + assertEquals(sk.getNumLevels(), 2); + } + + //@Test //enable static println(..) 
for visual checking + public void visualCheckToString() { + final KllDirectFloatsSketch sk = getDFSketch(20, 0); + for (int i = 0; i < 10; i++) { sk.update(i + 1); } + println(sk.toString(true, true)); + + final KllDirectFloatsSketch sk2 = getDFSketch(20, 0); + for (int i = 0; i < 400; i++) { sk2.update(i + 1); } + println("\n" + sk2.toString(true, true)); + + sk2.merge(sk); + final String s2 = sk2.toString(true, true); + println(LS + s2); + } + + //@Test + public void viewCompactions() { + final KllDirectFloatsSketch sk = getDFSketch(20, 0); + show(sk, 20); + show(sk, 21); //compaction 1 + show(sk, 43); + show(sk, 44); //compaction 2 + show(sk, 54); + show(sk, 55); //compaction 3 + show(sk, 73); + show(sk, 74); //compaction 4 + show(sk, 88); + show(sk, 89); //compaction 5 + show(sk, 96); + show(sk, 97); //compaction 6 + show(sk, 108); + } + + private static void show(final KllDirectFloatsSketch sk, int limit) { + int i = (int) sk.getN(); + for ( ; i < limit; i++) { sk.update(i + 1); } + println(sk.toString(true, true)); + } + + @Test + public void checkSketchInitializeFloatHeap() { + int k = 20; //don't change this + KllDirectFloatsSketch sk; + + //println("#### CASE: FLOAT FULL HEAP"); + sk = getDFSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: FLOAT HEAP EMPTY"); + sk = getDFSketch(k, 0); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + 
assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Float.NaN); + assertEquals(sk.getMinFloatValue(), Float.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: FLOAT HEAP SINGLE"); + sk = getDFSketch(k, 0); + sk.update(1); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkSketchInitializeFloatHeapifyCompactMem() { + int k = 20; //don't change this + KllFloatsSketch sk; + KllDirectFloatsSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + //println("#### CASE: FLOAT FULL HEAPIFIED FROM COMPACT"); + sk2 = getDFSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0F); + assertEquals(sk.getMinFloatValue(), 1.0f); + assertEquals(sk.getNumLevels(), 2); + 
assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: FLOAT EMPTY HEAPIFIED FROM COMPACT"); + sk2 = getDFSketch(k, 0); + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Float.NaN); + assertEquals(sk.getMinFloatValue(), Float.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: FLOAT SINGLE HEAPIFIED FROM COMPACT"); + sk2 = getDFSketch(k, 0); + sk2.update(1); + //println(sk2.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkSketchInitializeFloatHeapifyUpdatableMem() { + int k = 20; //don't change this + KllFloatsSketch sk; + KllDirectFloatsSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + //println("#### CASE: FLOAT FULL HEAPIFIED FROM UPDATABLE"); + sk2 = getDFSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk2.toString(true, true)); + compBytes = 
sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + // println("#### CASE: FLOAT EMPTY HEAPIFIED FROM UPDATABLE"); + sk2 = getDFSketch(k, 0); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Float.NaN); + assertEquals(sk.getMinFloatValue(), Float.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + //println("#### CASE: FLOAT SINGLE HEAPIFIED FROM UPDATABLE"); + sk2 = getDFSketch(k, 0); + sk2.update(1); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + //println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + 
assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkMemoryToStringFloatUpdatable() { + int k = 20; //don't change this + KllDirectFloatsSketch sk; + KllDirectFloatsSketch sk2; + byte[] upBytes; + byte[] upBytes2; + WritableMemory wmem; + String s; + + println("#### CASE: FLOAT FULL UPDATABLE"); + sk = getDFSketch(k, 0); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: FLOAT EMPTY UPDATABLE"); + sk = getDFSketch(k, 0); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: FLOAT SINGLE UPDATABL"); + sk = getDFSketch(k, 0); + sk.update(1); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + } + + @Test + public void checkSimpleMerge() { + int k = 20; + int n1 = 21; + int n2 = 21; + KllDirectFloatsSketch sk1 = getDFSketch(k, 0); + KllDirectFloatsSketch sk2 = getDFSketch(k, 0); + for (int i = 1; i <= n1; i++) { + sk1.update(i); + } + for (int i = 1; i <= n2; i++) { + sk2.update(i + 100); + } + println(sk1.toString(true, true)); + println(sk2.toString(true, true)); + sk1.merge(sk2); + println(sk1.toString(true, true)); + assertEquals(sk1.getMaxValue(), 121.0F); + assertEquals(sk1.getMinValue(), 1.0F); + } + + @Test + public void checkSizes() { + KllDirectFloatsSketch sk = getDFSketch(20, 0); + for (int i = 1; i <= 21; i++) { sk.update(i); } + //println(sk.toString(true, true)); + byte[] byteArr1 = sk.toUpdatableByteArray(); + int size1 = sk.getCurrentUpdatableSerializedSizeBytes(); + assertEquals(size1, byteArr1.length); + byte[] byteArr2 = sk.toByteArray(); + int size2 = sk.getCurrentCompactSerializedSizeBytes(); + assertEquals(size2, byteArr2.length); + } + + @Test + public void checkNewInstance() { + int k = 200; + WritableMemory dstMem = WritableMemory.allocate(3000); + KllDirectFloatsSketch sk = KllDirectFloatsSketch.newInstance(k, dstMem, memReqSvr); + for (int i = 1; i <= 10_000; i++) {sk.update(i); } + assertEquals(sk.getMinValue(), 1.0F); + 
assertEquals(sk.getMaxValue(), 10000.0F); + //println(sk.toString(true, true)); + } + + @Test + public void checkDifferentM() { + int k = 20; + int m = 4; + WritableMemory dstMem = WritableMemory.allocate(1000); + KllDirectFloatsSketch sk = KllDirectFloatsSketch.newInstance(k, m, dstMem, memReqSvr); + for (int i = 1; i <= 200; i++) {sk.update(i); } + assertEquals(sk.getMinValue(), 1.0); + assertEquals(sk.getMaxValue(), 200.0); + } + + private static KllDirectFloatsSketch getDFSketch(final int k, final int n) { + KllFloatsSketch sk = new KllFloatsSketch(k); + for (int i = 1; i <= n; i++) { sk.update(i); } + byte[] byteArr = sk.toUpdatableByteArray(); + WritableMemory wmem = WritableMemory.writableWrap(byteArr); + KllDirectFloatsSketch dfsk = KllDirectFloatsSketch.writableWrap(wmem, memReqSvr); + return dfsk; + } + + @Test + public void printlnTest() { + println("PRINTING: " + this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(final String s) { + //System.out.println(s); //disable here + } + +} diff --git a/src/test/java/org/apache/datasketches/kll/MiscDoublesTest.java b/src/test/java/org/apache/datasketches/kll/MiscDoublesTest.java index 791684eb9..276f52776 100644 --- a/src/test/java/org/apache/datasketches/kll/MiscDoublesTest.java +++ b/src/test/java/org/apache/datasketches/kll/MiscDoublesTest.java @@ -20,12 +20,13 @@ package org.apache.datasketches.kll; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import java.util.Objects; -import org.apache.datasketches.SketchesArgumentException; import org.apache.datasketches.memory.WritableMemory; +import org.apache.datasketches.SketchesArgumentException; import org.testng.annotations.Test; /** @@ -33,17 +34,7 @@ */ @SuppressWarnings("javadoc") public class MiscDoublesTest { - - @Test - public void checkGetKFromEps() { - final int k = BaseKllSketch.DEFAULT_K; - final double eps = 
BaseKllSketch.getNormalizedRankError(k, false); - final double epsPmf = BaseKllSketch.getNormalizedRankError(k, true); - final int kEps = BaseKllSketch.getKFromEpsilon(eps, false); - final int kEpsPmf = BaseKllSketch.getKFromEpsilon(epsPmf, true); - assertEquals(kEps, k); - assertEquals(kEpsPmf, k); - } + static final String LS = System.getProperty("line.separator"); @Test public void checkBounds() { @@ -66,7 +57,7 @@ public void checkBounds() { public void checkHeapifyExceptions1() { KllDoublesSketch sk = new KllDoublesSketch(); WritableMemory wmem = WritableMemory.writableWrap(sk.toByteArray()); - wmem.putByte(6, (byte)4); //corrupt M + wmem.putByte(6, (byte)3); //corrupt with odd M KllDoublesSketch.heapify(wmem); } @@ -106,31 +97,399 @@ public void checkHeapifyExceptions5() { @Test public void checkMisc() { - KllDoublesSketch sk = new KllDoublesSketch(8, true); + KllDoublesSketch sk = new KllDoublesSketch(8); assertTrue(Objects.isNull(sk.getQuantiles(10))); sk.toString(true, true); for (int i = 0; i < 20; i++) { sk.update(i); } sk.toString(true, true); sk.toByteArray(); - final double[] items = sk.getItems(); + final double[] items = sk.getDoubleItemsArray(); assertEquals(items.length, 16); - final int[] levels = sk.getLevels(); + final int[] levels = sk.getLevelsArray(); assertEquals(levels.length, 3); assertEquals(sk.getNumLevels(), 2); } - //@Test //requires visual check - public void visualCheck() { + //@Test //enable static println(..) 
for visual checking + public void visualCheckToString() { final KllDoublesSketch sketch = new KllDoublesSketch(20); for (int i = 0; i < 10; i++) { sketch.update(i + 1); } - println(sketch.toString(true, true)); + final String s1 = sketch.toString(true, true); + println(s1); final KllDoublesSketch sketch2 = new KllDoublesSketch(20); for (int i = 0; i < 400; i++) { sketch2.update(i + 1); } println("\n" + sketch2.toString(true, true)); sketch2.merge(sketch); - println("\n" + sketch2.toString(true, true)); + final String s2 = sketch2.toString(true, true); + println(LS + s2); + } + + @Test + public void viewCompactions() { + KllDoublesSketch sk = new KllDoublesSketch(20); + show(sk, 20); + show(sk, 21); //compaction 1 + show(sk, 43); + show(sk, 44); //compaction 2 + show(sk, 54); + show(sk, 55); //compaction 3 + show(sk, 73); + show(sk, 74); //compaction 4 + show(sk, 88); + show(sk, 89); //compaction 5 + show(sk, 96); + show(sk, 97); //compaction 6 + show(sk, 108); + } + + private static void show(final KllDoublesSketch sk, int limit) { + int i = (int) sk.getN(); + for ( ; i < limit; i++) { sk.update(i + 1); } + println(sk.toString(true, true)); + } + + @Test + public void checkGrowLevels() { + KllDoublesSketch sk = new KllDoublesSketch(20); + for (int i = 1; i <= 21; i++) { sk.update(i); } + assertEquals(sk.getNumLevels(), 2); + assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray()[2], 33); + } + + @Test + public void checkSketchInitializeDoubleHeap() { + int k = 20; //don't change this + KllDoublesSketch sk; + + println("#### CASE: DOUBLE FULL HEAP"); + sk = new KllDoublesSketch(k); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 33); + 
assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE HEAP EMPTY"); + sk = new KllDoublesSketch(k); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE HEAP SINGLE"); + sk = new KllDoublesSketch(k); + sk.update(1); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkSketchInitializeDoubleHeapifyCompactMem() { + int k = 20; //don't change this + KllDoublesSketch sk; + KllDoublesSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + println("#### CASE: DOUBLE FULL HEAPIFIED FROM COMPACT"); + sk2 = new KllDoublesSketch(k); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + 
assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE EMPTY HEAPIFIED FROM COMPACT"); + sk2 = new KllDoublesSketch(k); + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE SINGLE HEAPIFIED FROM COMPACT"); + sk2 = new KllDoublesSketch(k); + sk2.update(1); + println(sk2.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); 
+ } + + @Test + public void checkSketchInitializeDoubleHeapifyUpdatableMem() { + int k = 20; //don't change this + KllDoublesSketch sk; + KllDoublesSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + println("#### CASE: DOUBLE FULL HEAPIFIED FROM UPDATABLE"); + sk2 = new KllDoublesSketch(k); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk2.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxDoubleValue(), 21.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE EMPTY HEAPIFIED FROM UPDATABLE"); + sk2 = new KllDoublesSketch(k); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), Double.NaN); + assertEquals(sk.getMinDoubleValue(), Double.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: DOUBLE SINGLE HEAPIFIED FROM UPDATABLE"); + sk2 = new KllDoublesSketch(k); + sk2.update(1); + //println(sk.toString(true, true)); + compBytes = 
sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllDoublesSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getDoubleItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxDoubleValue(), 1.0); + assertEquals(sk.getMinDoubleValue(), 1.0); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkMemoryToStringDoubleCompact() { + int k = 20; // don't change this + KllDoublesSketch sk; + KllDoublesSketch sk2; + byte[] compBytes; + byte[] compBytes2; + WritableMemory wmem; + String s; + + println("#### CASE: DOUBLE FULL COMPACT"); + sk = new KllDoublesSketch(k); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + compBytes = sk.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDoublesSketch.heapify(wmem); + compBytes2 = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(compBytes, compBytes2); + + println("#### CASE: DOUBLE EMPTY COMPACT"); + sk = new KllDoublesSketch(20); + compBytes = sk.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDoublesSketch.heapify(wmem); + compBytes2 = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(compBytes, compBytes2); + + println("#### CASE: DOUBLE SINGLE COMPACT"); + sk = new KllDoublesSketch(20); + sk.update(1); + compBytes = sk.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDoublesSketch.heapify(wmem); + compBytes2 = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(compBytes, compBytes2); + } + + @Test + public void checkMemoryToStringDoubleUpdatable() { + int k = 20; //don't change this + KllDoublesSketch sk; + KllDoublesSketch sk2; + byte[] upBytes; + byte[] upBytes2; + WritableMemory wmem; + String s; + + println("#### CASE: DOUBLE FULL UPDATABLE"); + sk = new KllDoublesSketch(20); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDoublesSketch.heapify(wmem); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: DOUBLE EMPTY UPDATABLE"); + sk = new KllDoublesSketch(k); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDoublesSketch.heapify(wmem); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: DOUBLE SINGLE UPDATABL"); + sk = new KllDoublesSketch(k); + sk.update(1); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllDoublesSketch.heapify(wmem); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + } + + @Test + public void checkSimpleMerge() { + int k = 20; + int n1 = 21; + int n2 = 21; + KllDoublesSketch sk1 = new KllDoublesSketch(k); + KllDoublesSketch sk2 = new KllDoublesSketch(k); + for (int i = 1; i <= n1; i++) { + sk1.update(i); + } + for (int i = 1; i <= n2; i++) { + sk2.update(i + 100); + } + println(sk1.toString(true, true)); + println(sk2.toString(true, true)); + sk1.merge(sk2); + println(sk1.toString(true, true)); } @Test diff --git a/src/test/java/org/apache/datasketches/kll/MiscFloatsTest.java b/src/test/java/org/apache/datasketches/kll/MiscFloatsTest.java index f7a0aeebc..a53199a20 100644 --- a/src/test/java/org/apache/datasketches/kll/MiscFloatsTest.java +++ b/src/test/java/org/apache/datasketches/kll/MiscFloatsTest.java @@ -20,13 +20,13 @@ package org.apache.datasketches.kll; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import java.util.Objects; import org.apache.datasketches.SketchesArgumentException; import org.apache.datasketches.memory.WritableMemory; - import org.testng.annotations.Test; /** @@ -34,17 +34,7 @@ */ @SuppressWarnings("javadoc") public class MiscFloatsTest { - - @Test - public void checkGetKFromEps() { - final int k = BaseKllSketch.DEFAULT_K; - final 
double eps = BaseKllSketch.getNormalizedRankError(k, false); - final double epsPmf = BaseKllSketch.getNormalizedRankError(k, true); - final int kEps = BaseKllSketch.getKFromEpsilon(eps, false); - final int kEpsPmf = BaseKllSketch.getKFromEpsilon(epsPmf, true); - assertEquals(kEps, k); - assertEquals(kEpsPmf, k); - } + static final String LS = System.getProperty("line.separator"); @Test public void checkBounds() { @@ -67,7 +57,7 @@ public void checkBounds() { public void checkHeapifyExceptions1() { KllFloatsSketch sk = new KllFloatsSketch(); WritableMemory wmem = WritableMemory.writableWrap(sk.toByteArray()); - wmem.putByte(6, (byte)4); //corrupt M + wmem.putByte(6, (byte)3); //corrupt with odd M KllFloatsSketch.heapify(wmem); } @@ -107,32 +97,405 @@ public void checkHeapifyExceptions5() { @Test public void checkMisc() { - KllFloatsSketch sk = new KllFloatsSketch(8, true); + KllFloatsSketch sk = new KllFloatsSketch(8); assertTrue(Objects.isNull(sk.getQuantiles(10))); sk.toString(true, true); for (int i = 0; i < 20; i++) { sk.update(i); } sk.toString(true, true); sk.toByteArray(); - final float[] items = sk.getItems(); + final float[] items = sk.getFloatItemsArray(); assertEquals(items.length, 16); - final int[] levels = sk.getLevels(); + final int[] levels = sk.getLevelsArray(); assertEquals(levels.length, 3); assertEquals(sk.getNumLevels(), 2); } - //@Test //requires visual check - public void checkNumRetainedAboveLevelZero() { + //@Test //enable static println(..) 
for visual checking + public void visualCheckToString() { final KllFloatsSketch sketch = new KllFloatsSketch(20); for (int i = 0; i < 10; i++) { sketch.update(i + 1); } final String s1 = sketch.toString(true, true); println(s1); + final KllFloatsSketch sketch2 = new KllFloatsSketch(20); - for (int i = 0; i < 400; i++) { - sketch2.update(i + 1); - } + for (int i = 0; i < 400; i++) { sketch2.update(i + 1); } + println("\n" + sketch2.toString(true, true)); + sketch2.merge(sketch); final String s2 = sketch2.toString(true, true); - println(s2); + println(LS + s2); + } + + @Test + public void viewCompactions() { + KllFloatsSketch sk = new KllFloatsSketch(20); + show(sk, 20); + show(sk, 21); //compaction 1 + show(sk, 43); + show(sk, 44); //compaction 2 + show(sk, 54); + show(sk, 55); //compaction 3 + show(sk, 73); + show(sk, 74); //compaction 4 + show(sk, 88); + show(sk, 89); //compaction 5 + show(sk, 96); + show(sk, 97); //compaction 6 + show(sk, 108); + } + + private static void show(final KllFloatsSketch sk, int limit) { + int i = (int) sk.getN(); + for ( ; i < limit; i++) { sk.update(i + 1); } + println(sk.toString(true, true)); + } + + @Test + public void checkGrowLevels() { + KllFloatsSketch sk = new KllFloatsSketch(20); + for (int i = 1; i <= 21; i++) { sk.update(i); } + assertEquals(sk.getNumLevels(), 2); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray()[2], 33); + } + + @Test + public void checkSketchInitializeFloatHeap() { + int k = 20; //don't change this + KllFloatsSketch sk; + + println("#### CASE: FLOAT FULL HEAP"); + sk = new KllFloatsSketch(k); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + 
assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: FLOAT HEAP EMPTY"); + sk = new KllFloatsSketch(k); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Float.NaN); + assertEquals(sk.getMinFloatValue(), Float.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: FLOAT HEAP SINGLE"); + sk = new KllFloatsSketch(k); + sk.update(1); + //println(sk.toString(true, true)); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkSketchInitializeFloatHeapifyCompactMem() { + int k = 20; //don't change this + KllFloatsSketch sk; + KllFloatsSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + println("#### CASE: FLOAT FULL HEAPIFIED FROM COMPACT"); + sk2 = new KllFloatsSketch(k); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + println(sk2.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k 
+ 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: FLOAT EMPTY HEAPIFIED FROM COMPACT"); + sk2 = new KllFloatsSketch(k); + //println(sk.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Float.NaN); + assertEquals(sk.getMinFloatValue(), Float.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: FLOAT SINGLE HEAPIFIED FROM COMPACT"); + sk2 = new KllFloatsSketch(k); + sk2.update(1); + //println(sk2.toString(true, true)); + compBytes = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void 
checkSketchInitializeFloatHeapifyUpdatableMem() { + int k = 20; //don't change this + KllFloatsSketch sk; + KllFloatsSketch sk2; + byte[] compBytes; + WritableMemory wmem; + + println("#### CASE: FLOAT FULL HEAPIFIED FROM UPDATABLE"); + sk2 = new KllFloatsSketch(k); + for (int i = 1; i <= k + 1; i++) { sk2.update(i); } + //println(sk2.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), k + 1); + assertEquals(sk.getNumRetained(), 11); + assertFalse(sk.isEmpty()); + assertTrue(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 33); + assertEquals(sk.getLevelsArray().length, 3); + assertEquals(sk.getMaxFloatValue(), 21.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 2); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: FLOAT EMPTY HEAPIFIED FROM UPDATABLE"); + sk2 = new KllFloatsSketch(k); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 0); + assertEquals(sk.getNumRetained(), 0); + assertTrue(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), Float.NaN); + assertEquals(sk.getMinFloatValue(), Float.NaN); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + + println("#### CASE: FLOAT SINGLE HEAPIFIED FROM UPDATABLE"); + sk2 = new KllFloatsSketch(k); + sk2.update(1); + //println(sk.toString(true, true)); + compBytes = sk2.toUpdatableByteArray(); + wmem = 
WritableMemory.writableWrap(compBytes); + println(KllPreambleUtil.toString(wmem)); + sk = KllFloatsSketch.heapify(wmem); + assertEquals(sk.getK(), k); + assertEquals(sk.getN(), 1); + assertEquals(sk.getNumRetained(), 1); + assertFalse(sk.isEmpty()); + assertFalse(sk.isEstimationMode()); + assertEquals(sk.getMinK(), k); + assertEquals(sk.getFloatItemsArray().length, 20); + assertEquals(sk.getLevelsArray().length, 2); + assertEquals(sk.getMaxFloatValue(), 1.0F); + assertEquals(sk.getMinFloatValue(), 1.0F); + assertEquals(sk.getNumLevels(), 1); + assertFalse(sk.isLevelZeroSorted()); + } + + @Test + public void checkMemoryToStringFloatCompact() { + int k = 20; //don't change this + KllFloatsSketch sk; + KllFloatsSketch sk2; + byte[] compBytes; + byte[] compBytes2; + WritableMemory wmem; + String s; + + println("#### CASE: FLOAT FULL COMPACT"); + sk = new KllFloatsSketch(k); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + compBytes = sk.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllFloatsSketch.heapify(wmem); + compBytes2 = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(compBytes, compBytes2); + + println("#### CASE: FLOAT EMPTY COMPACT"); + sk = new KllFloatsSketch(k); + compBytes = sk.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllFloatsSketch.heapify(wmem); + compBytes2 = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(compBytes, compBytes2); + + println("#### CASE: FLOAT SINGLE COMPACT"); + sk = new KllFloatsSketch(k); + sk.update(1); + compBytes = sk.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllFloatsSketch.heapify(wmem); + compBytes2 = sk2.toByteArray(); + wmem = WritableMemory.writableWrap(compBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(compBytes, compBytes2); + } + + @Test + public void checkMemoryToStringFloatUpdatable() { + int k = 20; //don't change this + KllFloatsSketch sk; + KllFloatsSketch sk2; + byte[] upBytes; + byte[] upBytes2; + WritableMemory wmem; + String s; + + println("#### CASE: FLOAT FULL UPDATABLE"); + sk = new KllFloatsSketch(20); + for (int i = 1; i <= k + 1; i++) { sk.update(i); } + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllFloatsSketch.heapify(wmem); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: FLOAT EMPTY UPDATABLE"); + sk = new KllFloatsSketch(k); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllFloatsSketch.heapify(wmem); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + + println("#### CASE: FLOAT SINGLE UPDATABLE"); + sk = new KllFloatsSketch(k); + sk.update(1); + upBytes = sk.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes); + s = KllPreambleUtil.memoryToString(wmem); + println("step 1: sketch to byte[]/memory & analyze memory"); + println(s); + sk2 = KllFloatsSketch.heapify(wmem); + upBytes2 = sk2.toUpdatableByteArray(); + wmem = WritableMemory.writableWrap(upBytes2); + s = KllPreambleUtil.memoryToString(wmem); + println("step 2: memory to heap sketch, to byte[]/memory & analyze memory. 
Should match above"); + println(s); + assertEquals(upBytes, upBytes2); + } + + @Test + public void checkSimpleMerge() { + int k = 20; + int m = 4; + int n1 = 21; + int n2 = 43; + KllFloatsSketch sk1 = new KllFloatsSketch(k, m); + KllFloatsSketch sk2 = new KllFloatsSketch(k, m); + for (int i = 1; i <= n1; i++) { + sk1.update(i); + } + for (int i = 1; i <= n2; i++) { + sk2.update(i + 100); + } + println(sk1.toString(true, true)); + println(sk2.toString(true, true)); + sk1.merge(sk2); + println(sk1.toString(true, true)); + } + + @Test + public void checkOtherM() { + } @Test diff --git a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java index 7c017edfb..5513f7d6a 100644 --- a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java @@ -855,9 +855,9 @@ public void checkPutMemoryTooSmall() { } @Test - public void checkAuxPosOfPhi() throws Exception { + public void checkAuxPosOfRank() throws Exception { long n = 10; - long returnValue = QuantilesHelper.posOfPhi(1.0, 10); + long returnValue = QuantilesHelper.posOfRank(1.0, 10); //println("" + returnValue); assertEquals(returnValue, n-1); } diff --git a/tools/SketchesCheckstyle.xml b/tools/SketchesCheckstyle.xml index 3fef29680..fe398312f 100644 --- a/tools/SketchesCheckstyle.xml +++ b/tools/SketchesCheckstyle.xml @@ -173,7 +173,7 @@ under the License. - + @@ -352,6 +352,7 @@ under the License. +