Skip to content
Permalink
Browse files
Iteration 11, ready for review.
  • Loading branch information
leerho committed Mar 26, 2022
1 parent 60cbfaa commit 5e3dd401fa7f90b0213e48b825849b6b04faf237
Showing 11 changed files with 1,519 additions and 66 deletions.
@@ -44,8 +44,6 @@ public KllDirectDoublesSketch(final WritableMemory wmem, final MemoryRequestServ
super(SketchType.DOUBLES_SKETCH, wmem, memReqSvr);
}

//public int getNumRetained()

/**
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
* cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
@@ -0,0 +1,249 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.datasketches.kll;

import static java.lang.Math.max;
import static java.lang.Math.min;

import org.apache.datasketches.memory.MemoryRequestServer;
import org.apache.datasketches.memory.WritableMemory;

/**
 * An off-heap KllSketch of float values that operates on a WritableMemory image of the sketch.
 *
 * <p>Every public query and update method here is a thin, float-typed facade over the
 * type-generic implementation inherited from the parent class.</p>
 *
 * <p>Please refer to the documentation in the package-info:<br>
 * {@link org.apache.datasketches.kll}</p>
 *
 * @author Lee Rhodes, Kevin Lang
 */
public class KllDirectFloatsSketch extends KllDirectSketch {

  /**
   * Constructs a floats sketch over the given WritableMemory.
   *
   * @param wmem the current WritableMemory
   * @param memReqSvr the given MemoryRequestServer to request a larger WritableMemory
   */
  public KllDirectFloatsSketch(final WritableMemory wmem, final MemoryRequestServer memReqSvr) {
    super(SketchType.FLOATS_SKETCH, wmem, memReqSvr);
  }

  /**
   * Returns an approximation to the Cumulative Distribution Function (CDF) of the input stream
   * for the given splitPoints (values). The CDF is the cumulative analog of the PMF.
   *
   * <p>The probabilistic guarantee of the resulting approximations can be obtained from the
   * getNormalizedRankError(false) function.</p>
   *
   * <p>If the sketch is empty this returns null.</p>
   *
   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
   * Each interval includes its left splitPoint (or the minimum value) and excludes its right
   * splitPoint, except for the last interval, which includes the maximum value.
   * The min and max values do not need to be included in these split points.
   *
   * @return an array of m+1 double values, which are a consecutive approximation to the CDF
   * of the input stream given the splitPoints. The value at array position j of the returned
   * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
   * array.
   */
  public double[] getCDF(final float[] splitPoints) {
    final boolean cumulative = true;
    return getFloatsPmfOrCdf(splitPoints, cumulative);
  }

  /**
   * Returns the largest value seen in the stream, or NaN if the sketch is empty.
   *
   * @return the max value of the stream
   */
  public float getMaxValue() {
    final float maxValue = getMaxFloatValue();
    return maxValue;
  }

  /**
   * Returns the smallest value seen in the stream, or NaN if the sketch is empty.
   *
   * @return the min value of the stream
   */
  public float getMinValue() {
    final float minValue = getMinFloatValue();
    return minValue;
  }

  /**
   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
   * for the given splitPoints (values).
   *
   * <p>The probabilistic guarantee of the resulting approximations can be obtained from the
   * getNormalizedRankError(true) function.</p>
   *
   * <p>If the sketch is empty this returns null.</p>
   *
   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
   * Each interval includes its left splitPoint (or the minimum value) and excludes its right
   * splitPoint, except for the last interval, which includes the maximum value.
   * The min and max values do not need to be included in these split points.
   *
   * @return an array of m+1 doubles, each of which approximates the fraction of the input
   * stream values (the mass) that falls into the corresponding interval, using the interval
   * definition given above.
   */
  public double[] getPMF(final float[] splitPoints) {
    final boolean cumulative = false;
    return getFloatsPmfOrCdf(splitPoints, cumulative);
  }

  /**
   * Returns an approximation to the data item that would be preceded by the given fraction of
   * a hypothetical sorted version of the input stream so far.
   *
   * <p>This method has a fairly large overhead (microseconds instead of nanoseconds), so it
   * should not be called repeatedly to obtain different quantiles from the same sketch.
   * Use getQuantiles() instead, which pays the overhead only once.</p>
   *
   * <p>If the sketch is empty this returns NaN.</p>
   *
   * @param fraction the specified fractional position in the hypothetical sorted stream.
   * These are also called normalized ranks or fractional ranks.
   * If fraction = 0.0, the true minimum value of the stream is returned.
   * If fraction = 1.0, the true maximum value of the stream is returned.
   *
   * @return the approximation to the value at the given fraction
   */
  public float getQuantile(final double fraction) {
    final float quantile = getFloatsQuantile(fraction);
    return quantile;
  }

  /**
   * Gets the lower bound of the value interval in which the true quantile of the given rank
   * exists with a confidence of at least 99%.
   *
   * @param fraction the given normalized rank as a fraction
   * @return the lower bound of the value interval in which the true quantile of the given rank
   * exists with a confidence of at least 99%. Returns NaN if the sketch is empty.
   */
  public float getQuantileLowerBound(final double fraction) {
    final double rankError = KllHelper.getNormalizedRankError(getDyMinK(), false);
    // Clamp so the shifted rank never drops below the smallest legal rank.
    final double lowerRank = max(0, fraction - rankError);
    return getQuantile(lowerRank);
  }

  /**
   * A more efficient, multiple-query version of getQuantile().
   *
   * <p>The returned array could also have been produced by calling getQuantile() once per
   * fractional rank, but that would be very inefficient. This method incurs the internal
   * set-up overhead once and obtains multiple quantile values in a single query.
   * It is strongly recommended that this method be used instead of multiple calls
   * to getQuantile().</p>
   *
   * <p>If the sketch is empty this returns null.</p>
   *
   * @param fractions given array of fractional positions in the hypothetical sorted stream.
   * These are also called normalized ranks or fractional ranks.
   * These fractions must be in the interval [0.0, 1.0], inclusive.
   *
   * @return array of approximations to the given fractions in the same order as given fractions
   * array.
   */
  public float[] getQuantiles(final double[] fractions) {
    final float[] quantiles = getFloatsQuantiles(fractions);
    return quantiles;
  }

  /**
   * Another efficient multiple-query version of getQuantile(), in which the caller specifies
   * only the number of evenly spaced fractional ranks.
   *
   * <p>If the sketch is empty this returns null.</p>
   *
   * @param numEvenlySpaced an integer that specifies the number of evenly spaced fractional ranks.
   * This must be a positive integer greater than 0. A value of 1 will return the min value.
   * A value of 2 will return the min and the max value. A value of 3 will return the min,
   * the median and the max value, etc.
   *
   * @return array of approximations to the evenly spaced fractional ranks, in increasing rank
   * order.
   */
  public float[] getQuantiles(final int numEvenlySpaced) {
    // The ranks are only generated when the sketch is non-empty.
    return isEmpty()
        ? null
        : getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced));
  }

  /**
   * Gets the upper bound of the value interval in which the true quantile of the given rank
   * exists with a confidence of at least 99%.
   *
   * @param fraction the given normalized rank as a fraction
   * @return the upper bound of the value interval in which the true quantile of the given rank
   * exists with a confidence of at least 99%. Returns NaN if the sketch is empty.
   */
  public float getQuantileUpperBound(final double fraction) {
    final double rankError = KllHelper.getNormalizedRankError(getDyMinK(), false);
    // Clamp so the shifted rank never exceeds the largest legal rank.
    final double upperRank = min(1.0, fraction + rankError);
    return getQuantile(upperRank);
  }

  /**
   * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
   * inclusive.
   *
   * <p>The probabilistic guarantee of the resulting approximation can be obtained from the
   * getNormalizedRankError(false) function.</p>
   *
   * <p>If the sketch is empty this returns NaN.</p>
   *
   * @param value to be ranked
   * @return an approximate rank of the given value
   */
  public double getRank(final float value) {
    final double rank = getFloatRank(value);
    return rank;
  }

  /**
   * Returns an iterator over the float items, levels array and number of levels of this sketch.
   *
   * @return the iterator for this class
   */
  public KllFloatsSketchIterator iterator() {
    return new KllFloatsSketchIterator(
        getFloatItemsArray(),
        getLevelsArray(),
        getNumLevels());
  }

  /**
   * Merges another sketch into this one. The other sketch is checked to be both direct and
   * a floats sketch before the merge is attempted.
   *
   * @param other sketch to merge into this one
   */
  public void merge(final KllSketch other) {
    final boolean sourceIsDirect = other.isDirect();
    if (!sourceIsDirect) { kllSketchThrow(32); }
    final boolean sourceIsFloats = other.isFloatsSketch();
    if (!sourceIsFloats) { kllSketchThrow(34); }
    mergeFloatImpl(other);
  }

  /**
   * Serializes this sketch to a byte array by delegating to the compact byte-array
   * implementation of the parent class.
   *
   * @return the serialized bytes of this sketch
   */
  @Override
  public byte[] toByteArray() {
    final byte[] bytes = toCompactByteArrayImpl();
    return bytes;
  }

  /**
   * Returns a printable description of this sketch.
   *
   * @param withLevels passed through to the parent implementation to request level information
   * @param withData passed through to the parent implementation to request the retained data
   * @return the string produced by the parent implementation
   */
  @Override
  public String toString(final boolean withLevels, final boolean withData) {
    final String description = toStringImpl(withLevels, withData);
    return description;
  }

  /**
   * Updates this sketch with the given float value.
   *
   * @param value the value to add to the stream represented by this sketch
   */
  public void update(final float value) {
    updateFloat(value);
  }

}
@@ -729,8 +729,8 @@ final void mergeDoubleImpl(final KllSketch other) {
//Update min, max values
final double otherMin = other.getMinDoubleValue();
final double otherMax = other.getMaxDoubleValue();
if (Double.isNaN(myMin) || otherMin <= myMin) { setMinDoubleValue(otherMin); }
if (Double.isNaN(myMax) || otherMax >= myMax) { setMaxDoubleValue(otherMax); }
setMinDoubleValue(resolveDoubleMinValue(myMin, otherMin));
setMaxDoubleValue(resolveDoubleMaxValue(myMax, otherMax));

//Update numLevels, levelsArray, items
setNumLevels(myNewNumLevels);
@@ -739,6 +739,21 @@ final void mergeDoubleImpl(final KllSketch other) {
assert KllHelper.sumTheSampleWeights(getNumLevels(), getLevelsArray()) == getN();
}

/**
 * Chooses the minimum of two candidate values, treating NaN as "no value".
 *
 * @param myMin this sketch's current minimum, or NaN if it has none
 * @param otherMin the other sketch's minimum, or NaN if it has none
 * @return NaN if both are NaN, the non-NaN one if exactly one is NaN, otherwise the smaller value
 */
private static double resolveDoubleMinValue(final double myMin, final double otherMin) {
  final boolean mineMissing = Double.isNaN(myMin);
  final boolean theirsMissing = Double.isNaN(otherMin);
  if (mineMissing) {
    return theirsMissing ? Double.NaN : otherMin;
  }
  return theirsMissing ? myMin : min(myMin, otherMin);
}

/**
 * Chooses the maximum of two candidate values, treating NaN as "no value".
 *
 * @param myMax this sketch's current maximum, or NaN if it has none
 * @param otherMax the other sketch's maximum, or NaN if it has none
 * @return NaN if both are NaN, the non-NaN one if exactly one is NaN, otherwise the larger value
 */
private static double resolveDoubleMaxValue(final double myMax, final double otherMax) {
  final boolean mineMissing = Double.isNaN(myMax);
  final boolean theirsMissing = Double.isNaN(otherMax);
  if (mineMissing) {
    return theirsMissing ? Double.NaN : otherMax;
  }
  return theirsMissing ? myMax : max(myMax, otherMax);
}


final void mergeFloatImpl(final KllSketch other) {
if (other.isEmpty()) { return; }
final long finalN = getN() + other.getN();
@@ -825,8 +840,8 @@ final void mergeFloatImpl(final KllSketch other) {
//Update min, max values
final float otherMin = other.getMinFloatValue();
final float otherMax = other.getMaxFloatValue();
if (Float.isNaN(myMin) || otherMin < myMin) { setMinFloatValue(otherMin); }
if (Float.isNaN(myMax) || otherMax > myMax) { setMaxFloatValue(otherMax); }
setMinFloatValue(resolveFloatMinValue(myMin, otherMin));
setMaxFloatValue(resolveFloatMaxValue(myMax, otherMax));

//Update numLevels, levelsArray, items
setNumLevels(myNewNumLevels);
@@ -835,6 +850,21 @@ final void mergeFloatImpl(final KllSketch other) {
assert KllHelper.sumTheSampleWeights(getNumLevels(), getLevelsArray()) == getN();
}

/**
 * Chooses the minimum of two candidate values, treating NaN as "no value".
 *
 * @param myMin this sketch's current minimum, or NaN if it has none
 * @param otherMin the other sketch's minimum, or NaN if it has none
 * @return NaN if both are NaN, the non-NaN one if exactly one is NaN, otherwise the smaller value
 */
private static float resolveFloatMinValue(final float myMin, final float otherMin) {
  final boolean mineMissing = Float.isNaN(myMin);
  final boolean theirsMissing = Float.isNaN(otherMin);
  if (mineMissing) {
    return theirsMissing ? Float.NaN : otherMin;
  }
  return theirsMissing ? myMin : min(myMin, otherMin);
}

/**
 * Chooses the maximum of two candidate values, treating NaN as "no value".
 *
 * @param myMax this sketch's current maximum, or NaN if it has none
 * @param otherMax the other sketch's maximum, or NaN if it has none
 * @return NaN if both are NaN, the non-NaN one if exactly one is NaN, otherwise the larger value
 */
private static float resolveFloatMaxValue(final float myMax, final float otherMax) {
  final boolean mineMissing = Float.isNaN(myMax);
  final boolean theirsMissing = Float.isNaN(otherMax);
  if (mineMissing) {
    return theirsMissing ? Float.NaN : otherMax;
  }
  return theirsMissing ? myMax : max(myMax, otherMax);
}


/**
 * Sets the double items array. The behavior is defined by the concrete subclasses, which are
 * not visible here.
 *
 * @param doubleItems the array of double items to set
 */
abstract void setDoubleItemsArray(double[] doubleItems);

/**
 * Sets a single double value at the given index. The behavior is defined by the concrete
 * subclasses. NOTE(review): presumably the index addresses the double items array - confirm
 * against the implementations.
 *
 * @param index the position to write
 * @param value the double value to store at that position
 */
abstract void setDoubleItemsArrayAt(int index, double value);
@@ -1381,7 +1411,7 @@ private void compressWhileUpdatingSketch() {
} else {
if (direct) {
myFloatItemsArr = getFloatItemsArray();
System.arraycopy(myDoubleItemsArr, myLevelsArr[0], myDoubleItemsArr, myLevelsArr[0] + halfAdjPop, amount);
System.arraycopy(myFloatItemsArr, myLevelsArr[0], myFloatItemsArr, myLevelsArr[0] + halfAdjPop, amount);
setFloatItemsArray(myFloatItemsArr);
} else {
System.arraycopy(myFloatItemsArr, myLevelsArr[0], myFloatItemsArr, myLevelsArr[0] + halfAdjPop, amount);
@@ -35,16 +35,17 @@
*
* <p>The <i>normalized rank</i> (<i>rank</i>) of any specific value is defined as its
* <i>absolute rank</i> divided by <i>N</i>.
* Thus, the <i>normalized rank</i> is a value between zero and one.
* Thus, the <i>normalized rank</i> is a value in the interval [0.0, 1.0), exclusive.
* In the documentation and Javadocs for this sketch <i>absolute rank</i> is never used so any
* reference to just <i>rank</i> should be interpreted to mean <i>normalized rank</i>.
*
* <p>This sketch is configured with a parameter <i>k</i>, which affects the size of the sketch
* and its estimation error.
*
* <p>The estimation error is commonly called <i>epsilon</i> (or <i>eps</i>) and is a fraction
* between zero and one. Larger values of <i>k</i> result in smaller values of epsilon.
* Epsilon is always with respect to the rank and cannot be applied to the
* <p>In the research literature, the estimation error is commonly called <i>epsilon</i>
* (or <i>eps</i>) and is a fraction between zero and one.
* Larger values of <i>k</i> result in smaller values of epsilon.
* The epsilon error is always with respect to the rank and cannot be applied to the
* corresponding values.
*
* <p>The relationship between the normalized rank and the corresponding values can be viewed
@@ -147,6 +148,25 @@
* <li>Then <i>v<sub>lo</sub> &le; v &le; v<sub>hi</sub></i>, with 99% confidence.</li>
* </ul>
*
* <p>The current implementations of the KLL sketch in the DataSketches Java library component include:</p>
*
* <ul>
* <li><b>KllFloatsSketch</b>: This operates on the Java heap and uses the Java <i>float</i> primitive for the
* smallest possible size. It can be serialized to a compact, immutable form or to an updatable form suitable for
* use by the Kll Direct sketches.</li>
* <li><b>KllDoublesSketch</b>: This operates on the Java heap and uses the Java <i>double</i> primitive for a much
* larger range of numeric values, and is larger as a result. It can be serialized to a compact, immutable form or
* to an updatable form suitable for use by the Kll Direct sketches.</li>
* <li><b>KllDirectFloatsSketch</b>: This is intended to operate off-heap and performs all of its operations in one
* contiguous chunk of memory. It uses the Java <i>float</i> primitive for the smallest possible size off-heap.</li>
* <li><b>KllDirectDoublesSketch</b>: This is intended to operate off-heap and performs all of its operations in one
* contiguous chunk of memory. It uses the Java <i>double</i> primitive for a much larger range of numeric values,
* and is larger as a result.</li>
* </ul>
*
* <p>Please visit our website: <a href="https://datasketches.apache.org">DataSketches Home Page</a> for more
* information.</p>
*
* @author Kevin Lang
* @author Alexander Saydakov
* @author Lee Rhodes

0 comments on commit 5e3dd40

Please sign in to comment.