Skip to content

Commit

Permalink
Merge pull request #403 from apache/kll_inclusive
Browse files Browse the repository at this point in the history
KLL inclusive ranks and quantiles
  • Loading branch information
AlexanderSaydakov committed Jul 5, 2022
2 parents 7aed20a + 726d9c4 commit b901b87
Show file tree
Hide file tree
Showing 10 changed files with 369 additions and 73 deletions.
5 changes: 3 additions & 2 deletions src/main/java/org/apache/datasketches/QuantilesHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ public class QuantilesHelper {
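* Convert the weights, in place, into cumulative totals.
* In exclusive mode each entry becomes the total of the weights preceding that item:
* an array of {1,1,1,0} becomes {0,1,2,3}.
* In inclusive mode each entry also includes the weight of the item itself:
* an array of {1,1,1,0} becomes {1,2,3,3}.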
* @param array of weights where first element is zero
* @param inclusive for treating rank as including weight of an item
* @return total weight
*/ //used by classic Quantiles and KLL
public static long convertToPrecedingCummulative(final long[] array) {
public static long convertToPrecedingCummulative(final long[] array, final boolean inclusive) {
long subtotal = 0;
for (int i = 0; i < array.length; i++) {
final long newSubtotal = subtotal + array[i];
array[i] = subtotal;
array[i] = inclusive ? newSubtotal : subtotal;
subtotal = newSubtotal;
}
return subtotal;
Expand Down
39 changes: 22 additions & 17 deletions src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
*/
final class KllDoublesHelper {

static double getDoubleRank(final KllSketch mine, final double value) {
static double getDoubleRank(final KllSketch mine, final double value, final boolean inclusive) {
if (mine.isEmpty()) { return Double.NaN; }
int level = 0;
int weight = 1;
Expand All @@ -47,7 +47,7 @@ static double getDoubleRank(final KllSketch mine, final double value) {
final int fromIndex = myLevelsArr[level];
final int toIndex = myLevelsArr[level + 1]; // exclusive
for (int i = fromIndex; i < toIndex; i++) {
if (myDoubleItemsArr[i] < value) {
if (inclusive ? myDoubleItemsArr[i] <= value : myDoubleItemsArr[i] < value) {
total += weight;
} else if (level > 0 || mine.isLevelZeroSorted()) {
break; // levels above 0 are sorted, no point comparing further
Expand All @@ -59,7 +59,8 @@ static double getDoubleRank(final KllSketch mine, final double value) {
return (double) total / mine.getN();
}

static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints, final boolean isCdf) {
static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints,
final boolean isCdf, final boolean inclusive) {
if (mine.isEmpty()) { return null; }
validateDoubleValues(splitPoints);
final double[] buckets = new double[splitPoints.length + 1];
Expand All @@ -71,9 +72,11 @@ static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoi
final int fromIndex = myLevelsArr[level];
final int toIndex = myLevelsArr[level + 1]; // exclusive
if (level == 0 && !mine.isLevelZeroSorted()) {
KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
buckets, inclusive);
} else {
KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
buckets, inclusive);
}
level++;
weight *= 2;
Expand All @@ -93,19 +96,19 @@ static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoi
return buckets;
}

static double getDoublesQuantile(final KllSketch mine, final double fraction) {
static double getDoublesQuantile(final KllSketch mine, final double fraction, final boolean inclusive) {
if (mine.isEmpty()) { return Double.NaN; }
if (fraction < 0.0 || fraction > 1.0) {
throw new SketchesArgumentException("Fraction cannot be less than zero nor greater than 1.0");
}
//These two assumptions make KLL compatible with the previous classic Quantiles Sketch
if (fraction == 0.0) { return mine.getMinDoubleValue(); }
if (fraction == 1.0) { return mine.getMaxDoubleValue(); }
final KllDoublesQuantileCalculator quant = KllDoublesHelper.getDoublesQuantileCalculator(mine);
final KllDoublesQuantileCalculator quant = KllDoublesHelper.getDoublesQuantileCalculator(mine, inclusive);
return quant.getQuantile(fraction);
}

static double[] getDoublesQuantiles(final KllSketch mine, final double[] fractions) {
static double[] getDoublesQuantiles(final KllSketch mine, final double[] fractions, final boolean inclusive) {
if (mine.isEmpty()) { return null; }
KllDoublesQuantileCalculator quant = null;
final double[] quantiles = new double[fractions.length];
Expand All @@ -118,7 +121,7 @@ static double[] getDoublesQuantiles(final KllSketch mine, final double[] fractio
else if (fraction == 1.0) { quantiles[i] = mine.getMaxDoubleValue(); }
else {
if (quant == null) {
quant = KllDoublesHelper.getDoublesQuantileCalculator(mine);
quant = KllDoublesHelper.getDoublesQuantileCalculator(mine, inclusive);
}
quantiles[i] = quant.getQuantile(fraction);
}
Expand Down Expand Up @@ -433,24 +436,26 @@ private static int[] generalDoublesCompress(
return new int[] {numLevels, targetItemCount, currentItemCount};
}

private static KllDoublesQuantileCalculator getDoublesQuantileCalculator(final KllSketch mine) {
private static KllDoublesQuantileCalculator getDoublesQuantileCalculator(final KllSketch mine,
final boolean inclusive) {
final int[] myLevelsArr = mine.getLevelsArray();
final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
if (!mine.isLevelZeroSorted()) {
Arrays.sort(myDoubleItemsArr, myLevelsArr[0], myLevelsArr[1]);
if (!mine.hasMemory()) { mine.setLevelZeroSorted(true); }
}
return new KllDoublesQuantileCalculator(myDoubleItemsArr, myLevelsArr, mine.getNumLevels(), mine.getN());
return new KllDoublesQuantileCalculator(myDoubleItemsArr, myLevelsArr, mine.getNumLevels(), mine.getN(),
inclusive);
}

private static void incrementDoublesBucketsSortedLevel(
final KllSketch mine, final int fromIndex, final int toIndex,
final int weight, final double[] splitPoints, final double[] buckets) {
final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
final double[] splitPoints, final double[] buckets, final boolean inclusive) {
final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
int i = fromIndex;
int j = 0;
while (i < toIndex && j < splitPoints.length) {
if (myDoubleItemsArr[i] < splitPoints[j]) {
if (inclusive ? myDoubleItemsArr[i] <= splitPoints[j] : myDoubleItemsArr[i] < splitPoints[j]) {
buckets[j] += weight; // this sample goes into this bucket
i++; // move on to next sample and see whether it also goes into this bucket
} else {
Expand All @@ -466,13 +471,13 @@ private static void incrementDoublesBucketsSortedLevel(
}

private static void incrementDoublesBucketsUnsortedLevel(
final KllSketch mine, final int fromIndex, final int toIndex,
final int weight, final double[] splitPoints, final double[] buckets) {
final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
final double[] splitPoints, final double[] buckets, final boolean inclusive) {
final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
for (int i = fromIndex; i < toIndex; i++) {
int j;
for (j = 0; j < splitPoints.length; j++) {
if (myDoubleItemsArr[i] < splitPoints[j]) {
if (inclusive ? myDoubleItemsArr[i] <= splitPoints[j] : myDoubleItemsArr[i] < splitPoints[j]) {
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ final class KllDoublesQuantileCalculator {

// assumes that all levels are sorted including level 0
KllDoublesQuantileCalculator(final double[] items, final int[] levels, final int numLevels,
final long n) {
final long n, final boolean inclusive) {
n_ = n;
final int numItems = levels[numLevels] - levels[0];
items_ = new double[numItems];
weights_ = new long[numItems + 1]; // one more is intentional
levels_ = new int[numLevels + 1];
populateFromSketch(items, levels, numLevels, numItems);
blockyTandemMergeSort(items_, weights_, levels_, numLevels_);
QuantilesHelper.convertToPrecedingCummulative(weights_);
QuantilesHelper.convertToPrecedingCummulative(weights_, inclusive);
}

//For testing only. Allows testing of getQuantile without a sketch.
Expand Down
77 changes: 72 additions & 5 deletions src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,25 @@ public static KllDoublesSketch writableWrap(
* the maximum value.
* It is not necessary to include either the min or max values in these split points.
*
* @param inclusive if true, the weight of values equal to a split point is included in the rank
* at that split point. Otherwise the rank at a split point equals the sum of the weights of all values
* that are less than that split point.
*
* @return an array of m+1 double values on the interval [0.0, 1.0),
* which are a consecutive approximation to the CDF of the input stream given the splitPoints.
* The value at array position j of the returned CDF array is the sum of the returned values
* in positions 0 through j of the returned PMF array.
*/
public double[] getCDF(final double[] splitPoints, final boolean inclusive) {
return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true, inclusive);
}

/**
* Same as {@link #getCDF(double[], boolean) getCDF(double[] splitPoints, false)}
* @param splitPoints splitPoints
* @return CDF
*/
public double[] getCDF(final double[] splitPoints) {
return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true);
return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true, false);
}

/**
Expand Down Expand Up @@ -213,14 +225,26 @@ public double[] getCDF(final double[] splitPoints) {
* the maximum value.
* It is not necessary to include either the min or max values in these split points.
*
* @param inclusive if true, the weight of values equal to a split point is included in the rank
* at that split point. Otherwise the rank at a split point equals the sum of the weights of all values
* that are less than that split point.
*
* @return an array of m+1 doubles on the interval [0.0, 1.0),
* each of which is an approximation to the fraction of the total input stream values
* (the mass) that fall into one of those intervals.
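* When inclusive is false, the definition of an "interval" is inclusive of the left splitPoint and
* exclusive of the right splitPoint, with the exception that the last interval will include the
* maximum value.
* When inclusive is true, an "interval" is exclusive of the left splitPoint and inclusive of the
* right splitPoint.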
*/
public double[] getPMF(final double[] splitPoints, final boolean inclusive) {
return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false, inclusive);
}

/**
* Same as {@link #getPMF(double[], boolean) getPMF(double[] splitPoints, false)}
* @param splitPoints splitPoints
* @return PMF
*/
public double[] getPMF(final double[] splitPoints) {
return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false);
return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false, false);
}

/**
Expand All @@ -239,10 +263,20 @@ public double[] getPMF(final double[] splitPoints) {
* If fraction = 0.0, the true minimum value of the stream is returned.
* If fraction = 1.0, the true maximum value of the stream is returned.
*
* @param inclusive if true, the given fraction (rank) is considered inclusive
* @return the approximation to the value at the given fraction
*/
public double getQuantile(final double fraction, final boolean inclusive) {
return KllDoublesHelper.getDoublesQuantile(this, fraction, inclusive);
}

/**
* Same as {@link #getQuantile(double, boolean) getQuantile(double fraction, false)}
* @param fraction fractional rank
* @return quantile
*/
public double getQuantile(final double fraction) {
return KllDoublesHelper.getDoublesQuantile(this, fraction);
return KllDoublesHelper.getDoublesQuantile(this, fraction, false);
}

/**
Expand Down Expand Up @@ -271,11 +305,22 @@ public double getQuantileLowerBound(final double fraction) {
* These are also called normalized ranks or fractional ranks.
* These fractions must be in the interval [0.0, 1.0], inclusive.
*
* @param inclusive if true, the given fractions (ranks) are considered inclusive
*
* @return array of approximate quantiles corresponding to the given fractions, in the same order
* as the given fractions array.
*/
public double[] getQuantiles(final double[] fractions, final boolean inclusive) {
return KllDoublesHelper.getDoublesQuantiles(this, fractions, inclusive);
}

/**
* Same as {@link #getQuantiles(double[], boolean) getQuantiles(double[] fractions, false)}
* @param fractions fractional ranks
* @return quantiles
*/
public double[] getQuantiles(final double[] fractions) {
return KllDoublesHelper.getDoublesQuantiles(this, fractions);
return KllDoublesHelper.getDoublesQuantiles(this, fractions, false);
}

/**
Expand All @@ -289,9 +334,20 @@ public double[] getQuantiles(final double[] fractions) {
* A value of 2 will return the min and the max value. A value of 3 will return the min,
* the median and the max value, etc.
*
* @param inclusive if true, the fractional ranks are considered inclusive
* @return array of approximate quantiles corresponding to the evenly spaced fractional ranks,
* in the same order as those ranks.
*/
public double[] getQuantiles(final int numEvenlySpaced, final boolean inclusive) {
if (isEmpty()) { return null; }
return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced), inclusive);
}

/**
* Same as {@link #getQuantiles(int, boolean) getQuantiles(int numEvenlySpaced, false)}
* @param numEvenlySpaced number of evenly spaced fractional ranks
* @return quantiles
*/
public double[] getQuantiles(final int numEvenlySpaced) {
if (isEmpty()) { return null; }
return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced));
Expand All @@ -318,10 +374,21 @@ public double getQuantileUpperBound(final double fraction) {
* <p>If the sketch is empty this returns NaN.</p>
*
* @param value to be ranked
* @param inclusive if true, the weight of the given value is included in the rank.
* Otherwise the rank equals the sum of the weights of all values that are less than the given value.
* @return an approximate rank of the given value
*/
public double getRank(final double value, final boolean inclusive) {
return KllDoublesHelper.getDoubleRank(this, value, inclusive);
}

/**
* Same as {@link #getRank(double, boolean) getRank(double value, false)}
* @param value value to be ranked
* @return fractional rank
*/
public double getRank(final double value) {
return KllDoublesHelper.getDoubleRank(this, value);
return KllDoublesHelper.getDoubleRank(this, value, false);
}

/**
Expand Down

0 comments on commit b901b87

Please sign in to comment.