Skip to content

Commit

Permalink
Merge 063d1a3 into a4893d3
Browse files Browse the repository at this point in the history
  • Loading branch information
leerho committed Oct 6, 2022
2 parents a4893d3 + 063d1a3 commit 54ba826
Show file tree
Hide file tree
Showing 163 changed files with 6,015 additions and 5,394 deletions.
91 changes: 65 additions & 26 deletions src/main/java/org/apache/datasketches/DoublesSortedView.java
Expand Up @@ -19,9 +19,11 @@

package org.apache.datasketches;

import static org.apache.datasketches.Util.checkDoublesSplitPointsOrder;

/**
* The Sorted View for double values.
*
* The Sorted View for quantiles of primitive type double.
* @see SortedView
* @author Alexander Saydakov
* @author Lee Rhodes
*/
Expand All @@ -31,44 +33,81 @@ public interface DoublesSortedView extends SortedView {
* Gets the quantile based on the given normalized rank, and the given search criterion.
* @param normalizedRank the given normalized rank, which must be in the range [0.0, 1.0].
* @param searchCrit the given search criterion to use.
* @return the associated quantile value.
* @return the associated quantile.
*/
double getQuantile(double normalizedRank, QuantileSearchCriteria searchCrit);

/**
* Gets the normalized rank based on the given quantile value.
* @param value the given quantile value
* Gets the normalized rank based on the given double quantile.
* @param quantile the given quantile.
* @param searchCrit the given search criterion to use.
* @return the normalized rank, which is a number in the range [0.0, 1.0].
*/
double getRank(double value, QuantileSearchCriteria searchCrit);
double getRank(double quantile, QuantileSearchCriteria searchCrit);

/**
* Returns an array of values where each value is a number in the range [0.0, 1.0].
* The size of this array is one larger than the size of the input splitPoints array.
* Returns an array of ranks in the range [0.0, 1.0].
* The size of this array is one larger than the size of the input splitPoints array because it will always include
* 1.0 at the top.
*
* <p>If <i>isCdf</i> is true, the points in the returned array are monotonically increasing and end with the
* value 1.0. Each value represents a point along the cumulative distribution function that approximates
* the CDF of the input data stream. Therefore, each point represents the fractional density of the distribution
* between from zero. For example, if one of the returned values is 0.5, then the splitPoint corresponding to that
* value would be the median of the distribution.</p>
* <p>The points in the returned array are monotonically increasing and end with 1.0.
* Each point represents a cumulative probability or cumulative fractional density along a cumulative distribution
* function (CDF) that approximates the CDF of the input data stream. For example, if one of the returned points is
* 0.5, then the splitPoint corresponding to that point would be the median of the distribution, that is, the
* point that divides its probability mass into two equal halves.</p>
*
* <p>If <i>isCdf</i> is false, the points in the returned array are not monotonic and represent the discrete
* derivative of the CDF, or the Probability Mass Function (PMF). Each returned point represents the fractional
* @param splitPoints the given array of quantiles or splitPoints. This is a sorted, monotonic array of unique
* quantiles in the range of (minQuantile, maxQuantile). This array does not need to include either the minQuantile
* or the maxQuantile. The returned array will have one extra interval representing the very top of the distribution.
* @param searchCrit if INCLUSIVE, each interval within the distribution will include its top quantile and exclude its
* bottom quantile. Otherwise, it will be the reverse. The only exception is that the top portion will always include
* the top quantile retained by the sketch.
* @return an array of points that correspond to the given splitPoints, and represents the input data distribution
* as a CDF.
*/
default double[] getCDF(double[] splitPoints, QuantileSearchCriteria searchCrit) {
  checkDoublesSplitPointsOrder(splitPoints);
  final int numSplits = splitPoints.length;
  final double[] cdf = new double[numSplits + 1];
  int idx = 0;
  // One rank per split point, in the order given.
  for (final double splitPoint : splitPoints) {
    cdf[idx++] = getRank(splitPoint, searchCrit);
  }
  // The extra, top-most slot always closes the cumulative distribution at 1.0.
  cdf[numSplits] = 1.0;
  return cdf;
}

/**
* Returns an array of doubles where each double is in the range [0.0, 1.0].
* The size of this array is one larger than the size of the input splitPoints array.
*
* <p>The points in the returned array are not monotonic and represent the discrete derivative of the CDF,
* which is also called the Probability Mass Function (PMF). Each returned point represents the fractional
* area of the total distribution which lies between the previous point (or zero) and the given point, which
* corresponds to the given splitPoint.<p>
* corresponds to the given splitPoint.</p>
*
* @param splitPoints the given array of quantile values or splitPoints. This is a sorted, unique, monotonic array
* of values in the range of (minValue, maxValue). This array should not include either the minValue or the maxValue.
* The returned array will have one extra interval representing the very top of the distribution.
* @param isCdf if true, a CDF will be returned, otherwise, a PMF will be returned.
* @param searchCrit if INCLUSIVE, each interval within the distribution will include its top value and exclude its
* bottom value. Otherwise, it will be the reverse. The only exception is that the top portion will always include
* the top value retained by the sketch.
* @return an array of points that correspond to the given splitPoints, and represents the data distribution
* as a CDF or PMF.
* @param splitPoints the given array of quantiles or splitPoints. This is a sorted, monotonic array of unique
* quantiles in the range of (minQuantile, maxQuantile). This array does not need to include either the minQuantile
* or the maxQuantile. The returned array will have one extra interval representing the very top of the distribution.
* @param searchCrit if INCLUSIVE, each interval within the distribution will include its top quantile and exclude its
* bottom quantile. Otherwise, it will be the reverse. The only exception is that the top portion will always include
* the top quantile retained by the sketch.
* @return an array of points that correspond to the given splitPoints, and represents the input data distribution
* as a PMF.
*/
default double[] getPMF(double[] splitPoints, QuantileSearchCriteria searchCrit) {
  final double[] pmf = getCDF(splitPoints, searchCrit);
  // Difference the CDF in place, walking from the top down so that every
  // subtraction still reads the not-yet-modified cumulative value below it.
  for (int i = pmf.length - 1; i >= 1; i--) {
    pmf[i] -= pmf[i - 1];
  }
  return pmf;
}

/**
* Returns the array of quantiles.
* @return the array of quantiles.
*/
double[] getPmfOrCdf(double[] splitPoints, boolean isCdf, QuantileSearchCriteria searchCrit);
double[] getQuantiles();

@Override
DoublesSortedViewIterator iterator();
Expand Down
Expand Up @@ -20,17 +20,22 @@
package org.apache.datasketches;

/**
* The Sorted View Iterator for double values.
*
* The quantiles SortedView iterator for type double.
* @see SortedViewIterator
* @author Alexander Saydakov
* @author Lee Rhodes
*/
public interface DoublesSortedViewIterator extends SortedViewIterator {

/**
* Gets the quantile value at the current index.
* @return the quantile value at the current index.
* Gets the quantile at the current index.
*
* <p>Don't call this before calling next() for the first time
* or after getting false from next().</p>
*
* @return the quantile at the current index.
*/
double getValue();
double getQuantile();

}

92 changes: 66 additions & 26 deletions src/main/java/org/apache/datasketches/FloatsSortedView.java
Expand Up @@ -19,9 +19,11 @@

package org.apache.datasketches;

import static org.apache.datasketches.Util.checkFloatsSplitPointsOrder;

/**
* The Sorted View for float values.
*
* The Sorted View for quantiles of primitive type float.
* @see SortedView
* @author Alexander Saydakov
* @author Lee Rhodes
*/
Expand All @@ -31,46 +33,84 @@ public interface FloatsSortedView extends SortedView {
* Gets the quantile based on the given normalized rank, and the given search criterion.
* @param normalizedRank the given normalized rank, which must be in the range [0.0, 1.0].
* @param searchCrit the given search criterion to use.
* @return the associated quantile value.
* @return the associated quantile.
*/
float getQuantile(double normalizedRank, QuantileSearchCriteria searchCrit);

/**
* Gets the normalized rank based on the given quantile value.
* @param value the given quantile value
* Gets the normalized rank based on the given float quantile.
* @param quantile the given quantile.
* @param searchCrit the given search criterion to use.
* @return the normalized rank, which is a number in the range [0.0, 1.0].
*/
double getRank(float value, QuantileSearchCriteria searchCrit);
double getRank(float quantile, QuantileSearchCriteria searchCrit);

/**
* Returns an array of values where each value is a number in the range [0.0, 1.0].
* The size of this array is one larger than the size of the input splitPoints array.
* Returns an array of ranks in the range [0.0, 1.0].
* The size of this array is one larger than the size of the input splitPoints array because it will always include
* 1.0 at the top.
*
* <p>If <i>isCdf</i> is true, the points in the returned array are monotonically increasing and end with the
* value 1.0. Each value represents a point along the cumulative distribution function that approximates
* the CDF of the input data stream. Therefore, each point represents the fractional density of the distribution
* between from zero. For example, if one of the returned values is 0.5, then the splitPoint corresponding to that
* value would be the median of the distribution.</p>
* <p>The points in the returned array are monotonically increasing and end with 1.0.
* Each point represents a cumulative probability or cumulative fractional density along a cumulative distribution
* function (CDF) that approximates the CDF of the input data stream. For example, if one of the returned points is
* 0.5, then the splitPoint corresponding to that point would be the median of the distribution, that is, the
* point that divides its probability mass into two equal halves.</p>
*
* @param splitPoints the given array of quantiles or splitPoints. This is a sorted, monotonic array of unique
* quantiles in the range of (minQuantile, maxQuantile). This array does not need to include either the minQuantile
* or the maxQuantile. The returned array will have one extra interval representing the very top of the distribution.
* @param searchCrit if INCLUSIVE, each interval within the distribution will include its top quantile and exclude its
* bottom quantile. Otherwise, it will be the reverse. The only exception is that the top portion will always include
* the top quantile retained by the sketch.
* @return an array of points that correspond to the given splitPoints, and represents the input data distribution
* as a CDF.
*/
default double[] getCDF(float[] splitPoints, QuantileSearchCriteria searchCrit) {
  checkFloatsSplitPointsOrder(splitPoints);
  final int numSplits = splitPoints.length;
  final double[] cdf = new double[numSplits + 1];
  int idx = 0;
  // One rank per split point, in the order given.
  for (final float splitPoint : splitPoints) {
    cdf[idx++] = getRank(splitPoint, searchCrit);
  }
  // The extra, top-most slot always closes the cumulative distribution at 1.0.
  cdf[numSplits] = 1.0;
  return cdf;
}

/**
* Returns an array of doubles where each double is in the range [0.0, 1.0].
* The size of this array is one larger than the size of the input splitPoints array.
*
* <p>If <i>isCdf</i> is false, the points in the returned array are not monotonic and represent the discrete
* derivative of the CDF, or the Probability Mass Function (PMF). Each returned point represents the fractional
* <p>The points in the returned array are not monotonic and represent the discrete derivative of the CDF,
* which is also called the Probability Mass Function (PMF). Each returned point represents the fractional
* area of the total distribution which lies between the previous point (or zero) and the given point, which
* corresponds to the given splitPoint.<p>
* corresponds to the given splitPoint.</p>
*
* @param splitPoints the given array of quantile values or splitPoints. This is a sorted, unique, monotonic array
* of values in the range of (minValue, maxValue). This array should not include either the minValue or the maxValue.
* The returned array will have one extra interval representing the very top of the distribution.
* @param isCdf if true, a CDF will be returned, otherwise, a PMF will be returned.
* @param searchCrit if INCLUSIVE, each interval within the distribution will include its top value and exclude its
* bottom value. Otherwise, it will be the reverse. The only exception is that the top portion will always include
* the top value retained by the sketch.
* @return an array of points that correspond to the given splitPoints, and represents the data distribution
* as a CDF or PMF.
* @param splitPoints the given array of quantiles or splitPoints. This is a sorted, monotonic array of unique
* quantiles in the range of (minQuantile, maxQuantile). This array does not need to include either the minQuantile
* or the maxQuantile. The returned array will have one extra interval representing the very top of the distribution.
* @param searchCrit if INCLUSIVE, each interval within the distribution will include its top quantile and exclude its
* bottom quantile. Otherwise, it will be the reverse. The only exception is that the top portion will always include
* the top quantile retained by the sketch.
* @return an array of points that correspond to the given splitPoints, and represents the input data distribution
* as a PMF.
*/
default double[] getPMF(float[] splitPoints, QuantileSearchCriteria searchCrit) {
  final double[] pmf = getCDF(splitPoints, searchCrit);
  // Difference the CDF in place, walking from the top down so that every
  // subtraction still reads the not-yet-modified cumulative value below it.
  for (int i = pmf.length - 1; i >= 1; i--) {
    pmf[i] -= pmf[i - 1];
  }
  return pmf;
}

/**
* Returns the array of quantiles.
* @return the array of quantiles.
*/
double[] getPmfOrCdf(float[] splitPoints, boolean isCdf, QuantileSearchCriteria searchCrit);
float[] getQuantiles();

@Override
FloatsSortedViewIterator iterator();

}

Expand Up @@ -20,17 +20,22 @@
package org.apache.datasketches;

/**
* The Sorted View Iterator for float values.
*
* The quantiles SortedView Iterator for type float.
* @see SortedViewIterator
* @author Alexander Saydakov
* @author Lee Rhodes
*/
public interface FloatsSortedViewIterator extends SortedViewIterator {

/**
* Gets the quantile value at the current index.
* @return the quantile value at the current index.
* Gets the quantile at the current index.
*
* <p>Don't call this before calling next() for the first time
* or after getting false from next().</p>
*
* @return the quantile at the current index.
*/
float getValue();
float getQuantile();

}

Expand Up @@ -143,7 +143,7 @@ public static <T> int find(final T[] arr, final int low, final int high, final T
if (hi - lo <= 1) {
return resolve(arr, lo, hi, v, crit, comparator);
}
final int mid = (lo + hi) / 2;
final int mid = lo + (hi - lo) / 2;
final int ret = compare(arr, mid, mid + 1, v, crit, comparator);
if (ret == -1 ) { hi = mid; }
else if (ret == 1) { lo = mid + 1; }
Expand Down

0 comments on commit 54ba826

Please sign in to comment.