Skip to content
Permalink
Browse files
Merge pull request #387 from apache/KLLdoubles
KLL doubles
  • Loading branch information
leerho committed Feb 9, 2022
2 parents 3ef6678 + 719fb6a commit 4b993a9e66d363315a40b6dbc2c8bad0671b1b93
Showing 21 changed files with 2,962 additions and 350 deletions.
@@ -58,6 +58,7 @@ build.xml
*.releaseBackup
*.next
*.tag
doc/

# Jekyll
_site/
@@ -20,44 +20,19 @@
package org.apache.datasketches;

import java.util.Comparator;
import java.util.Objects;

/**
* This provides efficient, unique and unambiguous binary searching for inequalities
* for ordered arrays of values that may include duplicate values. These
* for ordered arrays of increasing values that may include duplicate values. These
* inequalities include <, ≤, ==, ≥, >. The same search method can be used for all
* these inequalities.
*
* <p>In order to make the searching unique and unambiguous, we modified the traditional binary
* search algorithm to search for adjacent pairs of values <i>{A, B}</i> in the values array
* instead of just a single value, where <i>A</i> and <i>B</i> are the array indicies of two
* instead of just a single value, where <i>A</i> and <i>B</i> are the array indices of two
* adjacent values in the array. We then define the searching criteria,
* given an array of values <i>arr[]</i> and the search key value <i>v</i>, as follows:</p>
* <ul>
* <li><b>LT:</b> Find the highest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &lt; v &le; arr[B]</i>. Normally we return the index <i>A</i>. However if the
* search algorithm reaches the ends of the search range, the search algorithm calls the
* <i>resolve()</i> method to determine what to return to the caller.
* </li>
* <li><b>LE:</b> Find the highest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>. Normally we return the index <i>A</i>. However if the
* search algorithm reaches the ends of the search range, the search algorithm calls the
* <i>resolve()</i> method to determine what to return to the caller.
* </li>
* <li><b>EQ:</b> Find the adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>. We return the index <i>A</i> or <i>B</i> whichever
* equals <i>v</i>, otherwise we return -1.
* </li>
* <li><b>GE:</b> Find the lowest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &lt; v &le; arr[B]</i>. Normally we return the index <i>B</i>. However if the
* search algorithm reaches the ends of the search range, the search algorithm calls the
* <i>resolve()</i> method to determine what to return to the caller.
* </li>
* <li><b>GT:</b> Find the lowest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>. Normally we return the index <i>B</i>. However if the
* search algorithm reaches the ends of the search range, the search algorithm calls the
* <i>resolve()</i> method to determine what to return to the caller.
* </li>
* </ul>
* given an array of values <i>arr[]</i> and the search key value <i>v</i>.</p>
*
* @author Lee Rhodes
*/
@@ -69,27 +44,56 @@ public class GenericInequalitySearch {
public enum Inequality {

/**
* Less-Than
* <b>Less-Than:</b> Find the highest ranked adjacent ordered pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &lt; v &le; arr[B]</i> within the given range.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &gt; arr[high], return the index <i>high</i>.<br>
* If <i>v</i> &le; arr[low], return -1.<br>
* Else return index of A.</p>
*/
LT,

/**
* Less-Than Or Equals
* <b>Less-Than Or Equals:</b> Find the highest ranked adjacent ordered pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &ge; arr[high], return the index <i>high</i>.<br>
* If <i>v</i> &lt; arr[low], return -1.<br>
* Else return index of A.</p>
*/
LE,

/**
* Equals. Although not an inequality, it is included for completeness.
* <b>Equals:</b> Although not an inequality, it is included for completeness.
* An index &ge; 0 is returned unless not found, then -1 is returned.
*/
EQ,

/**
* Greater-Than Or Equals
* <b>Greater-Than Or Equals:</b> Find the lowest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &lt; v &le; arr[B]</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &le; arr[low], return the index <i>low</i>.<br>
* If <i>v</i> &gt; arr[high], return -1.<br>
* Else return index of B.</p>
*/
GE,

/**
* Greater-Than
* <b>Greater-Than:</b> Find the lowest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &lt; arr[low], return the index <i>low</i>.<br>
* If <i>v</i> &ge; arr[high], return -1.<br>
* Else return index of B.</p>
*/
GT
}
@@ -99,17 +103,23 @@ public enum Inequality {
* the given inequality.
* If -1 is returned there are no values in the search range that satisfy the inequality.
*
* @param arr the given array that must be sorted.
* @param low the index of the lowest value in the search range
* @param high the index of the highest value in the search range
* @param v the value to search for.
* @param inequality one of LT, LE, EQ, GE, GT
* @param comparator for the type T
* @param arr the given array that must be sorted with increasing values, must not be null,
* and must not contain null values in the given range {low, high} inclusive.
* @param low the lowest index of the lowest value in the search range, inclusive.
* @param high the highest index of the highest value in the search range, inclusive.
* @param v the value to search for. It must not be null.
* @param inequality one of LT, LE, EQ, GE, GT. It must not be null.
* @param comparator for the type T. It must not be null.
* @param <T> The generic type of value to be used in the search process.
* @return the index of the value in the given search range that satisfies the inequality.
*/
public static <T> int find(final T[] arr, final int low, final int high, final T v,
final Inequality inequality, final Comparator<T> comparator) {
Objects.requireNonNull(arr, "Input arr must not be null");
Objects.requireNonNull(v,"Input v must not be null");
Objects.requireNonNull(inequality, "Input inequality must not be null");
Objects.requireNonNull(comparator,"Input comparator must not be null");

int lo = low;
int hi = high - 1;
int ret;
@@ -19,6 +19,8 @@

package org.apache.datasketches;

import java.util.Objects;

/**
* This provides efficient, unique and unambiguous binary searching for inequality comparison criteria
* for ordered arrays of values that may include duplicate values. The inequality criteria include
@@ -27,42 +29,28 @@
*
* <p>In order to make the searching unique and unambiguous, we modified the traditional binary
* search algorithm to search for adjacent pairs of values <i>{A, B}</i> in the values array
* instead of just a single value, where <i>A</i> and <i>B</i> are the array indicies of two
* instead of just a single value, where <i>A</i> and <i>B</i> are the array indices of two
* adjacent values in the array. For all the search criteria, if the algorithm reaches the ends of
* the search range, the algorithm calls the <i>resolve()</i> method to determine what to
* return to the caller. If the key value cannot be resolved, it returns a -1 to the caller.
*
* <p>Given an array of values <i>arr[]</i> and the search key value <i>v</i>, the algorithms for
* the searching criteria are as follows:</p>
* <ul>
* <li><b>LT:</b> Find the highest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &lt; v &le; arr[B]</i>. The normal return is the index <i>A</i>.
* </li>
* <li><b>LE:</b> Find the highest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>. The normal return is the index <i>A</i>.
* </li>
* <li><b>EQ:</b> Find the adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &le; arr[B]</i>. The normal return is the index <i>A</i> or <i>B</i> whichever
* equals <i>v</i>, otherwise it returns -1.
* </li>
* <li><b>GE:</b> Find the lowest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &lt; v &le; arr[B]</i>. The normal return is the index <i>B</i>.
* </li>
* <li><b>GT:</b> Find the lowest ranked adjacent pair <i>{A, B}</i> such that:<br>
* <i>arr[A] &le; v &lt; arr[B]</i>. The normal return is the index <i>B</i>.
* </li>
* </ul>
* <p>Given a sorted array of values <i>arr[]</i> and the search key value <i>v</i>, the algorithms for
* the searching criteria are given with each enum criterion.</p>
*
* @author Lee Rhodes
*/
public enum InequalitySearch {

/**
* Given a sorted array of increasing values <i>arr[]</i> and a key value <i>V</i>,
* Given a sorted array of increasing values <i>arr[]</i> and a key value <i>v</i>,
* this criterion instructs the binary search algorithm to find the highest adjacent pair of
* values <i>{A,B}</i> such that <i>A &lt; V &le; B</i>.
* The returned value from the binary search algorithm will be the index of <i>A</i>
* or -1, if the value <i>V</i> &le; the lowest value in the selected range of the array.
* values <i>{A,B}</i> such that <i>A &lt; v &le; B</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &gt; arr[high], return the index <i>high</i>.<br>
* If <i>v</i> &le; arr[low], return -1.<br>
* Else return index of A.</p>
*/
LT { //arr[A] < V <= arr[B], return A
@Override
@@ -147,9 +135,13 @@ String desc(final long[] arr, final int low, final int high, final long v, final
/**
* Given a sorted array of increasing values <i>arr[]</i> and a key value <i>V</i>,
* this criterion instructs the binary search algorithm to find the highest adjacent pair of
* values <i>{A,B}</i> such that <i>A &le; V &lt; B</i>.
* The returned value from the binary search algorithm will be the index of <i>A</i>
* or -1, if the value <i>V</i> &lt; the lowest value in the selected range of the array.
* values <i>{A,B}</i> such that <i>A &le; V &lt; B</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &ge; arr[high], return the index <i>high</i>.<br>
* If <i>v</i> &lt; arr[low], return -1.<br>
* Else return index of A.</p>
*/
LE { //arr[A] <= V < arr[B], return A
@Override
@@ -320,9 +312,13 @@ String desc(final long[] arr, final int low, final int high, final long v, final
/**
* Given a sorted array of increasing values <i>arr[]</i> and a key value <i>V</i>,
* this criterion instructs the binary search algorithm to find the lowest adjacent pair of
* values <i>{A,B}</i> such that <i>A &lt; V &le; B</i>.
* The returned value from the binary search algorithm will be the index of <i>B</i>
* or -1, if the value <i>V</i> &gt; the highest value in the selected range of the array.
* values <i>{A,B}</i> such that <i>A &lt; V &le; B</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &le; arr[low], return the index <i>low</i>.<br>
* If <i>v</i> &gt; arr[high], return -1.<br>
* Else return index of B.</p>
*/
GE { //arr[A] < V <= arr[B], return B
@Override
@@ -407,9 +403,13 @@ String desc(final long[] arr, final int low, final int high, final long v, final
/**
* Given a sorted array of increasing values <i>arr[]</i> and a key value <i>V</i>,
* this criterion instructs the binary search algorithm to find the lowest adjacent pair of
* values <i>{A,B}</i> such that <i>A &le; V &lt; B</i>.
* The returned value from the binary search algorithm will be the index of <i>B</i>
* or -1, if the value <i>V</i> &ge; the highest value in the selected range of the array.
* values <i>{A,B}</i> such that <i>A &le; V &lt; B</i>.<br>
* Let <i>low</i> = lowest index of the lowest value in the range.<br>
* Let <i>high</i> = highest index of the highest value in the range.
*
* <p>If <i>v</i> &lt; arr[low], return the index <i>low</i>.<br>
* If <i>v</i> &ge; arr[high], return -1.<br>
* Else return index of B.</p>
*/
GT { //arr[A] <= V < arr[B], return B
@Override
@@ -614,14 +614,18 @@ String desc(final long[] arr, final int low, final int high, final long v, final
* If -1 is returned there are no values in the search range that satisfy the criterion.
*
* @param arr the given array that must be sorted.
* @param low the index of the lowest value in the search range
* @param high the index of the highest value in the search range
* @param v the value to search for.
* @param crit one of LT, LE, EQ, GT, GE
* It must not be null and must not contain any NaN values in the range {low, high} inclusive.
* @param low the lowest index of the lowest value in the search range, inclusive.
* @param high the highest index of the highest value in the search range, inclusive.
* @param v the value to search for. It must not be NaN.
* @param crit one of LT, LE, EQ, GT, GE. It must not be null.
* @return the index of the value in the given search range that satisfies the criterion
*/
public static int find(final double[] arr, final int low, final int high,
final double v, final InequalitySearch crit) {
Objects.requireNonNull(arr, "Inpurt arr must not be null");
Objects.requireNonNull(crit, "Input crit must not be null");
if (Double.isNaN(v)) { throw new SketchesArgumentException("Input v must not be NaN."); }
int lo = low;
int hi = high - 1;
int ret;
@@ -641,14 +645,18 @@ public static int find(final double[] arr, final int low, final int high,
* If -1 is returned there are no values in the search range that satisfy the criterion.
*
* @param arr the given array that must be sorted.
* @param low the index of the lowest value in the search range
* @param high the index of the highest value in the search range
* @param v the value to search for.
* It must not be null and must not contain any NaN values in the range {low, high} inclusive.
* @param low the lowest index of the lowest value in the search range, inclusive.
* @param high the highest index of the highest value in the search range, inclusive.
* @param v the value to search for. It must not be NaN.
* @param crit one of LT, LE, EQ, GT, GE
* @return the index of the value in the given search range that satisfies the criterion
*/
public static int find(final float[] arr, final int low, final int high,
final float v, final InequalitySearch crit) {
Objects.requireNonNull(arr, "Inpurt arr must not be null");
Objects.requireNonNull(crit, "Input crit must not be null");
if (Float.isNaN(v)) { throw new SketchesArgumentException("Input v must not be NaN."); }
int lo = low;
int hi = high - 1;
int ret;
@@ -668,14 +676,16 @@ public static int find(final float[] arr, final int low, final int high,
* If -1 is returned there are no values in the search range that satisfy the criterion.
*
* @param arr the given array that must be sorted.
* @param low the index of the lowest value in the search range
* @param high the index of the highest value in the search range
* @param low the lowest index of the lowest value in the search range, inclusive.
* @param high the highest index of the highest value in the search range, inclusive.
* @param v the value to search for.
* @param crit one of LT, LE, EQ, GT, GE
* @return the index of the value in the given search range that satisfies the criterion
*/
public static int find(final long[] arr, final int low, final int high,
final long v, final InequalitySearch crit) {
Objects.requireNonNull(arr, "Inpurt arr must not be null");
Objects.requireNonNull(crit, "Input crit must not be null");
int lo = low;
int hi = high - 1;
int ret;
@@ -688,4 +698,4 @@ public static int find(final long[] arr, final int low, final int high,
}
return crit.resolve(lo, hi, low, high);
}
}
} //End of enum
@@ -20,7 +20,7 @@
package org.apache.datasketches;

/**
* Common static methods for quantiles sketches
* Common static methods for classic quantiles and KLL sketches
*/
public class QuantilesHelper {

@@ -29,7 +29,7 @@ public class QuantilesHelper {
* An array of {1,1,1,0} becomes {0,1,2,3}
* @param array of weights where first element is zero
* @return total weight
*/
*/ //also used by KLL
public static long convertToPrecedingCummulative(final long[] array) {
long subtotal = 0;
for (int i = 0; i < array.length; i++) {
@@ -46,7 +46,7 @@ public static long convertToPrecedingCummulative(final long[] array) {
* @param phi the fractional position where: 0 &le; &#966; &le; 1.0.
* @param n the size of the stream
* @return the index, a value between 0 and n-1.
*/
*/ //also used by KLL
public static long posOfPhi(final double phi, final long n) {
final long pos = (long) Math.floor(phi * n);
return pos == n ? n - 1 : pos; //avoids ArrayIndexOutOfBoundException
@@ -57,7 +57,7 @@ public static long posOfPhi(final double phi, final long n) {
* @param wtArr the cumulative weights array consisting of chunks
* @param pos the position
* @return the index of the chunk containing the position
*/
*/ //also used by KLL
public static int chunkContainingPos(final long[] wtArr, final long pos) {
final int nominalLength = wtArr.length - 1; /* remember, wtArr contains an "extra" position */
assert nominalLength > 0;

0 comments on commit 4b993a9

Please sign in to comment.