From 20166c7f681a272bd9277d7eaa490829c50c5000 Mon Sep 17 00:00:00 2001
From: Lee Rhodes
Date: Thu, 11 Apr 2024 21:37:02 -0700
Subject: [PATCH 1/4] temp commit
---
.../datasketches/kll/KllItemsSketch.java | 17 +++-
.../datasketches/partitions/Partitioner.java | 4 +-
.../datasketches/quantiles/ItemsSketch.java | 17 +++-
.../GenericPartitionBoundaries.java | 45 ++++++++---
.../quantilescommon/GenericSortedView.java | 2 +-
.../ItemsSketchSortedView.java | 46 +++++------
.../quantilescommon/PartitionBoundaries.java | 67 ----------------
.../quantilescommon/PartitioningFeature.java | 78 ++++++++++++++++---
.../quantilescommon/QuantilesGenericAPI.java | 2 +-
.../SketchPartitionLimits.java | 63 +++++++++++++++
.../quantiles/ItemsSketchTest.java | 4 +-
.../CrossCheckQuantilesTest.java | 6 +-
.../PartitionBoundariesTest.java | 18 ++---
13 files changed, 228 insertions(+), 141 deletions(-)
delete mode 100644 src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java
create mode 100644 src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java
diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java
index 9f5a5ae71..efcca934b 100644
--- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java
@@ -156,11 +156,21 @@ public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searc
}
@Override
- public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized,
+ public GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
+ final int numEquallySizedParts,
final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); }
refreshSortedView();
- return itemsSV.getPartitionBoundaries(numEquallySized, searchCrit);
+ return itemsSV.getPartitionBoundariesFromNumParts(numEquallySizedParts, searchCrit);
+ }
+
+ @Override
+ public GenericPartitionBoundaries getPartitionBoundariesFromPartSize(
+ final long nominalPartSizeItems,
+ final QuantileSearchCriteria searchCrit) {
+ if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); }
+ refreshSortedView();
+ return itemsSV.getPartitionBoundariesFromPartSize(nominalPartSizeItems, searchCrit);
}
@Override
@@ -424,9 +434,8 @@ ItemsSketchSortedView getSV() {
quantiles = (T[]) Array.newInstance(serDe.getClassOfT(), numQuantiles);
cumWeights = new long[numQuantiles];
populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles);
- final double normRankErr = getNormalizedRankError(getK(), true);
return new ItemsSketchSortedView(
- quantiles, cumWeights, getN(), comparator, getMaxItem(), getMinItem(), normRankErr);
+ quantiles, cumWeights, getN(), comparator, getMaxItem(), getMinItem());
}
private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLevels,
diff --git a/src/main/java/org/apache/datasketches/partitions/Partitioner.java b/src/main/java/org/apache/datasketches/partitions/Partitioner.java
index be1247ca3..66030fb22 100644
--- a/src/main/java/org/apache/datasketches/partitions/Partitioner.java
+++ b/src/main/java/org/apache/datasketches/partitions/Partitioner.java
@@ -117,7 +117,7 @@ public List> partition(final S sk) {
this.numLevels = (int)max(1, ceil(log(guessNumParts) / log(maxPartsPerSk)));
final int partsPerSk = (int)round(pow(guessNumParts, 1.0 / numLevels));
this.partitionsPerSk = min(partsPerSk, maxPartsPerSk);
- final GenericPartitionBoundaries gpb = sk.getPartitionBoundaries(partitionsPerSk, criteria);
+ final GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromNumParts(partitionsPerSk, criteria);
final StackElement se = new StackElement<>(gpb, 0, "1");
stack.push(se);
partitionSearch(stack);
@@ -144,7 +144,7 @@ private void partitionSearch(final ArrayDeque> stack) {
if (++se.part <= numParts) {
final PartitionBoundsRow row = new PartitionBoundsRow<>(se);
final S sk = fillReq.getRange(row.lowerBound, row.upperBound, row.rule);
- final GenericPartitionBoundaries gpb2 = sk.getPartitionBoundaries(this.partitionsPerSk, criteria);
+ final GenericPartitionBoundaries gpb2 = sk.getPartitionBoundariesFromNumParts(this.partitionsPerSk, criteria);
final int level = stack.size() + 1;
final String partId = se.levelPartId + "." + se.part + "," + level;
final StackElement se2 = new StackElement<>(gpb2, 0, partId);
diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
index 9361c6cd6..3d2d33882 100644
--- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
+++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
@@ -277,11 +277,21 @@ public T getMinItem() {
}
@Override
- public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized,
+ public GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
+ final int numEquallySizedParts,
final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); }
refreshSortedView();
- return classicQisSV.getPartitionBoundaries(numEquallySized, searchCrit);
+ return classicQisSV.getPartitionBoundariesFromNumParts(numEquallySizedParts, searchCrit);
+ }
+
+ @Override
+ public GenericPartitionBoundaries getPartitionBoundariesFromPartSize(
+ final long nominalPartSizeItems,
+ final QuantileSearchCriteria searchCrit) {
+ if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); }
+ refreshSortedView();
+ return classicQisSV.getPartitionBoundariesFromPartSize(nominalPartSizeItems, searchCrit);
}
@Override
@@ -656,9 +666,8 @@ private static ItemsSketchSortedView getSV(final ItemsSketch sk) {
throw new SketchesStateException("Sorted View is misconfigured. TotalN does not match cumWeights.");
}
- final double normRankErr = getNormalizedRankError(sk.getK(), true);
return new ItemsSketchSortedView<>(
- svQuantiles, svCumWeights, sk.getN(), comparator, sk.getMaxItem(), sk.getMinItem(), normRankErr);
+ svQuantiles, svCumWeights, sk.getN(), comparator, sk.getMaxItem(), sk.getMinItem());
}
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java
index b21cde774..ee53e910f 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericPartitionBoundaries.java
@@ -25,9 +25,10 @@
import org.apache.datasketches.common.SketchesStateException;
/**
- * Implements PartitionBoundaries
+ * This defines the returned results of the getPartitionBoundariesFrom*() functions and
+ * includes the basic methods needed to construct actual partitions.
*/
-final public class GenericPartitionBoundaries implements PartitionBoundaries {
+public final class GenericPartitionBoundaries {
private long totalN; //totalN of source sketch
private T[] boundaries; //quantiles at the boundaries
private long[] natRanks; //natural ranks at the boundaries
@@ -36,7 +37,7 @@ final public class GenericPartitionBoundaries implements PartitionBoundaries
private T minItem; //of the source sketch
private QuantileSearchCriteria searchCrit; //of the source sketch query to getPartitionBoundaries.
//computed
- private long[] numDeltaItems; //num of items in each part
+ private long[] numDeltaItems; //num of items in each partition
private int numPartitions; //num of partitions
public GenericPartitionBoundaries(
@@ -48,7 +49,7 @@ public GenericPartitionBoundaries(
final T minItem,
final QuantileSearchCriteria searchCrit) {
this.totalN = totalN;
- this.boundaries = boundaries; //SpotBugs EI_EXPOSE_REP2 copying from sketch class to this "friend" class.
+ this.boundaries = boundaries; //SpotBugs EI_EXPOSE_REP2 OK: copying from sketch class to this "friend" class.
this.natRanks = natRanks; // "
this.normRanks = normRanks; // "
this.maxItem = maxItem;
@@ -56,7 +57,7 @@ public GenericPartitionBoundaries(
this.searchCrit = searchCrit;
//check and compute
final int len = boundaries.length;
- if (len < 2) { throw new SketchesStateException("Source sketch is empty"); }
+ if (len < 2) { throw new SketchesStateException("Source sketch is empty"); } //class is final, this is ok
numDeltaItems = new long[len];
numDeltaItems[0] = 0; // index 0 is always 0
for (int i = 1; i < len; i++) {
@@ -67,7 +68,10 @@ public GenericPartitionBoundaries(
this.numPartitions = len - 1;
}
- @Override
+ /**
+ * Gets the length of the input stream offered to the underlying sketch.
+ * @return the length of the input stream offered to the underlying sketch.
+ */
public long getN() { return totalN; }
/**
@@ -100,16 +104,32 @@ public GenericPartitionBoundaries(
*/
public T[] getBoundaries() { return boundaries.clone(); }
- @Override
+ /**
+ * Gets an ordered array of natural ranks of the associated array of partition boundaries utilizing
+ * a specified search criterion. Natural ranks are integral values on the interval [1, N]
+ * @return an array of natural ranks.
+ */
public long[] getNaturalRanks() { return natRanks.clone(); }
- @Override
+ /**
+ * Gets an ordered array of normalized ranks of the associated array of partition boundaries utilizing
+ * a specified search criterion. Normalized ranks are double values on the interval [0.0, 1.0].
+ * @return an array of normalized ranks.
+ */
public double[] getNormalizedRanks() { return normRanks.clone(); }
- @Override
+ /**
+ * Gets the number of items to be included for each partition as an array.
+ * The count at index 0 is 0. The number of items included in the first partition, defined by the boundaries at
+ * index 0 and index 1, is at index 1 in this array, etc.
+ * @return the number of items to be included for each partition as an array.
+ */
public long[] getNumDeltaItems() { return numDeltaItems.clone(); }
- @Override
+ /**
+ * Gets the number of partitions
+ * @return the number of partitions
+ */
public int getNumPartitions() { return numPartitions; }
/**
@@ -130,7 +150,10 @@ public GenericPartitionBoundaries(
*/
public T getMinItem() { return minItem; }
- @Override
+ /**
+ * Gets the search criteria specified for the source sketch
+ * @return The search criteria specified for the source sketch
+ */
public QuantileSearchCriteria getSearchCriteria() { return searchCrit; }
}
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java
index 1c54395fa..4f9da76a5 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedView.java
@@ -30,7 +30,7 @@
* @author Alexander Saydakov
* @author Lee Rhodes
*/
-public interface GenericSortedView extends PartitioningFeature, SortedView {
+public interface GenericSortedView extends PartitioningFeature, SketchPartitionLimits, SortedView {
/**
* Returns an approximation to the Cumulative Distribution Function (CDF) of the input stream
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java
index 5fbcf4347..57735c6df 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java
@@ -19,6 +19,7 @@
package org.apache.datasketches.quantilescommon;
+import static java.lang.Math.min;
import static org.apache.datasketches.quantilescommon.GenericInequalitySearch.find;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG;
@@ -38,7 +39,6 @@
* @author Lee Rhodes
*/
public class ItemsSketchSortedView implements GenericSortedView {
- private static final double PARTITIONING_ERROR_FACTOR = 2.0;
private final T[] quantiles;
private final long[] cumWeights; //cumulative natural weights
private final long totalN;
@@ -46,7 +46,6 @@ public class ItemsSketchSortedView implements GenericSortedView {
private final T maxItem;
private final T minItem;
private final Class clazz;
- private final double normRankErr;//assumes PMF type error
/**
* Construct from elements, also used in testing.
@@ -56,7 +55,6 @@ public class ItemsSketchSortedView implements GenericSortedView {
* @param comparator the Comparator for type T
* @param maxItem of type T
* @param minItem of type T
- * @param normRankErr the normalized rank error of the originating sketch.
*/
@SuppressWarnings("unchecked")
public ItemsSketchSortedView(
@@ -65,8 +63,7 @@ public ItemsSketchSortedView(
final long totalN,
final Comparator super T> comparator,
final T maxItem,
- final T minItem,
- final double normRankErr) {
+ final T minItem) {
this.quantiles = quantiles;
this.cumWeights = cumWeights;
this.totalN = totalN;
@@ -74,7 +71,6 @@ public ItemsSketchSortedView(
this.maxItem = maxItem;
this.minItem = minItem;
this.clazz = (Class)quantiles[0].getClass();
- this.normRankErr = normRankErr;
}
//end of constructors
@@ -118,29 +114,35 @@ public int getNumRetained() {
}
@Override
- @SuppressWarnings("unchecked")
- public GenericPartitionBoundaries getPartitionBoundaries(final int numEquallySized,
+ public GenericPartitionBoundaries getPartitionBoundariesFromPartSize(
+ final long nominalPartitionSize,
final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); }
- final long totalN = this.totalN;
- final int maxParts = (int) (totalN / Math.ceil(normRankErr * PARTITIONING_ERROR_FACTOR) );
- final int svLen = cumWeights.length;
-
- if (numEquallySized > maxParts) {
+ final long partSizeItems = getMinPartitionSizeItems(); //smallest partition size this view accepts
+ if (nominalPartitionSize < partSizeItems) {
throw new SketchesArgumentException(QuantilesAPI.UNSUPPORTED_MSG
- + "The requested number of partitions is too large for the 'k' of this sketch "
- + "if it exceeds the maximum number of partitions allowed by the error threshold for the 'k' of this sketch."
- + "Requested Partitions: " + numEquallySized + " > " + maxParts);
+ + " The requested nominal partition size is too small for this sketch.");
}
- if (numEquallySized > svLen / 2.0) {
+ final long totalN = this.totalN; //parts = N / requested size, clamped to [1, getMaxPartitions()]
+ final int numEquallySizedParts = (int) min(Math.max(1L, totalN / nominalPartitionSize), getMaxPartitions());
+ return getPartitionBoundariesFromNumParts(numEquallySizedParts, searchCrit); //keep the caller's criterion
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
+ final int numEquallySizedParts,
+ final QuantileSearchCriteria searchCrit) {
+ if (isEmpty()) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); }
+ final int maxParts = getMaxPartitions();
+ if (numEquallySizedParts > maxParts) {
throw new SketchesArgumentException(QuantilesAPI.UNSUPPORTED_MSG
- + "The requested number of partitions is too large for the number of retained items "
- + "if it exceeds maximum number of retained items divided by 2."
- + "Requested Partitions: " + numEquallySized + " > "
- + "Retained Items / 2: " + (svLen / 2));
+ + " The requested number of partitions is too large for this sketch.");
}
+ final long totalN = this.totalN;
+ final int svLen = cumWeights.length;
- final double[] searchNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1);
+ final double[] searchNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySizedParts + 1);
final int partArrLen = searchNormRanks.length;
final T[] partQuantiles = (T[]) Array.newInstance(clazz, partArrLen);
final long[] partNatRanks = new long[partArrLen];
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java
deleted file mode 100644
index e3c59d2c7..000000000
--- a/src/main/java/org/apache/datasketches/quantilescommon/PartitionBoundaries.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.datasketches.quantilescommon;
-
-/**
- * This defines a set of results computed from the getParitionBoundaries() function and
- * encapsulates the basic methods needed to construct actual partitions based on generic items.
- */
-public interface PartitionBoundaries {
-
- /**
- * Gets the length of the input stream offered to the underlying sketch.
- * @return the length of the input stream offered to the underlying sketch.
- */
- long getN();
-
- /**
- * Gets an ordered array of natural ranks of the associated array of partition boundaries utilizing
- * a specified search criterion. Natural ranks are integral values on the interval [1, N]
- * @return an array of natural ranks.
- */
- long[] getNaturalRanks();
-
- /**
- * Gets an ordered array of normalized ranks of the associated array of partition boundaries utilizing
- * a specified search criterion. Normalized ranks are double values on the interval [0.0, 1.0].
- * @return an array of normalized ranks.
- */
- double[] getNormalizedRanks();
-
- /**
- * Gets the number of items to be included for each partition as an array.
- * The count at index 0 is 0. The number of items included in the first partition, defined by the boundaries at
- * index 0 and index 1, is at index 1 in this array, etc.
- * @return the number of items to be included for each partition as an array.
- */
- long[] getNumDeltaItems();
-
- /**
- * Gets the number of partitions
- * @return the number of partitions
- */
- int getNumPartitions();
-
- /**
- * Gets the search criteria specified for the source sketch
- * @return The search criteria specified for the source sketch
- */
- QuantileSearchCriteria getSearchCriteria();
-}
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java
index 2c36bb10a..3d35cfe97 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java
@@ -33,13 +33,15 @@ public interface PartitioningFeature {
* refers to an approximately equal number of items per partition.
*
* This method is equivalent to
- * {@link #getPartitionBoundaries(int, QuantileSearchCriteria) getPartitionBoundaries(numEquallySized, INCLUSIVE)}.
+ * {@link #getPartitionBoundariesFromNumParts(int, QuantileSearchCriteria)
+ * getPartitionBoundariesFromNumParts(numEquallySizedParts, INCLUSIVE)}.
*
*
- * @param numEquallySized an integer that specifies the number of equally sized partitions between
+ * @param numEquallySizedParts an integer that specifies the number of equally sized partitions between
* {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and
* {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}.
- * This must be a positive integer greater than zero.
+ * This must be a positive integer no greater than
+ * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()}
*
* - A 1 will return: minItem, maxItem.
* - A 2 will return: minItem, median quantile, maxItem.
@@ -47,11 +49,12 @@ public interface PartitioningFeature {
*
*
* @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
- * @throws IllegalArgumentException if sketch is empty.
- * @throws IllegalArgumentException if numEquallySized is less than 1.
+ * @throws SketchesArgumentException if sketch is empty.
+ * @throws SketchesArgumentException if numEquallySizedParts is greater than
+ * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()}
*/
- default GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized) {
- return getPartitionBoundaries(numEquallySized, INCLUSIVE);
+ default GenericPartitionBoundaries getPartitionBoundariesFromNumParts(int numEquallySizedParts) {
+ return getPartitionBoundariesFromNumParts(numEquallySizedParts, INCLUSIVE);
}
/**
@@ -60,10 +63,11 @@ default GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized
* sufficient information for the user to create the given number of equally sized partitions, where "equally sized"
* refers to an approximately equal number of items per partition.
*
- * @param numEquallySized an integer that specifies the number of equally sized partitions between
+ * @param numEquallySizedParts an integer that specifies the number of equally sized partitions between
* {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and
* {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}.
- * This must be a positive integer greater than zero.
+ * This must be a positive integer no greater than
+ * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()}
*
* - A 1 will return: minItem, maxItem.
* - A 2 will return: minItem, median quantile, maxItem.
@@ -77,9 +81,59 @@ default GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized
* with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition.
*
* @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
- * @throws IllegalArgumentException if sketch is empty.
- * @throws IllegalArgumentException if numEquallySized is less than 1.
+ * @throws SketchesArgumentException if sketch is empty.
+ * @throws SketchesArgumentException if numEquallySizedParts is greater than
+ * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()}
*/
- GenericPartitionBoundaries getPartitionBoundaries(int numEquallySized, QuantileSearchCriteria searchCrit);
+ GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
+ int numEquallySizedParts, QuantileSearchCriteria searchCrit);
+
+ /**
+ * This method returns an instance of
+ * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides
+ * sufficient information for the user to create equally sized partitions of the given nominal size, where "equally sized"
+ * refers to an approximately equal number of items per partition.
+ *
+ * This method is equivalent to
+ * {@link #getPartitionBoundariesFromPartSize(long, QuantileSearchCriteria)
+ * getPartitionBoundariesFromPartSize(nominalPartSizeItems, INCLUSIVE)}.
+ *
+ *
+ * @param nominalPartSizeItems an integer that specifies the nominal size, in items, of each target partition.
+ * This must be a positive integer no less than
+ * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}
+ *
+ * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
+ * @throws SketchesArgumentException if sketch is empty.
+ * @throws SketchesArgumentException if nominalPartSizeItems is less than
+ * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}
+ */
+ default GenericPartitionBoundaries getPartitionBoundariesFromPartSize(long nominalPartSizeItems) {
+ return getPartitionBoundariesFromPartSize(nominalPartSizeItems, INCLUSIVE);
+ }
+
+ /**
+ * This method returns an instance of
+ * {@link GenericPartitionBoundaries GenericPartitionBoundaries} which provides
+ * sufficient information for the user to create equally sized partitions of the given nominal size, where "equally sized"
+ * refers to an approximately equal number of items per partition.
+ *
+ * @param nominalPartSizeItems an integer that specifies the nominal size, in items, of each target partition.
+ * This must be a positive integer no less than
+ * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}.
+ *
+ * @param searchCrit
+ * If INCLUSIVE, all the returned quantiles are the upper boundaries of the equally sized partitions
+ * with the exception of the lowest returned quantile, which is the lowest boundary of the lowest ranked partition.
+ * If EXCLUSIVE, all the returned quantiles are the lower boundaries of the equally sized partitions
+ * with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition.
+ *
+ * @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
+ * @throws SketchesArgumentException if sketch is empty.
+ * @throws SketchesArgumentException if nominalPartSizeItems is less than
+ * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}
+ */
+ GenericPartitionBoundaries getPartitionBoundariesFromPartSize(
+ long nominalPartSizeItems, QuantileSearchCriteria searchCrit);
}
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java
index 404ec7a7b..d422f15c4 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java
@@ -27,7 +27,7 @@
* @param The given item type
* @author Lee Rhodes
*/
-public interface QuantilesGenericAPI extends QuantilesAPI, PartitioningFeature {
+public interface QuantilesGenericAPI extends QuantilesAPI, PartitioningFeature, SketchPartitionLimits {
/**
* This is equivalent to {@link #getCDF(Object[], QuantileSearchCriteria) getCDF(splitPoints, INCLUSIVE)}
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java b/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java
new file mode 100644
index 000000000..578f142af
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.quantilescommon;
+
+import static org.apache.datasketches.quantilescommon.QuantilesAPI.EMPTY_MSG;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+
+/**
+ * This defines the methods required to compute the partition limits.
+ */
+public interface SketchPartitionLimits {
+
+ /**
+ * Gets the maximum number of partitions this sketch will support, which is half (rounded down)
+ * of the number of retained values of this sketch.
+ * @return the maximum number of partitions this sketch will support.
+ */
+ default int getMaxPartitions() {
+ return getNumRetained() / 2;
+ }
+
+ /**
+ * Gets the minimum partition size in items this sketch will support: N / max(1, getMaxPartitions()).
+ * @return the minimum partition size in items this sketch will support.
+ * @throws SketchesArgumentException if N is not positive (the sketch is empty).
+ */
+ default long getMinPartitionSizeItems() {
+ final long totalN = getN();
+ if (totalN <= 0) { throw new SketchesArgumentException(EMPTY_MSG); }
+ return totalN / Math.max(1, getMaxPartitions()); //max partitions is 0 when fewer than 2 values are retained
+ }
+
+ /**
+ * Gets the length of the input stream offered to the sketch.
+ * @return the length of the input stream offered to the sketch.
+ */
+ long getN();
+
+ /**
+ * Gets the number of quantiles retained by the sketch.
+ * @return the number of quantiles retained by the sketch
+ */
+ int getNumRetained();
+
+}
diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
index 02990ae0d..69e736677 100644
--- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
@@ -19,7 +19,6 @@
package org.apache.datasketches.quantiles;
-import static org.apache.datasketches.quantiles.PreambleUtil.DEFAULT_K;
import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
@@ -623,8 +622,7 @@ public void sortedView2() {
Double[] qArr = {8.0, 10.0, 10.0, 20.0};
long[] cwArr = {1, 3, 4, 5};
Comparator comp = Comparator.naturalOrder();
- final double normRankErr = ItemsSketch.getNormalizedRankError(DEFAULT_K, true);
- ItemsSketchSortedView sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0, normRankErr);
+ ItemsSketchSortedView sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0);
double[] ranks = {0, .1, .2, .3, .6, .7, .8, .9, 1.0};
Double[] qOut = new Double[9];
for (int i = 0; i < ranks.length; i++) {
diff --git a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java
index dd07ae602..00cd380fb 100644
--- a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java
+++ b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java
@@ -39,7 +39,6 @@
import org.apache.datasketches.kll.KllDoublesSketch;
import org.apache.datasketches.kll.KllFloatsSketch;
import org.apache.datasketches.kll.KllItemsSketch;
-import org.apache.datasketches.kll.KllSketch;
import org.apache.datasketches.quantiles.DoublesSketch;
import org.apache.datasketches.quantiles.ItemsSketch;
import org.apache.datasketches.quantiles.UpdateDoublesSketch;
@@ -321,11 +320,10 @@ private void buildSVs(int set) throws Exception {
String svImin = svIValues[set][0];
kllItemsSV = new ItemsSketchSortedView<>(svIValues[set], svCumWeights[set], totalN[set],
- comparator, svImax, svImin, KllSketch.getNormalizedRankError(k, true));
+ comparator, svImax, svImin);
classicItemsSV = new ItemsSketchSortedView<>(svIValues[set], svCumWeights[set], totalN[set],
- comparator, svImax, svImin, ItemsSketch.getNormalizedRankError(k, true));
-
+ comparator, svImax, svImin);
}
/********BUILD DATA SETS**********/
diff --git a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
index d4a3eb434..1849c04fa 100644
--- a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
+++ b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
@@ -71,7 +71,7 @@ public void checkSkewWithClassic() {
printf(rowdfmt2, j++, itr.getNormalizedRank(searchCrit), itr.getNaturalRank(searchCrit), itr.getQuantile());
}
- GenericPartitionBoundaries gpb = sv.getPartitionBoundaries(numParts, searchCrit);
+ GenericPartitionBoundaries gpb = sv.getPartitionBoundariesFromNumParts(numParts, searchCrit);
int arrLen = gpb.getBoundaries().length;
double[] normRanks = gpb.getNormalizedRanks();
long[] natRanks = gpb.getNaturalRanks();
@@ -111,7 +111,7 @@ public void checkSkewWithKll() {
printf(rowdfmt2, j++, itr.getNormalizedRank(searchCrit), itr.getNaturalRank(searchCrit), itr.getQuantile());
}
- GenericPartitionBoundaries gpb = sv.getPartitionBoundaries(numParts, searchCrit);
+ GenericPartitionBoundaries gpb = sv.getPartitionBoundariesFromNumParts(numParts, searchCrit);
int arrLen = gpb.getBoundaries().length;
double[] normRanks = gpb.getNormalizedRanks();
long[] natRanks = gpb.getNaturalRanks();
@@ -136,10 +136,10 @@ public void getQuantilesVsPartitionBoundariesKll() {
sketch.update("C");
sketch.update("D");
String[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE);
- String[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries();
+ String[] quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, EXCLUSIVE).getBoundaries();
assertEquals(quantiles1, quantiles2);
quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE);
- quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries();
+ quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, INCLUSIVE).getBoundaries();
assertEquals(quantiles1, quantiles2);
}
@@ -151,10 +151,10 @@ public void getQuantilesVsPartitionBoundariesClassic() {
sketch.update(3);
sketch.update(4);
Integer[] quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, EXCLUSIVE);
- Integer[] quantiles2 = sketch.getPartitionBoundaries(2, EXCLUSIVE).getBoundaries();
+ Integer[] quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, EXCLUSIVE).getBoundaries();
assertEquals(quantiles1, quantiles2);
quantiles1 = sketch.getQuantiles(new double[] {0.0, 0.5, 1.0}, INCLUSIVE);
- quantiles2 = sketch.getPartitionBoundaries(2, INCLUSIVE).getBoundaries();
+ quantiles2 = sketch.getPartitionBoundariesFromNumParts(2, INCLUSIVE).getBoundaries();
assertEquals(quantiles1, quantiles2);
}
@@ -164,22 +164,20 @@ public void getQuantilesVsPartitionBoundariesClassic() {
*/
@Test
public void checkSimpleEndsAdjustment() {
- final int k = 128;
final String[] quantiles = {"2","4","6","7"};
final long[] cumWeights = {2, 4, 6, 8};
final long totalN = 8;
final Comparator comparator = Comparator.naturalOrder();
final String maxItem = "8";
final String minItem = "1";
- final double normRankErr = ItemsSketch.getNormalizedRankError(k, true);
ItemsSketchSortedView sv = new ItemsSketchSortedView<>(
- quantiles, cumWeights, totalN, comparator, maxItem, minItem, normRankErr);
+ quantiles, cumWeights, totalN, comparator, maxItem, minItem);
GenericSortedViewIterator itr = sv.iterator();
while (itr.next()) {
println(itr.getNaturalRank(INCLUSIVE) + ", " + itr.getQuantile(INCLUSIVE));
}
- GenericPartitionBoundaries gpb = sv.getPartitionBoundaries(2);
+ GenericPartitionBoundaries gpb = sv.getPartitionBoundariesFromNumParts(2);
String[] boundaries = gpb.getBoundaries();
long[] natRanks = gpb.getNaturalRanks();
double[] normRanks = gpb.getNormalizedRanks();
From 30a813d50fa267e0ff73afe0bc1dbaffc7e299e9 Mon Sep 17 00:00:00 2001
From: Lee Rhodes
Date: Fri, 12 Apr 2024 12:36:34 -0700
Subject: [PATCH 2/4] Added tests for new partition limits
---
.../PartitionBoundariesTest.java | 42 +++++++++++++++++--
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
index 1849c04fa..abbce503f 100644
--- a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
+++ b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
@@ -28,6 +28,7 @@
import java.util.Comparator;
import org.apache.datasketches.common.ArrayOfStringsSerDe;
+import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.kll.KllItemsSketch;
import org.apache.datasketches.quantiles.ItemsSketch;
import org.testng.annotations.Test;
@@ -49,6 +50,7 @@ public class PartitionBoundariesTest {
private static String rowhdrfmt2= "%5s %12s %12s %12s\n";
private static String rowdfmt2 = "%5d %12.8f %12d %12s\n";
+ //@Test //visual check only. set enablePrinting = true to view.
public void checkSkewWithClassic() {
int n = 2050;
int k = 1 << 15;
@@ -88,7 +90,7 @@ public void checkSkewWithClassic() {
}
}
- @Test
+ //@Test //visual check only. set enablePrinting = true to view.
public void checkSkewWithKll() {
int n = 2050;
int k = 1 << 15;
@@ -159,8 +161,8 @@ public void getQuantilesVsPartitionBoundariesClassic() {
}
/**
- * Because both Kll and Classic items sketches use the same Sorted View class
- * this test applies to both. The only difference is a different normalized error given the same k.
+ * Because both Kll and Classic items sketches use the same Sorted View class,
+ * this test applies to both.
*/
@Test
public void checkSimpleEndsAdjustment() {
@@ -194,6 +196,40 @@ public void checkSimpleEndsAdjustment() {
assertEquals(minItm, "1");
}
+ @Test(expectedExceptions = SketchesArgumentException.class)
+ public void checkSketchPartitionLimits() {
+ final long totalN = 1_000_000;
+ final Comparator comparator = Comparator.naturalOrder();
+ final ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe();
+ final KllItemsSketch sk = KllItemsSketch.newHeapInstance(comparator, serDe);
+ final int d = digits(totalN);
+ for (int i = 1; i <= totalN; i++) {
+ sk.update(getString(i, d));
+ }
+ final int numLimit = sk.getMaxPartitions();
+ final int ret = sk.getNumRetained();
+ println("ret: " + ret + ", numLimit " + numLimit);
+ @SuppressWarnings("unused")
+ GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromNumParts(numLimit + 1);
+ }
+
+ @Test(expectedExceptions = SketchesArgumentException.class)
+ public void checkSketchPartitionLimits2() {
+ final long totalN = 1_000_000;
+ final Comparator comparator = Comparator.naturalOrder();
+ final ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe();
+ final KllItemsSketch sk = KllItemsSketch.newHeapInstance(comparator, serDe);
+ final int d = digits(totalN);
+ for (int i = 1; i <= totalN; i++) {
+ sk.update(getString(i, d));
+ }
+ final long sizeLimit= sk.getMinPartitionSizeItems();
+
+ println("Min Size Limit: " + sizeLimit);
+ @SuppressWarnings("unused")
+ GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromPartSize(sizeLimit - 1);
+ }
+
@Test
public void printlnTest() {
println("PRINTING: " + this.getClass().getName());
From 5cc7887f4e4cd9857441923aa5bf7241fa217877 Mon Sep 17 00:00:00 2001
From: Lee Rhodes
Date: Fri, 12 Apr 2024 13:20:18 -0700
Subject: [PATCH 3/4] Removed javadoc @Throws clauses -- because they duplicate
what is already documented in the body of the javadoc.
---
.../quantilescommon/PartitioningFeature.java | 20 ++++++++-----------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java
index 3d35cfe97..380f57f08 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/PartitioningFeature.java
@@ -37,6 +37,8 @@ public interface PartitioningFeature {
* getPartitionBoundariesFromNumParts(numEquallySizedParts, INCLUSIVE)}.
*
*
+ * The sketch must not be empty.
+ *
* @param numEquallySizedParts an integer that specifies the number of equally sized partitions between
* {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and
* {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}.
@@ -49,9 +51,6 @@ public interface PartitioningFeature {
*
*
* @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
- * @throws SketchesArgumentException if sketch is empty.
- * @throws SketchesArgumentException if numEquallySized is greater than
- * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()}
*/
default GenericPartitionBoundaries getPartitionBoundariesFromNumParts(int numEquallySizedParts) {
return getPartitionBoundariesFromNumParts(numEquallySizedParts, INCLUSIVE);
@@ -63,6 +62,8 @@ default GenericPartitionBoundaries getPartitionBoundariesFromNumParts(int num
* sufficient information for the user to create the given number of equally sized partitions, where "equally sized"
* refers to an approximately equal number of items per partition.
*
+ * The sketch must not be empty.
+ *
* @param numEquallySizedParts an integer that specifies the number of equally sized partitions between
* {@link GenericPartitionBoundaries#getMinItem() getMinItem()} and
* {@link GenericPartitionBoundaries#getMaxItem() getMaxItem()}.
@@ -81,9 +82,6 @@ default GenericPartitionBoundaries getPartitionBoundariesFromNumParts(int num
* with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition.
*
* @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
- * @throws SketchesArgumentException if sketch is empty.
- * @throws SketchesArgumentException if numEquallySized is greater than
- * {@link SketchPartitionLimits#getMaxPartitions() getMaxPartitions()}
*/
GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
int numEquallySizedParts, QuantileSearchCriteria searchCrit);
@@ -99,14 +97,13 @@ GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
* getPartitionBoundariesFromPartSize(nominalPartSizeItems, INCLUSIVE)}.
*
*
+ * The sketch must not be empty.
+ *
* @param nominalPartSizeItems an integer that specifies the nominal size, in items, of each target partition.
* This must be a positive integer greater than
* {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}
*
* @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
- * @throws SketchesArgumentException if sketch is empty.
- * @throws SketchesArgumentException if nominalPartSizeItems is less than
- * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}
*/
default GenericPartitionBoundaries getPartitionBoundariesFromPartSize(long nominalPartSizeItems) {
return getPartitionBoundariesFromPartSize(nominalPartSizeItems, INCLUSIVE);
@@ -118,6 +115,8 @@ default GenericPartitionBoundaries getPartitionBoundariesFromPartSize(long no
* sufficient information for the user to create the given number of equally sized partitions, where "equally sized"
* refers to an approximately equal number of items per partition.
*
+ * The sketch must not be empty.
+ *
* @param nominalPartSizeItems an integer that specifies the nominal size, in items, of each target partition.
* This must be a positive integer greater than
* {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}.
@@ -129,9 +128,6 @@ default GenericPartitionBoundaries getPartitionBoundariesFromPartSize(long no
* with the exception of the highest returned quantile, which is the upper boundary of the highest ranked partition.
*
* @return an instance of {@link GenericPartitionBoundaries GenericPartitionBoundaries}.
- * @throws SketchesArgumentException if sketch is empty.
- * @throws SketchesArgumentException if nominalPartSizeItems is less than
- * {@link SketchPartitionLimits#getMinPartitionSizeItems() getMinPartitionSizeItems()}
*/
GenericPartitionBoundaries getPartitionBoundariesFromPartSize(
long nominalPartSizeItems, QuantileSearchCriteria searchCrit);
From de4ef799d91807e1aa97e2840a06297bc9ad6f38 Mon Sep 17 00:00:00 2001
From: Lee Rhodes
Date: Sat, 13 Apr 2024 22:37:58 -0700
Subject: [PATCH 4/4] Added two limit check methods:
- getMaxPartitions()
- getMinPartitionSizeItems()
---
.../datasketches/kll/KllItemsSketch.java | 15 ++--
.../datasketches/quantiles/ItemsSketch.java | 18 ++---
.../ItemsSketchSortedView.java | 47 ++++++++---
.../quantilescommon/QuantilesGenericAPI.java | 24 ++++--
.../SketchPartitionLimits.java | 10 +--
.../quantiles/ItemsSketchTest.java | 20 -----
.../CrossCheckQuantilesTest.java | 8 +-
.../PartitionBoundariesTest.java | 80 +++++++++++++------
8 files changed, 132 insertions(+), 90 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java
index efcca934b..6ad57cf60 100644
--- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java
@@ -145,9 +145,6 @@ public static KllItemsSketch wrap(
//END of Constructors
- @Override
- public Class getClassOfT() { return serDe.getClassOfT(); }
-
@Override
public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new SketchesArgumentException(EMPTY_MSG); }
@@ -155,6 +152,14 @@ public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searc
return itemsSV.getCDF(splitPoints, searchCrit);
}
+ @Override
+ public Class getClassOfT() { return serDe.getClassOfT(); }
+
+ @Override
+ public Comparator super T> getComparator() {
+ return comparator;
+ }
+
@Override
public GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
final int numEquallySizedParts,
@@ -434,8 +439,8 @@ ItemsSketchSortedView getSV() {
quantiles = (T[]) Array.newInstance(serDe.getClassOfT(), numQuantiles);
cumWeights = new long[numQuantiles];
populateFromSketch(srcQuantiles, srcLevels, srcNumLevels, numQuantiles);
- return new ItemsSketchSortedView(
- quantiles, cumWeights, getN(), comparator, getMaxItem(), getMinItem());
+ final QuantilesGenericAPI sk = KllItemsSketch.this;
+ return new ItemsSketchSortedView(quantiles, cumWeights, sk);
}
private void populateFromSketch(final Object[] srcQuantiles, final int[] srcLevels,
diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
index 3d2d33882..00e0f046f 100644
--- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
+++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
@@ -254,9 +254,6 @@ static ItemsSketch copy(final ItemsSketch sketch) {
//END of Constructors
- @Override
- public Class getClassOfT() { return clazz; }
-
@Override
public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); }
@@ -264,6 +261,14 @@ public double[] getCDF(final T[] splitPoints, final QuantileSearchCriteria searc
return classicQisSV.getCDF(splitPoints, searchCrit);
}
+ @Override
+ public Class getClassOfT() { return clazz; }
+
+ @Override
+ public Comparator super T> getComparator() {
+ return comparator_;
+ }
+
@Override
public T getMaxItem() {
if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); }
@@ -587,10 +592,6 @@ Object[] getCombinedBuffer() {
return combinedBuffer_;
}
- Comparator super T> getComparator() {
- return comparator_;
- }
-
/**
* Loads the Combined Buffer, min and max from the given items array.
* The Combined Buffer is always in non-compact form and must be pre-allocated.
@@ -666,8 +667,7 @@ private static ItemsSketchSortedView getSV(final ItemsSketch sk) {
throw new SketchesStateException("Sorted View is misconfigured. TotalN does not match cumWeights.");
}
- return new ItemsSketchSortedView<>(
- svQuantiles, svCumWeights, sk.getN(), comparator, sk.getMaxItem(), sk.getMinItem());
+ return new ItemsSketchSortedView<>(svQuantiles, svCumWeights, sk);
}
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java
index 57735c6df..67e92ad27 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/ItemsSketchSortedView.java
@@ -46,31 +46,50 @@ public class ItemsSketchSortedView implements GenericSortedView {
private final T maxItem;
private final T minItem;
private final Class clazz;
+ private final double normRankError;
+ private final int numRetItems;
/**
* Construct from elements, also used in testing.
* @param quantiles sorted array of quantiles
* @param cumWeights sorted, monotonically increasing cumulative weights.
- * @param totalN the total number of items presented to the sketch.
- * @param comparator the Comparator for type T
- * @param maxItem of type T
- * @param minItem of type T
+ * @param sk the underlying quantile sketch.
*/
- @SuppressWarnings("unchecked")
public ItemsSketchSortedView(
final T[] quantiles,
final long[] cumWeights, //or Natural Ranks
+ final QuantilesGenericAPI sk) {
+ this.quantiles = quantiles;
+ this.cumWeights = cumWeights;
+ this.totalN = sk.getN();
+ this.comparator = sk.getComparator();
+ this.maxItem = sk.getMaxItem();
+ this.minItem = sk.getMinItem();
+ this.clazz = sk.getClassOfT();
+ this.normRankError = sk.getNormalizedRankError(true);
+ this.numRetItems = sk.getNumRetained();
+ }
+
+ //Used for testing
+ ItemsSketchSortedView(
+ final T[] quantiles,
+ final long[] cumWeights,
final long totalN,
final Comparator super T> comparator,
final T maxItem,
- final T minItem) {
+ final T minItem,
+ final Class clazz,
+ final double normRankError,
+ final int numRetItems) {
this.quantiles = quantiles;
this.cumWeights = cumWeights;
this.totalN = totalN;
this.comparator = comparator;
this.maxItem = maxItem;
this.minItem = minItem;
- this.clazz = (Class)quantiles[0].getClass();
+ this.clazz = clazz;
+ this.normRankError = normRankError;
+ this.numRetItems = numRetItems;
}
//end of constructors
@@ -113,18 +132,23 @@ public int getNumRetained() {
return quantiles.length;
}
+ @Override
+ public int getMaxPartitions() {
+ return (int) min(1.0 / normRankError, numRetItems / 2.0);
+ }
+
@Override
public GenericPartitionBoundaries getPartitionBoundariesFromPartSize(
final long nominalPartitionSize,
final QuantileSearchCriteria searchCrit) {
if (isEmpty()) { throw new SketchesArgumentException(QuantilesAPI.EMPTY_MSG); }
- final long partSizeItems = getMinPartitionSizeItems();
- if (nominalPartitionSize < partSizeItems) {
+ final long minPartSizeItems = getMinPartitionSizeItems();
+ if (nominalPartitionSize < minPartSizeItems) {
throw new SketchesArgumentException(QuantilesAPI.UNSUPPORTED_MSG
+ " The requested nominal partition size is too small for this sketch.");
}
final long totalN = this.totalN;
- final int numEquallySizedParts = (int) min(totalN / partSizeItems, getMaxPartitions());
+ final int numEquallySizedParts = (int) min(totalN / minPartSizeItems, getMaxPartitions());
return getPartitionBoundariesFromNumParts(numEquallySizedParts);
}
@@ -139,8 +163,6 @@ public GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
throw new SketchesArgumentException(QuantilesAPI.UNSUPPORTED_MSG
+ " The requested number of partitions is too large for this sketch.");
}
- final long totalN = this.totalN;
- final int svLen = cumWeights.length;
final double[] searchNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySizedParts + 1);
final int partArrLen = searchNormRanks.length;
@@ -152,6 +174,7 @@ public GenericPartitionBoundaries getPartitionBoundariesFromNumParts(
// which are absolutely required when partitioning, especially inner partitions.
//Are the minItem and maxItem already in place?
+ final int svLen = cumWeights.length;
int adjLen = svLen; //this will be the length of the local copies of quantiles and cumWeights
final boolean adjLow = quantiles[0] != minItem; //if true, adjust the low end
final boolean adjHigh = quantiles[svLen - 1] != maxItem; //if true, adjust the high end
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java
index d422f15c4..459e58cdd 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/QuantilesGenericAPI.java
@@ -19,8 +19,11 @@
package org.apache.datasketches.quantilescommon;
+import static java.lang.Math.min;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
+import java.util.Comparator;
+
/**
* The Quantiles API for item type generic.
* @see QuantilesAPI
@@ -74,6 +77,17 @@ default double[] getCDF(T[] splitPoints) {
*/
double[] getCDF(T[] splitPoints, QuantileSearchCriteria searchCrit);
+ /**
+ * @return the sketch item class
+ */
+ Class getClassOfT();
+
+ /**
+ * Returns the Comparator of T
+ * @return Comparator of the sketch
+ */
+ Comparator super T> getComparator();
+
/**
* Returns the maximum item of the stream. This may be distinct from the largest item retained by the
* sketch algorithm.
@@ -83,6 +97,11 @@ default double[] getCDF(T[] splitPoints) {
*/
T getMaxItem();
+ @Override
+ default int getMaxPartitions() {
+ return (int) min(1.0 / getNormalizedRankError(true), getNumRetained() / 2.0);
+ }
+
/**
* Returns the minimum item of the stream. This may be distinct from the smallest item retained by the
* sketch algorithm.
@@ -267,11 +286,6 @@ default double[] getRanks(T[] quantiles) {
*/
double[] getRanks(T[] quantiles, QuantileSearchCriteria searchCrit);
- /**
- * @return the sketch item class
- */
- Class getClassOfT();
-
/**
* Gets the sorted view of this sketch
* @return the sorted view of this sketch
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java b/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java
index 578f142af..624ec21ca 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/SketchPartitionLimits.java
@@ -33,9 +33,7 @@ public interface SketchPartitionLimits {
* and the number of retained values of this sketch.
* @return the maximum number of partitions this sketch will support.
*/
- default int getMaxPartitions() {
- return getNumRetained() / 2;
- }
+ int getMaxPartitions();
/**
* Gets the minimum partition size in items this sketch will support based on the configured size K of this
@@ -54,10 +52,4 @@ default long getMinPartitionSizeItems() {
*/
long getN();
- /**
- * Gets the number of quantiles retained by the sketch.
- * @return the number of quantiles retained by the sketch
- */
- int getNumRetained();
-
}
diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
index 69e736677..21d6ce988 100644
--- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
@@ -617,26 +617,6 @@ public void sortedView() {
}
}
- @Test
- public void sortedView2() {
- Double[] qArr = {8.0, 10.0, 10.0, 20.0};
- long[] cwArr = {1, 3, 4, 5};
- Comparator comp = Comparator.naturalOrder();
- ItemsSketchSortedView sv = new ItemsSketchSortedView<>(qArr, cwArr, 5L, comp, 20.0, 8.0);
- double[] ranks = {0, .1, .2, .3, .6, .7, .8, .9, 1.0};
- Double[] qOut = new Double[9];
- for (int i = 0; i < ranks.length; i++) {
- qOut[i] = sv.getQuantile(ranks[i], EXCLUSIVE);
- println("rank: " + ranks[i] + ", quantiles: " + qOut[i]);
- }
- long[] cumWts = sv.getCumulativeWeights();
- Double[] quants = sv.getQuantiles();
- for (int i = 0; i < qArr.length; i++) {
- assertEquals(quants[i], qArr[i]);
- assertEquals(cumWts[i], cwArr[i]);
- }
- }
-
@Test
public void checkIssue484() {
Boolean[] items = { true,false,true,false,true,false,true,false,true,false };
diff --git a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java
index 00cd380fb..9cf2441bd 100644
--- a/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java
+++ b/src/test/java/org/apache/datasketches/quantilescommon/CrossCheckQuantilesTest.java
@@ -295,8 +295,8 @@ private void buildSketches(int set) {
kllFloatsSk = KllFloatsSketch.newHeapInstance(k);
kllDoublesSk = KllDoublesSketch.newHeapInstance(k);
classicDoublesSk = DoublesSketch.builder().setK(k).build();
- kllItemsSk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), serDe);
- itemsSk = ItemsSketch.getInstance(String.class, k, Comparator.naturalOrder());
+ kllItemsSk = KllItemsSketch.newHeapInstance(k, comparator, serDe);
+ itemsSk = ItemsSketch.getInstance(String.class, k, comparator);
int count = skFStreamValues[set].length;
for (int i = 0; i < count; i++) {
@@ -320,10 +320,10 @@ private void buildSVs(int set) throws Exception {
String svImin = svIValues[set][0];
kllItemsSV = new ItemsSketchSortedView<>(svIValues[set], svCumWeights[set], totalN[set],
- comparator, svImax, svImin);
+ comparator, svImax, svImin, String.class, .01, svCumWeights[set].length);
classicItemsSV = new ItemsSketchSortedView<>(svIValues[set], svCumWeights[set], totalN[set],
- comparator, svImax, svImin);
+ comparator, svImax, svImin, String.class, .01, svCumWeights[set].length);
}
/********BUILD DATA SETS**********/
diff --git a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
index abbce503f..030c930b5 100644
--- a/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
+++ b/src/test/java/org/apache/datasketches/quantilescommon/PartitionBoundariesTest.java
@@ -19,11 +19,13 @@
package org.apache.datasketches.quantilescommon;
+import static org.apache.datasketches.common.Util.LS;
import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.digits;
import static org.apache.datasketches.quantilescommon.LongsAsOrderableStrings.getString;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE;
import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.fail;
import java.util.Comparator;
@@ -39,31 +41,31 @@
public class PartitionBoundariesTest {
private ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe();
private static String[] hdr = {"N", "MaxItem", "MinItem", "NumParts", "SearchCriteria"};
- private static String hdrfmt = "%6s %10s %10s %10s %15s\n";
- private static String hdrdfmt = "%6d %10s %10s %10d %15s\n";
+ private static String hdrfmt = "%6s %10s %10s %10s %15s" + LS;
+ private static String hdrdfmt = "%6d %10s %10s %10d %15s" + LS;
private static String[] rowhdr = {"Row", "NormRanks", "NatRanks", "Boundaries", "DeltaItems"};
- private static String rowhdrfmt = "%5s %12s %12s %12s %12s\n";
- private static String rowdfmt = "%5d %12.8f %12d %12s %12d\n";
+ private static String rowhdrfmt = "%5s %12s %12s %12s %12s" + LS;
+ private static String rowdfmt = "%5d %12.8f %12d %12s %12d" + LS;
private static String[] rowhdr2 = {"Row", "NormRanks", "NatRanks", "Boundaries"};
- private static String rowhdrfmt2= "%5s %12s %12s %12s\n";
- private static String rowdfmt2 = "%5d %12.8f %12d %12s\n";
+ private static String rowhdrfmt2= "%5s %12s %12s %12s" + LS;
+ private static String rowdfmt2 = "%5d %12.8f %12d %12s" + LS;
//@Test //visual check only. set enablePrinting = true to view.
public void checkSkewWithClassic() {
- int n = 2050;
+ int n = 2050; //1000000;
int k = 1 << 15;
int n2 = 200;
int totalN = n + n2;
int numDigits = digits(totalN);
long v2 = 1000L;
- int numParts = 22;
QuantileSearchCriteria searchCrit = QuantileSearchCriteria.INCLUSIVE;
ItemsSketch sk = ItemsSketch.getInstance(String.class,k, Comparator.naturalOrder());
for (long i = 1; i <= n; i++) { sk.update(getString(i, numDigits)); }
for (long i = 1; i <= n2; i++) { sk.update(getString(v2, numDigits)); }
+ int numParts = sk.getMaxPartitions(); //22
ItemsSketchSortedView sv = sk.getSortedView();
GenericSortedViewIterator itr = sv.iterator();
println("SORTED VIEW:");
@@ -92,18 +94,18 @@ public void checkSkewWithClassic() {
//@Test //visual check only. set enablePrinting = true to view.
public void checkSkewWithKll() {
- int n = 2050;
+ int n = 2050; //1_000_000;
int k = 1 << 15;
int n2 = 200;
int totalN = n + n2;
int numDigits = digits(totalN);
long v2 = 1000L;
- int numParts = 22;
QuantileSearchCriteria searchCrit = QuantileSearchCriteria.INCLUSIVE;
KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), serDe);
for (long i = 1; i <= n; i++) { sk.update(getString(i, numDigits)); }
for (long i = 1; i <= n2; i++) { sk.update(getString(v2, numDigits)); }
+ int numParts = sk.getMaxPartitions(); //22
ItemsSketchSortedView sv = sk.getSortedView();
GenericSortedViewIterator itr = sv.iterator();
println("SORTED VIEW:");
@@ -173,7 +175,8 @@ public void checkSimpleEndsAdjustment() {
final String maxItem = "8";
final String minItem = "1";
ItemsSketchSortedView sv = new ItemsSketchSortedView<>(
- quantiles, cumWeights, totalN, comparator, maxItem, minItem);
+ quantiles, cumWeights, totalN, comparator, maxItem, minItem,
+ String.class, .01, 4);
GenericSortedViewIterator itr = sv.iterator();
while (itr.next()) {
@@ -196,38 +199,63 @@ public void checkSimpleEndsAdjustment() {
assertEquals(minItm, "1");
}
- @Test(expectedExceptions = SketchesArgumentException.class)
+ @SuppressWarnings("unused")
+ @Test //For visual check, set enablePrinting = true to view.
public void checkSketchPartitionLimits() {
- final long totalN = 1_000_000;
+ final long totalN = 2000; //1_000_000;
final Comparator comparator = Comparator.naturalOrder();
final ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe();
- final KllItemsSketch sk = KllItemsSketch.newHeapInstance(comparator, serDe);
+ final int k = 1 << 15;
+ final KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, comparator, serDe);
final int d = digits(totalN);
for (int i = 1; i <= totalN; i++) {
sk.update(getString(i, d));
}
- final int numLimit = sk.getMaxPartitions();
- final int ret = sk.getNumRetained();
- println("ret: " + ret + ", numLimit " + numLimit);
- @SuppressWarnings("unused")
- GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromNumParts(numLimit + 1);
+ //***
+ final int numRet = sk.getNumRetained();
+ println("NumRetained: " + numRet + " /2: " + (numRet / 2));
+ final double eps = sk.getNormalizedRankError(true);
+ printf("NormRankErr: %10.6f 1/eps: %10.3f" + LS, eps, 1/eps);
+ //***
+ //this should pass
+ final int goodNumPartsRequest = sk.getMaxPartitions();
+ println("Good numPartsRequest " + goodNumPartsRequest);
+ GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromNumParts(goodNumPartsRequest);
+ //this should fail
+ try {
+ final int badNumPartsRequest = goodNumPartsRequest + 1;
+ println("Bad numPartsRequest " + badNumPartsRequest);
+ gpb = sk.getPartitionBoundariesFromNumParts(badNumPartsRequest);
+ fail("Bad numPartsRequest should have failed. " + badNumPartsRequest);
+ } catch (SketchesArgumentException e) { } //OK
}
- @Test(expectedExceptions = SketchesArgumentException.class)
+ @SuppressWarnings("unused")
+ @Test //For visual check, set enablePrinting = true to view.
public void checkSketchPartitionLimits2() {
- final long totalN = 1_000_000;
+ final long totalN = 2000; //1_000_000;
final Comparator comparator = Comparator.naturalOrder();
final ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe();
- final KllItemsSketch sk = KllItemsSketch.newHeapInstance(comparator, serDe);
+ final int k = 1 << 15;
+ final KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, comparator, serDe);
final int d = digits(totalN);
for (int i = 1; i <= totalN; i++) {
sk.update(getString(i, d));
}
- final long sizeLimit= sk.getMinPartitionSizeItems();
+ final double eps = sk.getNormalizedRankError(true);
+ printf("NormRankErr: %10.6f 1/eps: %10.3f" + LS, eps, 1/eps);
+ println("N: " + sk.getN());
+ println("Max Parts: " + sk.getMaxPartitions());
- println("Min Size Limit: " + sizeLimit);
- @SuppressWarnings("unused")
- GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromPartSize(sizeLimit - 1);
+ //this should pass
+ final long goodPartSizeRequest= sk.getMinPartitionSizeItems();
+ println("Good partSizeRequest " + goodPartSizeRequest);
+ GenericPartitionBoundaries gpb = sk.getPartitionBoundariesFromPartSize(goodPartSizeRequest);
+ //this should fail
+ try {
+ final long badPartSizeRequest = goodPartSizeRequest - 1;
+ println("Bad partSizeRequest " + badPartSizeRequest);
+ } catch (SketchesArgumentException e) { } //OK
}
@Test