Skip to content
Permalink
Browse files
Merge pull request #388 from apache/getUnionSize
Get union size
  • Loading branch information
leerho committed Feb 16, 2022
2 parents 4b993a9 + 7c2c57d commit 402a36a96c8e94220f0af3caf8c83469aa759acc
Showing 9 changed files with 201 additions and 28 deletions.
@@ -26,6 +26,7 @@
import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
import static org.apache.datasketches.Util.TAB;
import static org.apache.datasketches.Util.ceilingPowerOf2;
import static org.apache.datasketches.Util.checkNomLongs;

import org.apache.datasketches.Family;
import org.apache.datasketches.ResizeFactor;
@@ -66,8 +67,8 @@ public SetOperationBuilder() {

/**
* Sets the Maximum Nominal Entries (max K) for this set operation. The effective value of K of the result of a
* Set Operation can be less than max K, but never greater.
* The minimum value is 16 and the maximum value is 67,108,864, which is 2^26.
* Set Operation can be less than max K, but never greater.
* The minimum value is 16 and the maximum value is 67,108,864, which is 2^26.
* @param nomEntries <a href="{@docRoot}/resources/dictionary.html#nomEntries">Nominal Entries</a>
* This will become the ceiling power of 2 if it is not a power of 2.
* @return this SetOperationBuilder
@@ -81,6 +82,20 @@ public SetOperationBuilder setNominalEntries(final int nomEntries) {
return this;
}

/**
 * Alternative method of setting the Nominal Entries for this set operation from the log_base2 value.
 * The minimum value is 4 and the maximum value is 26.
 * Be aware that set operations as large as this maximum value may not have been
 * thoroughly characterized for performance.
 *
 * <p>The range check itself is delegated to {@code checkNomLongs}; this method only makes sure
 * that an out-of-range exponent cannot wrap around into a value that would pass that check.
 *
 * @param lgNomEntries the log_base2 Nominal Entries.
 * @return this SetOperationBuilder
 */
public SetOperationBuilder setLogNominalEntries(final int lgNomEntries) {
  // An int shift uses only the low 5 bits of the distance, so (1 << 36) and (1 << -28) both
  // equal (1 << 4) and would be accepted as a legal size. Clamp the exponent to [0, 30] first:
  // legal inputs (4..26) are untouched, while illegal ones become 1 or 2^30 and stay illegal.
  // NOTE(review): assumes checkNomLongs rejects values outside the documented 2^4..2^26 range.
  final int shift = Math.max(0, Math.min(lgNomEntries, Integer.SIZE - 2));
  bLgNomLongs = checkNomLongs(1 << shift);
  return this;
}

/**
* Returns Log-base 2 Nominal Entries
* @return Log-base 2 Nominal Entries
@@ -301,7 +301,7 @@ public static int getMaxCompactSketchBytes(final int numberOfEntries) {
/**
* Returns the maximum number of storage bytes required for an UpdateSketch with the given
* number of nominal entries (power of 2).
* @param nomEntries <a href="{@docRoot}/resources/dictionary.html#nomEntries">Nominal Entres</a>
* @param nomEntries <a href="{@docRoot}/resources/dictionary.html#nomEntries">Nominal Entries</a>
* This will become the ceiling power of 2 if it is not.
* @return the maximum number of storage bytes required for an UpdateSketch with the given
* nomEntries
@@ -31,11 +31,25 @@
*/
public abstract class Union extends SetOperation {


/**
* Returns the number of storage bytes required for this union in its current state.
*
* @return the number of storage bytes required for this union in its current state.
*/
public abstract int getCurrentBytes();

/**
 * Reports the family of this set operation, which is always {@code Family.UNION}.
 *
 * @return {@code Family.UNION}
 */
@Override
public Family getFamily() {
return Family.UNION;
}

/**
* Returns the maximum required storage bytes for this union.
* @return the maximum required storage bytes for this union.
*/
public abstract int getMaxUnionBytes();

/**
* Gets the result of this operation as an ordered CompactSketch on the Java heap.
* This does not disturb the underlying data structure of the union.
@@ -216,9 +216,14 @@ static UnionImpl wrapInstance(final WritableMemory srcMem, final long expectedSe
}

@Override
public boolean isSameResource(final Memory that) {
return gadget_ instanceof DirectQuickSelectSketchR
? gadget_.getMemory().isSameResource(that) : false;
public int getCurrentBytes() {
return gadget_.getCurrentBytes();
}

/**
 * Derives the maximum storage size from the gadget's log-base2 nominal size: 16 shifted left
 * by that log value, plus the Union family's maximum preamble longs converted to bytes.
 */
@Override
public int getMaxUnionBytes() {
  final int tableBytes = 16 << gadget_.getLgNomLongs();
  final int preambleBytes = Family.UNION.getMaxPreLongs() << 3; // longs to bytes (x8)
  return tableBytes + preambleBytes;
}

@Override
@@ -256,6 +261,12 @@ public CompactSketch getResult(final boolean dstOrdered, final WritableMemory ds
minThetaLong, curCountOut, seedHash, empty, true, dstOrdered, dstOrdered, dstMem, compactCacheOut);
}

/**
 * Returns true only when the gadget is a {@code DirectQuickSelectSketchR} and its Memory
 * reports that it is the same resource as the given Memory.
 */
@Override
public boolean isSameResource(final Memory that) {
  // Any other gadget type is never considered to share the caller's resource.
  if (!(gadget_ instanceof DirectQuickSelectSketchR)) {
    return false;
  }
  return gadget_.getMemory().isSameResource(that);
}

@Override
public void reset() {
gadget_.reset();
@@ -113,7 +113,7 @@ public UpdateSketchBuilder setNominalEntries(final int nomEntries) {
* This value is also used for building a shared concurrent sketch.
* The minimum value is 4 and the maximum value is 26.
* Be aware that sketches as large as this maximum value may not have been
* thoroughly tested or characterized for performance.
* thoroughly characterized for performance.
*
* @param lgNomEntries the Log Nominal Entries. Also for the concurrent shared sketch
* @return this UpdateSketchBuilder
@@ -38,7 +38,7 @@
public class MurmurHash3v2Test {
private Random rand = new Random();
private static final int trials = 1 << 20;

@Test
public void compareLongArrLong() { //long[]
int arrLen = 3;
@@ -54,7 +54,7 @@ public void compareLongArrLong() { //long[]
assertEquals(res2, res1);
}
}

@Test
public void compareIntArr() { //int[]
int bytes = Integer.BYTES;
@@ -63,7 +63,7 @@ public void compareIntArr() { //int[]
int iPer = 8 / bytes;
int nLongs = arrLen / iPer;
int shift = 64 / iPer;

for (int i = 0; i < trials; i++) { //trials
for (int j = 0; j < nLongs; j++) { //longs
long r = rand.nextLong();
@@ -77,7 +77,7 @@ public void compareIntArr() { //int[]
assertEquals(res2, res1);
}
}

@Test
public void compareCharArr() { //char[]
int bytes = Character.BYTES;
@@ -86,7 +86,7 @@ public void compareCharArr() { //char[]
int iPer = 8 / bytes;
int nLongs = arrLen / iPer;
int shift = 64 / iPer;

for (int i = 0; i < trials; i++) { //trials
for (int j = 0; j < nLongs; j++) { //longs
long r = rand.nextLong();
@@ -100,7 +100,7 @@ public void compareCharArr() { //char[]
assertEquals(res2, res1);
}
}

@Test
public void compareByteArr() { //byte[]
int bytes = Byte.BYTES;
@@ -109,7 +109,7 @@ public void compareByteArr() { //byte[]
int iPer = 8 / bytes;
int nLongs = arrLen / iPer;
int shift = 64 / iPer;

for (int i = 0; i < trials; i++) { //trials
for (int j = 0; j < nLongs; j++) { //longs
long r = rand.nextLong();
@@ -123,7 +123,7 @@ public void compareByteArr() { //byte[]
assertEquals(res2, res1);
}
}

@Test
public void compareLongVsLongArr() {
int arrLen = 1;
@@ -137,55 +137,55 @@ public void compareLongVsLongArr() {
assertEquals(res2, res1);
}
}

// V1 reference hashes: each overload forwards its array and seed to MurmurHash3.hash.

private static long[] hashV1(long[] key, long seed) {
  return MurmurHash3.hash(key, seed);
}

private static long[] hashV1(int[] key, long seed) {
  return MurmurHash3.hash(key, seed);
}

private static long[] hashV1(char[] key, long seed) {
  return MurmurHash3.hash(key, seed);
}

private static long[] hashV1(byte[] key, long seed) {
  return MurmurHash3.hash(key, seed);
}

// V2 hashes under test: each overload forwards its array and seed to MurmurHash3v2.hash.

private static long[] hashV2(long[] key, long seed) {
  return MurmurHash3v2.hash(key, seed);
}

private static long[] hashV2(int[] key, long seed) {
  return MurmurHash3v2.hash(key, seed);
}

private static long[] hashV2(char[] key, long seed) {
  return MurmurHash3v2.hash(key, seed);
}

private static long[] hashV2(byte[] key, long seed) {
  return MurmurHash3v2.hash(key, seed);
}

//V2 single primitives

// Single-long variant: forwards the key, seed and caller-supplied output array to MurmurHash3v2.hash.
private static long[] hashV2(long key, long seed, long[] out) {
  return MurmurHash3v2.hash(key, seed, out);
}

// private static final long[] hashV2(double key, long seed, long[] out) {
// return MurmurHash3v2.hash(key, seed, out);
// }

// private static final long[] hashV2(String key, long seed, long[] out) {
// return MurmurHash3v2.hash(key, seed, out);
// }



@Test
public void offsetChecks() {
long seed = 12345;
@@ -20,7 +20,12 @@
package org.apache.datasketches.kll;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;

import java.util.Objects;

import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.WritableMemory;
import org.testng.annotations.Test;

/**
@@ -57,6 +62,63 @@ public void checkBounds() {
println("LB : " + lb);
}

@Test(expectedExceptions = SketchesArgumentException.class)
public void checkHeapifyExceptions1() {
  // Serialize a freshly constructed sketch, damage byte 6, and expect heapify to throw.
  final byte[] image = new KllDoublesSketch().toByteArray();
  final WritableMemory corrupted = WritableMemory.writableWrap(image);
  corrupted.putByte(6, (byte) 4); //corrupt M
  KllDoublesSketch.heapify(corrupted);
}

@Test(expectedExceptions = SketchesArgumentException.class)
public void checkHeapifyExceptions2() {
  // Serialize a freshly constructed sketch, damage byte 0, and expect heapify to throw.
  final byte[] image = new KllDoublesSketch().toByteArray();
  final WritableMemory corrupted = WritableMemory.writableWrap(image);
  corrupted.putByte(0, (byte) 1); //corrupt preamble ints, should be 2
  KllDoublesSketch.heapify(corrupted);
}

@Test(expectedExceptions = SketchesArgumentException.class)
public void checkHeapifyExceptions3() {
  // Same as the preamble-ints check, but on a sketch that has received two updates first.
  final KllDoublesSketch sketch = new KllDoublesSketch();
  sketch.update(1.0f);
  sketch.update(2.0f);
  final byte[] image = sketch.toByteArray();
  final WritableMemory corrupted = WritableMemory.writableWrap(image);
  corrupted.putByte(0, (byte) 1); //corrupt preamble ints, should be 5
  KllDoublesSketch.heapify(corrupted);
}

@Test(expectedExceptions = SketchesArgumentException.class)
public void checkHeapifyExceptions4() {
  // Serialize a freshly constructed sketch, damage byte 1, and expect heapify to throw.
  final byte[] image = new KllDoublesSketch().toByteArray();
  final WritableMemory corrupted = WritableMemory.writableWrap(image);
  corrupted.putByte(1, (byte) 0); //corrupt SerVer, should be 1 or 2
  KllDoublesSketch.heapify(corrupted);
}

@Test(expectedExceptions = SketchesArgumentException.class)
public void checkHeapifyExceptions5() {
  // Serialize a freshly constructed sketch, damage byte 2, and expect heapify to throw.
  final byte[] image = new KllDoublesSketch().toByteArray();
  final WritableMemory corrupted = WritableMemory.writableWrap(image);
  corrupted.putByte(2, (byte) 0); //corrupt FamilyID, should be 15
  KllDoublesSketch.heapify(corrupted);
}

@Test
public void checkMisc() {
  final KllDoublesSketch sketch = new KllDoublesSketch(8, true);

  // Before any update, asking for quantiles is expected to yield null.
  final double[] emptyQuantiles = sketch.getQuantiles(10);
  assertTrue(Objects.isNull(emptyQuantiles));
  sketch.toString(true, true);

  // Feed the values 0..19, then exercise the string and byte-array outputs again.
  for (int value = 0; value < 20; value++) {
    sketch.update(value);
  }
  sketch.toString(true, true);
  sketch.toByteArray();

  // Check the sizes of the internal arrays and the level count after the 20 updates.
  final double[] retainedItems = sketch.getItems();
  assertEquals(retainedItems.length, 16);
  final int[] levelOffsets = sketch.getLevels();
  assertEquals(levelOffsets.length, 3);
  assertEquals(sketch.getNumLevels(), 2);
}

//@Test //requires visual check
public void checkNumRetainedAboveLevelZero() {
final KllDoublesSketch sketch = new KllDoublesSketch(20);

0 comments on commit 402a36a

Please sign in to comment.