Skip to content
Permalink
Browse files
This fixes Mikhail's Bug: datasketches-java Issue #368.
In addition to the issue that Mikhail found, I found a number of other
discrepancies in the treatment of various corner cases in the Set
Operations.  Those were also fixed.
  • Loading branch information
leerho committed Oct 18, 2021
1 parent 4273a53 commit c89e0711418666801f4dcc5c39924ad656d97e5f
Show file tree
Hide file tree
Showing 6 changed files with 1,449 additions and 34 deletions.
@@ -78,6 +78,7 @@ public void setA(final Sketch skA) {

//process A
hashArr_ = getHashArrA(skA);
hashArr_ = (hashArr_ == null) ? new long[0] : hashArr_;
empty_ = false;
thetaLong_ = skA.getThetaLong();
curCount_ = hashArr_.length;
@@ -93,6 +94,7 @@ public void notB(final Sketch skB) {

//process B
hashArr_ = getResultHashArr(thetaLong_, curCount_, hashArr_, skB);
hashArr_ = (hashArr_ == null) ? new long[0] : hashArr_;
curCount_ = hashArr_.length;
empty_ = curCount_ == 0 && thetaLong_ == Long.MAX_VALUE;
}
@@ -119,17 +121,22 @@ public CompactSketch aNotB(final Sketch skA, final Sketch skB, final boolean dst
}
//Both skA & skB are not null

final long minThetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong());

if (skA.isEmpty()) { return skA.compact(dstOrdered, dstMem); }
//A is not Empty
checkSeedHashes(skA.getSeedHash(), seedHash_);

if (skB.isEmpty()) { return skA.compact(dstOrdered, dstMem); }
if (skB.isEmpty() && skB.getRetainedEntries() == 0) {
return skA.compact(dstOrdered, dstMem);
}
checkSeedHashes(skB.getSeedHash(), seedHash_);
//Both skA & skB are not empty

//process A
final long[] hashArrA = getHashArrA(skA);
final int countA = hashArrA.length;
final long minThetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong());
final int countA = (hashArrA == null) ? 0 : hashArrA.length;


//process B
final long[] hashArrOut = getResultHashArr(minThetaLong, countA, hashArrA, skB); //out is clone
@@ -37,7 +37,7 @@
*
* <p>The stateful operation is as follows:</p>
* <pre><code>
* AnotB anotb = SetOperationBuilder.buildAnotB();
* AnotB anotb = new AnotB();
*
* anotb.setA(Sketch skA); //The first argument.
* anotb.notB(Sketch skB); //The second (subtraction) argument.
@@ -49,7 +49,7 @@
*
* <p>The stateless operation is as follows:</p>
* <pre><code>
* AnotB anotb = SetOperationBuilder.buildAnotB();
* AnotB anotb = new AnotB();
*
* CompactSketch csk = anotb.aNotB(Sketch skA, Sketch skB);
* </code></pre>
@@ -95,17 +95,18 @@
* With a null as the first argument, we cannot know what the user's intent is.
* Since it is very likely that a <i>null</i> is a programming error, we throw a an exception.</p>
*
* <p>An enpty input argument will set the internal state to empty.</p>
* <p>An empty input argument will set the internal state to empty.</p>
*
* <p>Rationale: An empty set is a mathematically legal concept. Although it makes any subsequent,
* valid argument for B irrelvant, we must allow this and assume the user knows what they are
* valid argument for B irrelevant, we must allow this and assume the user knows what they are
* doing.</p>
*
* <p>Performing {@link #getResult(boolean)} just after this step will return a compact form of
* the given argument.</p>
*
* @param skA The incoming sketch for the first argument, <i>A</i>.
*/
@SuppressWarnings("unchecked")
public void setA(final Sketch<S> skA) {
if (skA == null) {
reset();
@@ -116,14 +117,23 @@ public void setA(final Sketch<S> skA) {
return;
}
//skA is not empty
empty_ = false;
thetaLong_ = skA.getThetaLong();

//process A
empty_ = false;
thetaLong_ = skA.getThetaLong();
final DataArrays<S> da = getDataArraysA(skA);

hashArr_ = da.hashArr;
summaryArr_ = da.summaryArr;
hashArr_ = (hashArr_ == null) ? new long[0] : hashArr_;
curCount_ = hashArr_.length;

summaryArr_ = da.summaryArr;
if (summaryArr_ == null) {
final SummaryFactory<S> sumFact = ((QuickSelectSketch<S>)skA).getSummaryFactory();
final S summary = sumFact.newSummary();
final Class<S> summaryType = (Class<S>)summary.getClass();
summaryArr_ = (S[]) Array.newInstance(summaryType, 0);
}
}

/**
@@ -133,7 +143,7 @@ public void setA(final Sketch<S> skA) {
*
* <p>An input argument of null or empty is ignored.</p>
*
* <p>Rationale: A <i>null</i> for the second or following arguments is more tollerable because
* <p>Rationale: A <i>null</i> for the second or following arguments is more tolerable because
* <i>A NOT null</i> is still <i>A</i> even if we don't know exactly what the null represents. It
* clearly does not have any content that overlaps with <i>A</i>. Also, because this can be part of
* a multistep operation with multiple <i>notB</i> steps. Other following steps can still produce
@@ -143,18 +153,28 @@ public void setA(final Sketch<S> skA) {
*
* @param skB The incoming Tuple sketch for the second (or following) argument <i>B</i>.
*/
@SuppressWarnings("unchecked")
public void notB(final Sketch<S> skB) {
if (empty_ || skB == null || skB.isEmpty() || hashArr_ == null) { return; }
if (empty_ || skB == null || skB.isEmpty()) { return; }
//skB is not empty
final long thetaLongB = skB.getThetaLong();
thetaLong_ = Math.min(thetaLong_, thetaLongB);

//process B
final DataArrays<S> daB = getResultArraysTuple(thetaLong_, curCount_, hashArr_, summaryArr_, skB);

hashArr_ = daB.hashArr;
hashArr_ = (hashArr_ == null) ? new long[0] : hashArr_;
curCount_ = hashArr_.length;

summaryArr_ = daB.summaryArr;
if (summaryArr_ == null) {
final SummaryFactory<S> sumFact = ((QuickSelectSketch<S>)skB).getSummaryFactory();
final S summary = sumFact.newSummary();
final Class<S> summaryType = (Class<S>)summary.getClass();
summaryArr_ = (S[]) Array.newInstance(summaryType, 0);
}

curCount_ = hashArr_.length;
empty_ = curCount_ == 0 && thetaLong_ == Long.MAX_VALUE;
}

@@ -167,7 +187,7 @@ public void notB(final Sketch<S> skB) {
*
* <p>An input argument of null or empty is ignored.</p>
*
* <p>Rationale: A <i>null</i> for the second or following arguments is more tollerable because
* <p>Rationale: A <i>null</i> for the second or following arguments is more tolerable because
* <i>A NOT null</i> is still <i>A</i> even if we don't know exactly what the null represents. It
* clearly does not have any content that overlaps with <i>A</i>. Also, because this can be part of
* a multistep operation with multiple <i>notB</i> steps. Other following steps can still produce
@@ -185,15 +205,18 @@ public void notB(final org.apache.datasketches.theta.Sketch skB) {

//process B
final DataArrays<S> daB = getResultArraysTheta(thetaLong_, curCount_, hashArr_, summaryArr_, skB);

hashArr_ = daB.hashArr;
summaryArr_ = daB.summaryArr;
hashArr_ = (hashArr_ == null) ? new long[0] : hashArr_;
curCount_ = hashArr_.length;

summaryArr_ = daB.summaryArr;
curCount_ = hashArr_.length;
empty_ = curCount_ == 0 && thetaLong_ == Long.MAX_VALUE;
}

/**
* Gets the result of the mutistep, stateful operation AnotB that have been executed with calls
* Gets the result of the multistep, stateful operation AnotB that have been executed with calls
* to {@link #setA(Sketch)} and ({@link #notB(Sketch)} or
* {@link #notB(org.apache.datasketches.theta.Sketch)}).
*
@@ -235,25 +258,40 @@ public CompactSketch<S> getResult(final boolean reset) {
* @param <S> Type of Summary
* @return the result as an unordered {@link CompactSketch}
*/
@SuppressWarnings("unchecked")
public static <S extends Summary>
CompactSketch<S> aNotB(final Sketch<S> skA, final Sketch<S> skB) {
if (skA == null || skB == null) {
throw new SketchesArgumentException("Neither argument may be null");
}
if (skA.getRetainedEntries() == 0) { return skA.compact(); }
if (skB.getRetainedEntries() == 0) { return skA.compact(); }
//Both skA & skB are not empty
//Both skA & skB are not null

final long minThetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong());

if (skA.isEmpty()) { return skA.compact(); }
if (skB.isEmpty() && skB.getRetainedEntries() == 0) { return skA.compact(); }
//Both skA & skB are not empty, and skB has valid entries

//Process A
final DataArrays<S> da = getDataArraysA(skA);
final long[] hashArrA = da.hashArr;
final S[] summaryArrA = da.summaryArr;
long[] hashArrA = da.hashArr;
hashArrA = (hashArrA == null) ? new long[0] : hashArrA;
final int countA = hashArrA.length;

S[] summaryArrA = da.summaryArr;
if (summaryArrA == null) {
final SummaryFactory<S> sumFact = ((QuickSelectSketch<S>)skA).getSummaryFactory();
final S summary = sumFact.newSummary();
final Class<S> summaryType = (Class<S>)summary.getClass();
summaryArrA = (S[]) Array.newInstance(summaryType, 0);
}

if (countA == 0) {
return new CompactSketch<S>(new long[0], summaryArrA, minThetaLong, false);
}

//Process B
final long minThetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong());
final DataArrays<S> daB = getResultArraysTuple(minThetaLong, countA, hashArrA, summaryArrA, skB);

final long[] hashArr = daB.hashArr;
final S[] summaryArr = daB.summaryArr;
final int curCountOut = hashArr.length;
@@ -287,26 +325,41 @@ CompactSketch<S> aNotB(final Sketch<S> skA, final Sketch<S> skB) {
* @param <S> Type of Summary
* @return the result as an unordered {@link CompactSketch}
*/
@SuppressWarnings("unchecked")
public static <S extends Summary>
CompactSketch<S> aNotB(final Sketch<S> skA, final org.apache.datasketches.theta.Sketch skB) {
if (skA == null || skB == null) {
throw new SketchesArgumentException("Neither argument may be null");
}
//Both skA & skB are not null

if (skA.getRetainedEntries() == 0) { return skA.compact(); }
if (skB.getRetainedEntries() == 0) { return skA.compact(); }
//Both skA & skB have valid retained entries, and are not empty
final long minThetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong());

if (skA.isEmpty()) { return skA.compact(); }
if (skB.isEmpty() && skB.getRetainedEntries() == 0) { return skA.compact(); }
//Both skA & skB are not empty, and skB has valid entries

//Process A
final DataArrays<S> da = getDataArraysA(skA);
final long[] hashArrA = da.hashArr;
final S[] summaryArrA = da.summaryArr;
long[] hashArrA = da.hashArr;
hashArrA = (hashArrA == null) ? new long[0] : hashArrA;
final int countA = hashArrA.length;

S[] summaryArrA = da.summaryArr;
if (summaryArrA == null) {
final SummaryFactory<S> sumFact = ((QuickSelectSketch<S>)skA).getSummaryFactory();
final S summary = sumFact.newSummary();
final Class<S> summaryType = (Class<S>)summary.getClass();
summaryArrA = (S[]) Array.newInstance(summaryType, 0);
}

if (countA == 0) {
return new CompactSketch<S>(new long[0], summaryArrA, minThetaLong, false);
}

//Process B
final long minThetaLong = Math.min(skA.getThetaLong(), skB.getThetaLong());
final DataArrays<S> daB = getResultArraysTheta(minThetaLong, countA, hashArrA, summaryArrA, skB);

final DataArrays<S> daB = getResultArraysTheta(minThetaLong, countA, hashArrA, summaryArrA, skB);
final long[] hashArr = daB.hashArr;
final S[] summaryArr = daB.summaryArr;
final int countOut = hashArr.length;
@@ -107,14 +107,19 @@ public void intersect(final Sketch<S> tupleSketch) {
if (tupleSketch == null) { throw new SketchesArgumentException("Sketch must not be null"); }
final boolean firstCall = firstCall_;
firstCall_ = false;
final boolean emptyIn = tupleSketch.isEmpty();
if (empty_ || emptyIn) { //empty rule
//Because of the definition of null above and the Empty Rule (which is OR), empty_ must be true.
//Whatever the current internal state, we make our local empty.
resetToEmpty();
return;
}

// input sketch could be first or next call
final long thetaLongIn = tupleSketch.getThetaLong();
final int countIn = tupleSketch.getRetainedEntries();
thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
// Empty rule extended in case incoming sketch does not have empty bit properly set
final boolean emptyIn = countIn == 0 && thetaLongIn == Long.MAX_VALUE;
empty_ |= emptyIn; //empty rule

if (countIn == 0) {
hashTables_.clear();
return;
@@ -274,12 +279,23 @@ public boolean hasResult() {
* Resets the internal set to the initial state, which represents the Universal Set
*/
public void reset() {
hardReset();
}

private void hardReset() {
empty_ = false;
thetaLong_ = Long.MAX_VALUE;
hashTables_.clear();
firstCall_ = true;
}

private void resetToEmpty() {
empty_ = true;
thetaLong_ = Long.MAX_VALUE;
hashTables_.clear();
firstCall_ = false;
}

static int getLgTableSize(final int count) {
final int tableSize = max(ceilingPowerOf2((int) ceil(count / 0.75)), 1 << MIN_LG_NOM_LONGS);
return Integer.numberOfTrailingZeros(tableSize);
@@ -292,7 +292,8 @@ public void reset() {
@SuppressWarnings("unchecked")
public CompactSketch<S> compact() {
if (getRetainedEntries() == 0) {
return new CompactSketch<>(null, null, thetaLong_, empty_);
if (empty_) { return new CompactSketch<>(null, null, Long.MAX_VALUE, true); }
return new CompactSketch<>(null, null, thetaLong_, false);
}
final long[] hashArr = new long[getRetainedEntries()];
final S[] summaryArr = (S[])

0 comments on commit c89e071

Please sign in to comment.