Skip to content
Permalink
Browse files
Code additions requested by Will.
Added 4 methods to Sketches class:

heapifyCompactSketch(Memory)
heapifyCompactSketch(Memory, long)
wrapCompactSketch(Memory)
wrapCompactSketch(Memory, long)

Changed the behavior of heapifySketch(2) and wrapSketch(2) calls
so that if the given image is a CompactSketch the behavior will
be the same as above.

The basic behavior change is that a seed is no longer required to
heapify or wrap a CompactSketch image.
But if it is given, it will be used to check the
hashSeed of the image, if there is one.

This push represents the main code changes.
More test code needs to be added.
  • Loading branch information
leerho committed Dec 22, 2021
1 parent 6d3c5db commit 64322b5a986c7f863fec24c0fe0e0726b0d7920b
Show file tree
Hide file tree
Showing 8 changed files with 571 additions and 324 deletions.
@@ -19,25 +19,239 @@

package org.apache.datasketches.theta;

import static org.apache.datasketches.Family.idToFamily;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem;

import org.apache.datasketches.Family;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;

/**
* The parent class of all the CompactSketches. CompactSketches are never created directly.
* They are created as a result of the compact() method of an UpdateSketch, a result of a
* getResult() of a SetOperation, or from a heapify method.
*
* <p>A CompactSketch is the simplest form of a Theta Sketch. It consists of a compact list
* (i.e., no intervening spaces) of hash values, which may be ordered or not, a value for theta
* and a seed hash. A CompactSketch is immutable (read-only),
* and the space required when stored is only the space required for the hash values and 8 to 24
* bytes of preamble. An empty CompactSketch consumes only 8 bytes.</p>
*
* @author Lee Rhodes
*/
public abstract class CompactSketch extends Sketch {

//Sketch

/**
* Heapify takes a CompactSketch image in Memory and instantiates an on-heap CompactSketch.
*
* <p>The resulting sketch will not retain any link to the source Memory.</p>
*
* <p>This method assumes that the sketch image was created with the correct hash seed,
* so it is not checked.</p>
*
* @param srcMem an image of a CompactSketch.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>.
* @return a CompactSketch on the heap.
*/
public static CompactSketch heapify(final Memory srcMem) {
  // Header bytes are masked so that they are interpreted as unsigned values.
  final int version = srcMem.getByte(SER_VER_BYTE) & 0XFF;
  final Family fam = Family.idToFamily(srcMem.getByte(FAMILY_BYTE) & 0XFF);
  if (fam != Family.COMPACT) {
    throw new IllegalArgumentException("Corrupted: " + fam + " is not Compact!");
  }

  if (version == 3) {
    // Current format: no seed was supplied, so no seed comparison is made on this path.
    final boolean ordered = (PreambleUtil.extractFlags(srcMem) & ORDERED_FLAG_MASK) != 0;
    return CompactOperations.memoryToCompact(srcMem, ordered, null);
  }

  // Any other version: hand the seed hash found in the image itself to the converter.
  final short imageSeedHash = (short) extractSeedHash(srcMem);
  switch (version) {
    case 1:
      return ForwardCompatibility.heapify1to3(srcMem, imageSeedHash);
    case 2:
      return ForwardCompatibility.heapify2to3(srcMem, imageSeedHash);
    default:
      throw new SketchesArgumentException("Unknown Serialization Version: " + version);
  }
}

/**
* Heapify takes a CompactSketch image in Memory and instantiates an on-heap CompactSketch.
*
* <p>The resulting sketch will not retain any link to the source Memory.</p>
*
* <p>This method checks if the given seed was used to create the source Memory image.
* However, SerialVersion 1 sketches cannot be checked.</p>
*
* @param srcMem an image of a CompactSketch that was created using the given seed.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>.
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
* @return a CompactSketch on the heap.
*/
public static CompactSketch heapify(final Memory srcMem, final long seed) {
  // Mask both header bytes so they are read as unsigned values, exactly as heapify(Memory)
  // and both wrap(...) variants do. Without the mask a byte >= 0x80 is sign-extended to a
  // negative int, which yields a misleading version number / family ID in the diagnostics.
  final int serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF;
  final int familyID = srcMem.getByte(FAMILY_BYTE) & 0XFF;
  final Family family = idToFamily(familyID);
  if (family != Family.COMPACT) {
    throw new IllegalArgumentException("Corrupted: " + family + " is not Compact!");
  }
  if (serVer == 3) {
    final int flags = PreambleUtil.extractFlags(srcMem);
    final boolean srcOrdered = (flags & ORDERED_FLAG_MASK) != 0;
    final boolean empty = (flags & EMPTY_FLAG_MASK) != 0;
    // The seed check is skipped when the image is flagged empty.
    if (!empty) { PreambleUtil.checkMemorySeedHash(srcMem, seed); }
    return CompactOperations.memoryToCompact(srcMem, srcOrdered, null);
  }
  //not SerVer 3, assume compact stored form
  // The hash of the caller's seed (not the one stored in the image) is passed to the converters.
  final short seedHash = Util.computeSeedHash(seed);
  if (serVer == 1) {
    return ForwardCompatibility.heapify1to3(srcMem, seedHash);
  }
  if (serVer == 2) {
    return ForwardCompatibility.heapify2to3(srcMem, seedHash);
  }
  throw new SketchesArgumentException("Unknown Serialization Version: " + serVer);
}

/**
* Wrap takes the CompactSketch image in given Memory and refers to it directly.
* There is no data copying onto the java heap.
* The wrap operation enables fast read-only merging and access to all the public read-only API.
*
* <p>Only "Direct" Serialization Version 3 (i.e., OpenSource) sketches that have
* been explicitly stored as direct sketches can be wrapped.
* Wrapping earlier serial version sketches will result in an on-heap CompactSketch
* where all data will be copied to the heap. These early versions were never designed to
* "wrap".</p>
*
* <p>Wrapping any subclass of this class that is empty or contains only a single item will
* result in on-heap equivalent forms of empty and single item sketch respectively.
* This is actually faster and consumes less overall memory.</p>
*
* <p>This method assumes that the sketch image was created with the
* correct hash seed, so it is not checked.</p>
*
* @param srcMem an image of a Sketch.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>.
* @return a CompactSketch backed by the given Memory, except as noted above.
*/
public static CompactSketch wrap(final Memory srcMem) {
  final int version = srcMem.getByte(SER_VER_BYTE) & 0XFF;
  final Family fam = Family.idToFamily(srcMem.getByte(FAMILY_BYTE) & 0XFF);
  if (fam != Family.COMPACT) {
    throw new IllegalArgumentException("Corrupted: " + fam + " is not Compact!");
  }
  // No seed is supplied, so the seed hash stored in the image is the one passed downstream.
  final short imageSeedHash = (short) extractSeedHash(srcMem);

  switch (version) {
    case 1:
      return ForwardCompatibility.heapify1to3(srcMem, imageSeedHash);
    case 2:
      return ForwardCompatibility.heapify2to3(srcMem, imageSeedHash);
    case 3:
      break; //handled below
    default:
      throw new SketchesArgumentException(
          "Corrupted: Serialization Version " + version + " not recognized.");
  }

  // Serialization version 3 from here on.
  if (PreambleUtil.isEmptyFlag(srcMem)) {
    return EmptyCompactSketch.getHeapInstance(srcMem);
  }
  if (otherCheckForSingleItem(srcMem)) {
    return SingleItemSketch.heapify(srcMem, imageSeedHash);
  }

  // Neither empty nor single item: both the compact and read-only flags are required.
  final int flagBits = srcMem.getByte(FLAGS_BYTE);
  if ((flagBits & COMPACT_FLAG_MASK) <= 0) {
    throw new SketchesArgumentException(
        "Corrupted: COMPACT family sketch image must have compact flag set");
  }
  if ((flagBits & READ_ONLY_FLAG_MASK) <= 0) {
    throw new SketchesArgumentException(
        "Corrupted: COMPACT family sketch image must have Read-Only flag set");
  }
  return DirectCompactSketch.wrapInstance(srcMem, imageSeedHash);
}

/**
* Wrap takes the sketch image in the given Memory and refers to it directly.
* There is no data copying onto the java heap.
* The wrap operation enables fast read-only merging and access to all the public read-only API.
*
* <p>Only "Direct" Serialization Version 3 (i.e., OpenSource) sketches that have
* been explicitly stored as direct sketches can be wrapped.
* Wrapping earlier serial version sketches will result in an on-heap CompactSketch
* where all data will be copied to the heap. These early versions were never designed to
* "wrap".</p>
*
* <p>Wrapping any subclass of this class that is empty or contains only a single item will
* result in on-heap equivalent forms of empty and single item sketch respectively.
* This is actually faster and consumes less overall memory.</p>
*
* <p>This method checks if the given seed was used to
* create the source Memory image. However, SerialVersion 1 sketches cannot be checked.</p>
*
* @param srcMem an image of a Sketch.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
* @return a CompactSketch backed by the given Memory except as above.
*/
public static CompactSketch wrap(final Memory srcMem, final long seed) {
  final int version = srcMem.getByte(SER_VER_BYTE) & 0XFF;
  final int famId = srcMem.getByte(FAMILY_BYTE) & 0XFF;
  final Family fam = Family.idToFamily(famId);
  if (fam != Family.COMPACT) {
    throw new IllegalArgumentException("Corrupted: " + fam + " is not Compact!");
  }
  // Computed up front, before any version-specific handling, from the caller's seed.
  final short expectedSeedHash = Util.computeSeedHash(seed);

  // Older serialization versions are converted onto the heap rather than wrapped.
  if (version == 1) {
    return ForwardCompatibility.heapify1to3(srcMem, expectedSeedHash);
  }
  if (version == 2) {
    return ForwardCompatibility.heapify2to3(srcMem, expectedSeedHash);
  }
  if (version != 3) {
    throw new SketchesArgumentException(
        "Corrupted: Serialization Version " + version + " not recognized.");
  }

  // Serialization version 3: empty and single-item images become on-heap instances.
  if (PreambleUtil.isEmptyFlag(srcMem)) {
    return EmptyCompactSketch.getHeapInstance(srcMem);
  }
  if (otherCheckForSingleItem(srcMem)) {
    return SingleItemSketch.heapify(srcMem, expectedSeedHash);
  }

  final int flagBits = srcMem.getByte(FLAGS_BYTE);
  final boolean isCompact = (flagBits & COMPACT_FLAG_MASK) > 0;
  final boolean isReadOnly = (flagBits & READ_ONLY_FLAG_MASK) > 0;
  if (!isCompact) {
    throw new SketchesArgumentException(
        "Corrupted: COMPACT family sketch image must have compact flag set");
  }
  if (!isReadOnly) {
    throw new SketchesArgumentException(
        "Corrupted: COMPACT family sketch image must have Read-Only flag set");
  }
  return DirectCompactSketch.wrapInstance(srcMem, expectedSeedHash);
}


//Sketch Overrides

// Abstract re-declaration of compact(...) whose declared return type is CompactSketch;
// it has no body here, so each concrete subclass must supply the implementation.
// NOTE(review): presumably dstOrdered selects ordered output and dstMem is an optional
// destination Memory — confirm against the parent Sketch declaration.
@Override
public abstract CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem);
@@ -19,10 +19,10 @@

package org.apache.datasketches.theta;

import static org.apache.datasketches.Util.checkSeedHashes;
import static org.apache.datasketches.theta.CompactOperations.checkIllegalCurCountAndEmpty;
import static org.apache.datasketches.theta.CompactOperations.memoryToCompact;
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
@@ -60,16 +60,16 @@ class DirectCompactSketch extends CompactSketch {
* Wraps the given Memory, which must be a SerVer 3, ordered, CompactSketch image.
* Must check the validity of the Memory before calling. The order bit must be set properly.
* @param srcMem <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed The update seed.
* <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
* @param seedHash The update seedHash.
* <a href="{@docRoot}/resources/dictionary.html#seedHash">See Seed Hash</a>.
* @return this sketch
*/
// NOTE(review): two method headers and two first statements for wrapInstance appear back to
// back below: a (Memory, long seed) form that calls checkMemorySeedHash, and a
// (Memory, short seedHash) form that calls checkSeedHashes on the seed hash extracted from
// srcMem. Presumably the first pair is the superseded version left over from the diff view and
// only the seedHash form is meant to remain — confirm before compiling.
// Either way, a seed-hash check helper is called before the DirectCompactSketch is constructed.
static DirectCompactSketch wrapInstance(final Memory srcMem, final long seed) {
checkMemorySeedHash(srcMem, seed);
static DirectCompactSketch wrapInstance(final Memory srcMem, final short seedHash) {
checkSeedHashes((short) extractSeedHash(srcMem), seedHash);
return new DirectCompactSketch(srcMem);
}

//Sketch
//Sketch Overrides

@Override
public CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem) {
@@ -19,14 +19,12 @@

package org.apache.datasketches.theta;

import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;

import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;
import org.apache.datasketches.memory.Memory;

/**
@@ -38,25 +36,6 @@
*/
final class ForwardCompatibility {

/**
* Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch.
* Note: SerVer 1 sketches always have (metadata) preamble-longs of 3 and are always stored
* in a compact ordered form, but with 3 different sketch types. All SerVer 1 sketches will
* be converted to a SerVer 3 sketches. There is no concept of p-sampling, no empty bit.
*
* @param srcMem the image of a SerVer 1 sketch
*
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
* The seed used for building the sketch image in srcMem.
* Note: SerVer 1 sketches do not have the concept of the SeedHash, so the seed provided here
* MUST be the actual seed that was used when the SerVer 1 sketches were built.
* @return a SerVer 3 {@link CompactSketch}.
*/
static final CompactSketch heapify1to3(final Memory srcMem, final long seed) {
  // Convert the seed to its seed hash and delegate to the seedHash overload.
  return heapify1to3(srcMem, Util.computeSeedHash(seed));
}

/**
* Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch.
* Note: SerVer 1 sketches always have (metadata) preamble-longs of 3 and are always stored
@@ -108,12 +87,11 @@ static final CompactSketch heapify1to3(final Memory srcMem, final short seedHash
* Note: SerVer 2 sketches can have metadata-longs of 1,2 or 3 and are always stored
* in a compact ordered form (not as a hash table), but with 4 different sketch types.
* @param srcMem the image of a SerVer 2 sketch
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
* @param seedHash <a href="{@docRoot}/resources/dictionary.html#seedHash">See Seed Hash</a>.
* The seed used for building the sketch image in srcMem
* @return a SerVer 3 HeapCompactOrderedSketch
*/
static final CompactSketch heapify2to3(final Memory srcMem, final long seed) {
final short seedHash = checkMemorySeedHash(srcMem, seed);
static final CompactSketch heapify2to3(final Memory srcMem, final short seedHash) {
final int memCap = (int) srcMem.getCapacity();
final int preLongs = extractPreLongs(srcMem); //1,2 or 3
final int familyId = extractFamilyID(srcMem); //1,2,3,4
@@ -139,7 +117,7 @@ static final CompactSketch heapify2to3(final Memory srcMem, final long seed) {
reqBytesIn = (preLongs + 1) << 3;
validateInputSize(reqBytesIn, memCap);
final long hash = srcMem.getLong(preLongs << 3);
return new SingleItemSketch(hash, seed);
return new SingleItemSketch(hash, seedHash);
}
//curCount > 1
reqBytesIn = (curCount + preLongs) << 3;
@@ -160,7 +138,7 @@ static final CompactSketch heapify2to3(final Memory srcMem, final long seed) {
reqBytesIn = (preLongs + 1) << 3;
validateInputSize(reqBytesIn, memCap);
final long hash = srcMem.getLong(preLongs << 3);
return new SingleItemSketch(hash, seed);
return new SingleItemSketch(hash, seedHash);
}
//curCount > 1 and/or theta < 1.0
reqBytesIn = (curCount + preLongs) << 3;
@@ -153,7 +153,7 @@ short getSeedHash() {
return seedHash_;
}

//use of Memory is convenient. The byteArray and Memory are loaded simulaneously.
//use of Memory is convenient. The byteArray and Memory are loaded simultaneously.
@Override
public byte[] toByteArray() {
final int bytes = getCurrentBytes();
@@ -22,13 +22,15 @@
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.ByteArrayUtil.putLongLE;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
import static org.apache.datasketches.Util.checkSeedHashes;
import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;

import org.apache.datasketches.Family;
@@ -99,6 +101,21 @@ public static SingleItemSketch heapify(final Memory srcMem, final long seed) {
throw new SketchesArgumentException("Input Memory is not a SingleItemSketch.");
}

/**
* Creates a SingleItemSketch on the heap given a SingleItemSketch Memory image and a seedHash.
* Checks the seed hash of the given Memory against the given seedHash.
* @param srcMem the Memory to be heapified.
* @param seedHash a given seedHash
* @return a SingleItemSketch
*/ //does not override Sketch
public static SingleItemSketch heapify(final Memory srcMem, final short seedHash) {
  // Compare the seed hash stored in the image with the one supplied by the caller first.
  final short imageSeedHash = (short) extractSeedHash(srcMem);
  checkSeedHashes(imageSeedHash, seedHash);

  if (!otherCheckForSingleItem(srcMem)) {
    throw new SketchesArgumentException("Input Memory is not a SingleItemSketch.");
  }
  // The single hash value is read from byte offset 8 of the image.
  final long hash = srcMem.getLong(8);
  return new SingleItemSketch(hash, seedHash);
}


@Override
public CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem) {
if (dstMem == null) { return this; }

0 comments on commit 64322b5

Please sign in to comment.