Skip to content

Commit

Permalink
Merge pull request #222 from apache/cpc_max_serialized_size
Browse files Browse the repository at this point in the history
CPC max serialized size bytes
  • Loading branch information
AlexanderSaydakov authored Jun 22, 2021
2 parents 043b947 + 7dea7c7 commit f17db17
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 3 deletions.
13 changes: 13 additions & 0 deletions cpc/include/cpc_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,17 @@ class cpc_sketch_alloc {
*/
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());

/**
* The actual size of a compressed CPC sketch has a small random variance, but the following
* empirically measured size should be large enough for at least 99.9 percent of sketches.
*
* <p>For small values of <i>n</i> the size can be much smaller.
*
* @param lg_k the given value of lg_k.
* @return the estimated maximum compressed serialized size of a sketch.
*/
static size_t get_max_serialized_size_bytes(uint8_t lg_k);

// for internal use
uint32_t get_num_coupons() const;

Expand Down Expand Up @@ -303,6 +314,8 @@ class cpc_sketch_alloc {
inline void write_hip(std::ostream& os) const;
inline size_t copy_hip_to_mem(void* dst) const;

static void check_lg_k(uint8_t lg_k);

friend cpc_compressor<A>;
friend cpc_union_alloc<A>;
};
Expand Down
47 changes: 44 additions & 3 deletions cpc/include/cpc_sketch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,7 @@ first_interesting_column(0),
kxp(1 << lg_k),
hip_est_accum(0)
{
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
}
check_lg_k(lg_k);
}

template<typename A>
Expand Down Expand Up @@ -681,6 +679,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
}

/*
* These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
* trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
* spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
* worst-case space for the preamble, which is added by the function.
*/
static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
24, // lg_k = 4
36, // lg_k = 5
56, // lg_k = 6
100, // lg_k = 7
180, // lg_k = 8
344, // lg_k = 9
660, // lg_k = 10
1292, // lg_k = 11
2540, // lg_k = 12
5020, // lg_k = 13
9968, // lg_k = 14
19836, // lg_k = 15
39532, // lg_k = 16
78880, // lg_k = 17
157516, // lg_k = 18
314656 // lg_k = 19
};
static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;

template<typename A>
size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
check_lg_k(lg_k);
if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
const uint32_t k = 1 << lg_k;
return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
}

template<typename A>
void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
}
}

template<typename A>
uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
return num_coupons;
Expand Down
5 changes: 5 additions & 0 deletions cpc/test/cpc_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
}

TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
}

} /* namespace datasketches */

0 comments on commit f17db17

Please sign in to comment.