Skip to content

Commit

Permalink
Merge pull request #221 from apache/kll_max_serialized_size
Browse files Browse the repository at this point in the history
max serialized size
  • Loading branch information
AlexanderSaydakov committed Jun 15, 2021
2 parents 61b9d4e + cf0e660 commit 5cba443
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 0 deletions.
27 changes: 27 additions & 0 deletions kll/include/kll_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,33 @@ class kll_sketch {
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
size_t get_serialized_size_bytes() const;

/**
* Returns an upper bound, in bytes, on the serialized size of a sketch given the parameter
* <em>k</em> and the stream length <em>n</em>. The result is deliberately an overestimate, so
* that actual sketches don't exceed it.
* This method can be used when storage has to be allocated beforehand; the size obtained this
* way is not optimal (it may be larger than what a particular sketch really needs).
* This is a static method, so no sketch instance is required to call it.
* This overload is available for arithmetic item types only (integral and floating point).
* @param k parameter that controls size of the sketch and accuracy of estimates
* @param n stream length
* @return upper bound on the serialized size in bytes
*/
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);

/**
* Returns an upper bound, in bytes, on the serialized size of a sketch given the parameter
* <em>k</em> and the stream length <em>n</em>. The result is deliberately an overestimate, so
* that actual sketches don't exceed it.
* This method can be used when storage has to be allocated beforehand; the size obtained this
* way is not optimal (it may be larger than what a particular sketch really needs).
* This is a static method, so no sketch instance is required to call it.
* This overload is available for all non-arithmetic item types. Their size is not known from
* the type alone, so the caller supplies the maximum size of an item.
* @param k parameter that controls size of the sketch and accuracy of estimates
* @param n stream length
* @param max_item_size_bytes maximum size of a single item in bytes
* @return upper bound on the serialized size in bytes
*/
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);

/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
Expand Down
20 changes: 20 additions & 0 deletions kll/include/kll_sketch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,26 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
return size;
}

// Upper bound on the serialized size for fixed-size arithmetic item types
// (integral and floating point): each item is counted as sizeof(TT) bytes.
template<typename T, typename C, typename S, typename A>
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
  const uint8_t levels_bound = kll_helper::ub_on_num_levels(n);
  const uint32_t capacity_bound = kll_helper::compute_total_capacity(k, DEFAULT_M, levels_bound);

  // only levels_bound integers are counted for the levels_ array: its last integer
  // is not serialized because it can be derived
  const size_t levels_bytes = levels_bound * sizeof(uint32_t);

  // two item slots on top of the retained items
  // (presumably the min and max values -- confirm against serialize())
  const auto item_slots = capacity_bound + 2;
  const size_t items_bytes = item_slots * sizeof(TT);

  return DATA_START + levels_bytes + items_bytes;
}

// Upper bound on the serialized size for all other (non-arithmetic) item types:
// each item is counted as max_item_size_bytes, which the caller provides.
template<typename T, typename C, typename S, typename A>
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
  const uint8_t levels_bound = kll_helper::ub_on_num_levels(n);
  const uint32_t capacity_bound = kll_helper::compute_total_capacity(k, DEFAULT_M, levels_bound);

  // only levels_bound integers are counted for the levels_ array: its last integer
  // is not serialized because it can be derived
  const size_t levels_bytes = levels_bound * sizeof(uint32_t);

  // two item slots on top of the retained items
  // (presumably the min and max values -- confirm against serialize())
  const auto item_slots = capacity_bound + 2;
  const size_t items_bytes = item_slots * max_item_size_bytes;

  return DATA_START + levels_bytes + items_bytes;
}

template<typename T, typename C, typename S, typename A>
void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
const bool is_single_item = n_ == 1;
Expand Down
16 changes: 16 additions & 0 deletions kll/test/kll_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,22 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
}
}

SECTION("max serialized size arithmetic type") {
// Pins the upper bound reported by the static two-argument overload for float items
// with k = 200 at increasing stream lengths n; the expected byte counts grow with n.
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
}

SECTION("max serialized size non-arithmetic type") {
// Pins the upper bound reported by the static three-argument overload for std::string
// items with k = 200 and a maximum item size of 4 bytes, at increasing stream lengths n.
// The expected values are the same numbers as those pinned for kll_sketch<float>
// (consistent with a 4-byte float).
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
}

// cleanup
if (test_allocator_total_bytes != 0) {
REQUIRE(test_allocator_total_bytes == 0);
Expand Down

0 comments on commit 5cba443

Please sign in to comment.