Skip to content

Commit

Permalink
Merge pull request #390 from apache/java_serde_compat_testing
Browse files Browse the repository at this point in the history
Test serialization compatibility with Java
  • Loading branch information
AlexanderSaydakov committed Aug 14, 2023
2 parents 7f3c659 + b1f6694 commit 2c2b89e
Show file tree
Hide file tree
Showing 15 changed files with 189 additions and 83 deletions.
4 changes: 3 additions & 1 deletion fi/include/frequent_items_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,9 @@ class frequent_items_sketch {
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
static constexpr double EPSILON_FACTOR = 3.5;
enum flags { IS_EMPTY };
// Due to a historical mistake, C++ and Java used different bits to indicate an empty sketch.
// Therefore both bits are set when serializing, and either bit is accepted when deserializing,
// for compatibility with the historical binary format.
enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 };
W total_weight;
W offset;
reverse_purge_hash_map<T, W, H, E, A> map;
Expand Down
10 changes: 6 additions & 4 deletions fi/include/frequent_items_sketch_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ void frequent_items_sketch<T, W, H, E, A>::serialize(std::ostream& os, const Ser
const uint8_t lg_cur_size = map.get_lg_cur_size();
write(os, lg_cur_size);
const uint8_t flags_byte(
(is_empty() ? 1 << flags::IS_EMPTY : 0)
(is_empty() ? 1 << flags::IS_EMPTY_1 : 0)
| (is_empty() ? 1 << flags::IS_EMPTY_2 : 0)
);
write(os, flags_byte);
const uint16_t unused16 = 0;
Expand Down Expand Up @@ -234,7 +235,8 @@ auto frequent_items_sketch<T, W, H, E, A>::serialize(unsigned header_size_bytes,
const uint8_t lg_cur_size = map.get_lg_cur_size();
ptr += copy_to_mem(lg_cur_size, ptr);
const uint8_t flags_byte(
(is_empty() ? 1 << flags::IS_EMPTY : 0)
(is_empty() ? 1 << flags::IS_EMPTY_1 : 0)
| (is_empty() ? 1 << flags::IS_EMPTY_2 : 0)
);
ptr += copy_to_mem(flags_byte, ptr);
ptr += sizeof(uint16_t); // unused
Expand Down Expand Up @@ -298,7 +300,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
const auto flags_byte = read<uint8_t>(is);
read<uint16_t>(is); // unused

const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY_1)) | (flags_byte & (1 << flags::IS_EMPTY_2));

check_preamble_longs(preamble_longs, is_empty);
check_serial_version(serial_version);
Expand Down Expand Up @@ -352,7 +354,7 @@ frequent_items_sketch<T, W, H, E, A> frequent_items_sketch<T, W, H, E, A>::deser
ptr += copy_from_mem(ptr, flags_byte);
ptr += sizeof(uint16_t); // unused

const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY_1)) | (flags_byte & (1 << flags::IS_EMPTY_2));

check_preamble_longs(preamble_longs, is_empty);
check_serial_version(serial_version);
Expand Down
7 changes: 7 additions & 0 deletions fi/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,10 @@ target_sources(fi_test
frequent_items_sketch_test.cpp
frequent_items_sketch_custom_type_test.cpp
)

# Add the Java deserialization test to fi_test only when SERDE_COMPAT evaluates to true.
if (SERDE_COMPAT)
  target_sources(fi_test PRIVATE frequent_items_sketch_deserialize_from_java_test.cpp)
endif()
95 changes: 95 additions & 0 deletions fi/test/frequent_items_sketch_deserialize_from_java_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <fstream>
#include <frequent_items_sketch.hpp>

namespace datasketches {

// Directory holding the binary sketches read by the tests in this file.
// The files are assumed to have been generated by datasketches-java code and placed
// in the subdirectory called "java" in the root directory of this project.
// Declared const because the tests only ever read this path.
static const std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";

TEST_CASE("frequent longs", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "frequent_long_n" + std::to_string(n) + ".sk", std::ios::binary);
auto sketch = frequent_items_sketch<int64_t>::deserialize(is);
REQUIRE(sketch.is_empty() == (n == 0));
if (n > 10) {
REQUIRE(sketch.get_maximum_error() > 0);
} else {
REQUIRE(sketch.get_maximum_error() == 0);
}
REQUIRE(sketch.get_total_weight() == n);
}
}

TEST_CASE("frequent strings", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "frequent_string_n" + std::to_string(n) + ".sk", std::ios::binary);
auto sketch = frequent_items_sketch<std::string>::deserialize(is);
REQUIRE(sketch.is_empty() == (n == 0));
if (n > 10) {
REQUIRE(sketch.get_maximum_error() > 0);
} else {
REQUIRE(sketch.get_maximum_error() == 0);
}
REQUIRE(sketch.get_total_weight() == n);
}
}

// Reads "frequent_string_ascii.sk" and verifies the deserialized string sketch:
// non-empty, zero maximum error, total weight 10 and estimates 1..4 for the four items.
TEST_CASE("frequent strings ascii", "[serde_compat]") {
  const std::string path = testBinaryInputPath + "frequent_string_ascii.sk";
  std::ifstream input;
  // enable exceptions first so a failed open throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  auto sketch = frequent_items_sketch<std::string>::deserialize(input);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_maximum_error() == 0);
  REQUIRE(sketch.get_total_weight() == 10);

  REQUIRE(sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 1);
  REQUIRE(sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") == 2);
  REQUIRE(sketch.get_estimate("ccccccccccccccccccccccccccccc") == 3);
  REQUIRE(sketch.get_estimate("ddddddddddddddddddddddddddddd") == 4);
}

// Reads "frequent_string_utf8.sk" and verifies the deserialized string sketch:
// non-empty, zero maximum error, total weight 28 and estimates 1..7 for the
// seven non-ASCII items.
TEST_CASE("frequent strings utf8", "[serde_compat]") {
  const std::string path = testBinaryInputPath + "frequent_string_utf8.sk";
  std::ifstream input;
  // enable exceptions first so a failed open throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  auto sketch = frequent_items_sketch<std::string>::deserialize(input);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_maximum_error() == 0);
  REQUIRE(sketch.get_total_weight() == 28);

  REQUIRE(sketch.get_estimate("абвгд") == 1);
  REQUIRE(sketch.get_estimate("еёжзи") == 2);
  REQUIRE(sketch.get_estimate("йклмн") == 3);
  REQUIRE(sketch.get_estimate("опрст") == 4);
  REQUIRE(sketch.get_estimate("уфхцч") == 5);
  REQUIRE(sketch.get_estimate("шщъыь") == 6);
  REQUIRE(sketch.get_estimate("эюя") == 7);
}

} /* namespace datasketches */
42 changes: 0 additions & 42 deletions fi/test/frequent_items_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,48 +199,6 @@ TEST_CASE("frequent items: merge estimation mode", "[frequent_items_sketch]") {
REQUIRE(9 <= items[1].get_estimate()); // always overestimated
}

// Deserializes "longs_sketch_from_java.sk" as a sketch of long long and checks
// that it is non-empty with total weight 4 and four active items, each estimated at 1.
TEST_CASE("frequent items: deserialize from java long", "[frequent_items_sketch]") {
  const std::string path = testBinaryInputPath + "longs_sketch_from_java.sk";
  std::ifstream input;
  // enable exceptions first so a failed open throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  auto sketch = frequent_items_sketch<long long>::deserialize(input);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_total_weight() == 4);
  REQUIRE(sketch.get_num_active_items() == 4);

  REQUIRE(sketch.get_estimate(1) == 1);
  REQUIRE(sketch.get_estimate(2) == 1);
  REQUIRE(sketch.get_estimate(3) == 1);
  REQUIRE(sketch.get_estimate(4) == 1);
}

// Deserializes "items_sketch_string_from_java.sk" as a sketch of std::string and checks
// that it is non-empty with total weight 4 and four active items, each estimated at 1.
TEST_CASE("frequent items: deserialize from java string", "[frequent_items_sketch]") {
  const std::string path = testBinaryInputPath + "items_sketch_string_from_java.sk";
  std::ifstream input;
  // enable exceptions first so a failed open throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  auto sketch = frequent_items_sketch<std::string>::deserialize(input);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_total_weight() == 4);
  REQUIRE(sketch.get_num_active_items() == 4);

  REQUIRE(sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 1);
  REQUIRE(sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") == 1);
  REQUIRE(sketch.get_estimate("ccccccccccccccccccccccccccccc") == 1);
  REQUIRE(sketch.get_estimate("ddddddddddddddddddddddddddddd") == 1);
}

// Deserializes "items_sketch_string_utf8_from_java.sk" as a sketch of std::string and checks
// that it is non-empty with total weight 10 and four active items estimated at 1..4.
TEST_CASE("frequent items: deserialize from java string, utf-8", "[frequent_items_sketch]") {
  const std::string path = testBinaryInputPath + "items_sketch_string_utf8_from_java.sk";
  std::ifstream input;
  // enable exceptions first so a failed open throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  auto sketch = frequent_items_sketch<std::string>::deserialize(input);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_total_weight() == 10);
  REQUIRE(sketch.get_num_active_items() == 4);

  REQUIRE(sketch.get_estimate("абвгд") == 1);
  REQUIRE(sketch.get_estimate("еёжзи") == 2);
  REQUIRE(sketch.get_estimate("йклмн") == 3);
  REQUIRE(sketch.get_estimate("опрст") == 4);
}

TEST_CASE("frequent items: deserialize long64 stream", "[frequent_items_sketch]") {
frequent_items_sketch<long long> sketch1(3);
sketch1.update(1, 1);
Expand Down
Binary file removed fi/test/items_sketch_string_from_java.sk
Binary file not shown.
Binary file removed fi/test/items_sketch_string_utf8_from_java.sk
Binary file not shown.
Binary file removed fi/test/longs_sketch_from_java.sk
Binary file not shown.
16 changes: 8 additions & 8 deletions kll/test/kll_sketch_deserialize_from_java_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "

TEST_CASE("kll float", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (unsigned n: n_arr) {
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "kll_float_n" + std::to_string(n) + ".sk", std::ios::binary);
Expand All @@ -38,10 +38,10 @@ TEST_CASE("kll float", "[serde_compat]") {
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
REQUIRE(sketch.get_n() == n);
if (n > 0) {
REQUIRE(sketch.get_min_item() == 0.0);
REQUIRE(sketch.get_max_item() == static_cast<double>(n - 1));
REQUIRE(sketch.get_min_item() == 1.0f);
REQUIRE(sketch.get_max_item() == static_cast<float>(n));
uint64_t weight = 0;
for (auto pair: sketch) {
for (const auto pair: sketch) {
REQUIRE(pair.first >= sketch.get_min_item());
REQUIRE(pair.first <= sketch.get_max_item());
weight += pair.second;
Expand All @@ -53,7 +53,7 @@ TEST_CASE("kll float", "[serde_compat]") {

TEST_CASE("kll double", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (unsigned n: n_arr) {
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "kll_double_n" + std::to_string(n) + ".sk", std::ios::binary);
Expand All @@ -62,10 +62,10 @@ TEST_CASE("kll double", "[serde_compat]") {
REQUIRE(sketch.is_estimation_mode() == (n > kll_constants::DEFAULT_K));
REQUIRE(sketch.get_n() == n);
if (n > 0) {
REQUIRE(sketch.get_min_item() == 0.0);
REQUIRE(sketch.get_max_item() == static_cast<double>(n - 1));
REQUIRE(sketch.get_min_item() == 1.0);
REQUIRE(sketch.get_max_item() == static_cast<double>(n));
uint64_t weight = 0;
for (auto pair: sketch) {
for (const auto pair: sketch) {
REQUIRE(pair.first >= sketch.get_min_item());
REQUIRE(pair.first <= sketch.get_max_item());
weight += pair.second;
Expand Down
Binary file removed kll/test/kll_sketch_from_java.sk
Binary file not shown.
13 changes: 0 additions & 13 deletions kll/test/kll_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,19 +264,6 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
}
}

// Deserializes "kll_sketch_from_java.sk" as a float KLL sketch and checks its
// mode, stream length, number of retained items and min/max items.
SECTION("deserialize from java") {
  const std::string path = testBinaryInputPath + "kll_sketch_from_java.sk";
  std::ifstream input;
  // enable exceptions first so a failed open throws
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(path, std::ios::binary);

  auto sketch = kll_float_sketch::deserialize(input, serde<float>(), std::less<float>(), 0);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.is_estimation_mode());
  REQUIRE(sketch.get_n() == 1000000);
  REQUIRE(sketch.get_num_retained() == 614);
  REQUIRE(sketch.get_min_item() == 0.0);
  REQUIRE(sketch.get_max_item() == 999999.0);
}

SECTION("stream serialize deserialize empty") {
kll_float_sketch sketch(200, std::less<float>(), 0);
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
Expand Down
28 changes: 17 additions & 11 deletions quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "

TEST_CASE("quantiles double", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (unsigned n: n_arr) {
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "quantiles_double_n" + std::to_string(n) + ".sk", std::ios::binary);
Expand All @@ -38,10 +38,10 @@ TEST_CASE("quantiles double", "[serde_compat]") {
REQUIRE(sketch.is_estimation_mode() == (n > quantiles_constants::DEFAULT_K));
REQUIRE(sketch.get_n() == n);
if (n > 0) {
REQUIRE(sketch.get_min_item() == 0.0);
REQUIRE(sketch.get_max_item() == static_cast<double>(n - 1));
REQUIRE(sketch.get_min_item() == 1.0);
REQUIRE(sketch.get_max_item() == static_cast<double>(n));
uint64_t weight = 0;
for (auto pair: sketch) {
for (const auto pair: sketch) {
REQUIRE(pair.first >= sketch.get_min_item());
REQUIRE(pair.first <= sketch.get_max_item());
weight += pair.second;
Expand All @@ -51,23 +51,29 @@ TEST_CASE("quantiles double", "[serde_compat]") {
}
}

// Strict-weak-ordering comparator that orders strings by the integer value
// they parse to (via std::stoi) rather than lexicographically,
// so that e.g. "9" sorts before "10".
struct string_as_number_less {
  bool operator()(const std::string& a, const std::string& b) const {
    const int lhs_value = std::stoi(a);
    const int rhs_value = std::stoi(b);
    return lhs_value < rhs_value;
  }
};

TEST_CASE("quantiles string", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (unsigned n: n_arr) {
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "quantiles_string_n" + std::to_string(n) + ".sk", std::ios::binary);
auto sketch = quantiles_sketch<std::string>::deserialize(is);
auto sketch = quantiles_sketch<std::string, string_as_number_less>::deserialize(is);
REQUIRE(sketch.is_empty() == (n == 0));
REQUIRE(sketch.is_estimation_mode() == (n > quantiles_constants::DEFAULT_K));
REQUIRE(sketch.get_n() == n);
if (n > 0) {
REQUIRE(sketch.get_min_item() == "0");
REQUIRE(sketch.get_max_item() == std::to_string(n - 1));
REQUIRE(sketch.get_min_item() == "1");
REQUIRE(sketch.get_max_item() == std::to_string(n));
uint64_t weight = 0;
for (auto pair: sketch) {
REQUIRE(pair.first >= sketch.get_min_item());
REQUIRE(pair.first <= sketch.get_max_item());
for (const auto pair: sketch) {
REQUIRE(std::stoi(pair.first) >= std::stoi(sketch.get_min_item()));
REQUIRE(std::stoi(pair.first) <= std::stoi(sketch.get_max_item()));
weight += pair.second;
}
REQUIRE(weight == sketch.get_n());
Expand Down
8 changes: 4 additions & 4 deletions req/test/req_sketch_deserialize_from_java_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "

TEST_CASE("req float", "[serde_compat]") {
unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
for (unsigned n: n_arr) {
for (const unsigned n: n_arr) {
std::ifstream is;
is.exceptions(std::ios::failbit | std::ios::badbit);
is.open(testBinaryInputPath + "req_float_n" + std::to_string(n) + ".sk", std::ios::binary);
Expand All @@ -39,10 +39,10 @@ TEST_CASE("req float", "[serde_compat]") {
REQUIRE(sketch.is_estimation_mode() == (n > 10));
REQUIRE(sketch.get_n() == n);
if (n > 0) {
REQUIRE(sketch.get_min_item() == 0.0);
REQUIRE(sketch.get_max_item() == static_cast<float>(n - 1));
REQUIRE(sketch.get_min_item() == 1.0f);
REQUIRE(sketch.get_max_item() == static_cast<float>(n));
uint64_t weight = 0;
for (auto pair: sketch) {
for (const auto pair: sketch) {
REQUIRE(pair.first >= sketch.get_min_item());
REQUIRE(pair.first <= sketch.get_max_item());
weight += pair.second;
Expand Down
7 changes: 7 additions & 0 deletions sampling/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,10 @@ target_sources(sampling_test
var_opt_union_test.cpp
var_opt_allocation_test.cpp
)

# Add the Java deserialization test to sampling_test only when SERDE_COMPAT evaluates to true.
if (SERDE_COMPAT)
  target_sources(sampling_test PRIVATE var_opt_sketch_deserialize_from_java_test.cpp)
endif()

0 comments on commit 2c2b89e

Please sign in to comment.