Skip to content

Commit

Permalink
Merge pull request #392 from apache/generate_sketches_for_java
Browse files Browse the repository at this point in the history
Generate sketches for java
  • Loading branch information
AlexanderSaydakov committed Aug 31, 2023
2 parents 52588ac + 6d5381e commit 060d5f6
Show file tree
Hide file tree
Showing 23 changed files with 724 additions and 25 deletions.
7 changes: 7 additions & 0 deletions cpc/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,10 @@ target_sources(cpc_test
cpc_sketch_deserialize_from_java_test.cpp
)
endif()

# When GENERATE is set, also compile the test source that writes serialized
# CPC sketch files (cpc_sketch_serialize_for_java.cpp) into the cpc_test target.
if (GENERATE)
target_sources(cpc_test
PRIVATE
cpc_sketch_serialize_for_java.cpp
)
endif()
18 changes: 18 additions & 0 deletions cpc/test/cpc_sketch_deserialize_from_java_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,22 @@ TEST_CASE("cpc sketch", "[serde_compat]") {
}
}

// Reads the serialized sketch "cpc_negative_one_java.sk" (presumably written by
// the Java implementation -- confirm against the Java generator) and checks
// that updating it with -1 expressed in every fixed-width integer type leaves
// the estimate at 1.
TEST_CASE("cpc sketch negative one", "[serde_compat]") {
  // Turn on stream exceptions before opening so a missing or unreadable file
  // throws instead of yielding a stream in a failed state.
  std::ifstream input;
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(testBinaryInputPath + "cpc_negative_one_java.sk", std::ios::binary);

  auto sketch = cpc_sketch::deserialize(input);
  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_estimate() == Approx(1).margin(0.01));

  // Feed -1 through each unsigned/signed width, widest first.
  sketch.update(static_cast<uint64_t>(-1));
  sketch.update(static_cast<int64_t>(-1));
  sketch.update(static_cast<uint32_t>(-1));
  sketch.update(static_cast<int32_t>(-1));
  sketch.update(static_cast<uint16_t>(-1));
  sketch.update(static_cast<int16_t>(-1));
  sketch.update(static_cast<uint8_t>(-1));
  sketch.update(static_cast<int8_t>(-1));

  // The estimate must still be 1 after all eight updates.
  REQUIRE(sketch.get_estimate() == Approx(1).margin(0.01));
}

} /* namespace datasketches */
38 changes: 38 additions & 0 deletions cpc/test/cpc_sketch_serialize_for_java.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <fstream>
#include <cpc_sketch.hpp>

namespace datasketches {

// Writes one serialized CPC sketch per stream length n to "cpc_n<n>_cpp.sk"
// (relative path, so the file lands in the current working directory).
// Each sketch is fed the integers 1..n and sanity-checked before it is written.
TEST_CASE("cpc sketch generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 100, 200, 2000, 20000};
  for (const unsigned n: n_arr) {
    cpc_sketch sketch;
    for (unsigned i = 1; i <= n; ++i) sketch.update(i);
    REQUIRE(sketch.is_empty() == (n == 0));
    REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.02));
    // Enable exceptions before opening: without this a failed open or write is
    // silent and the test would pass while producing no (or a truncated) file.
    std::ofstream os;
    os.exceptions(std::ios::failbit | std::ios::badbit);
    os.open("cpc_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
    sketch.serialize(os);
  }
}

} /* namespace datasketches */
25 changes: 0 additions & 25 deletions cpc/test/cpc_sketch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,6 @@ TEST_CASE("cpc sketch: serialize deserialize empty", "[cpc_sketch]") {
REQUIRE(deserialized.is_empty() == sketch.is_empty());
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::ofstream os("cpc-empty.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serialize deserialize sparse", "[cpc_sketch]") {
Expand All @@ -108,9 +105,6 @@ TEST_CASE("cpc sketch: serialize deserialize sparse", "[cpc_sketch]") {
for (int i = 0; i < n; i++) deserialized.update(i);
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::ofstream os("cpc-sparse.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serialize deserialize hybrid", "[cpc_sketch]") {
Expand All @@ -128,9 +122,6 @@ TEST_CASE("cpc sketch: serialize deserialize hybrid", "[cpc_sketch]") {
for (int i = 0; i < n; i++) deserialized.update(i);
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::ofstream os("cpc-hybrid.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serialize deserialize pinned", "[cpc_sketch]") {
Expand All @@ -148,9 +139,6 @@ TEST_CASE("cpc sketch: serialize deserialize pinned", "[cpc_sketch]") {
for (int i = 0; i < n; i++) deserialized.update(i);
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::ofstream os("cpc-pinned.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serialize deserialize sliding", "[cpc_sketch]") {
Expand All @@ -168,9 +156,6 @@ TEST_CASE("cpc sketch: serialize deserialize sliding", "[cpc_sketch]") {
for (int i = 0; i < n; i++) deserialized.update(i);
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::ofstream os("cpc-sliding.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serializing deserialize sliding large", "[cpc_sketch]") {
Expand All @@ -188,9 +173,6 @@ TEST_CASE("cpc sketch: serializing deserialize sliding large", "[cpc_sketch]") {
for (int i = 0; i < n; i++) deserialized.update(i);
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::ofstream os("cpc-sliding-large.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serialize deserialize empty, bytes", "[cpc_sketch]") {
Expand All @@ -201,9 +183,6 @@ TEST_CASE("cpc sketch: serialize deserialize empty, bytes", "[cpc_sketch]") {
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());
REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);

std::ofstream os("cpc-empty.bin");
sketch.serialize(os);
}

TEST_CASE("cpc sketch: serialize deserialize sparse, bytes", "[cpc_sketch]") {
Expand Down Expand Up @@ -261,8 +240,6 @@ TEST_CASE("cpc sketch: serialize deserialize pinned, bytes", "[cpc_sketch]") {
for (int i = 0; i < n; i++) deserialized.update(i);
REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
REQUIRE(deserialized.validate());

std::cout << sketch.to_string();
}

TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
Expand Down Expand Up @@ -380,8 +357,6 @@ TEST_CASE("cpc sketch: update int equivalence", "[cpc_sketch]") {
sketch.update((uint8_t) -1);
sketch.update((int8_t) -1);
REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
std::ofstream os("cpc-negative-one.bin"); // to compare with Java
sketch.serialize(os);
}

TEST_CASE("cpc sketch: update float equivalence", "[cpc_sketch]") {
Expand Down
7 changes: 7 additions & 0 deletions fi/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,10 @@ target_sources(fi_test
frequent_items_sketch_deserialize_from_java_test.cpp
)
endif()

# When GENERATE is set, also compile the test source that writes serialized
# frequent-items sketch files (frequent_items_sketch_serialize_for_java.cpp)
# into the fi_test target.
if (GENERATE)
target_sources(fi_test
PRIVATE
frequent_items_sketch_serialize_for_java.cpp
)
endif()
83 changes: 83 additions & 0 deletions fi/test/frequent_items_sketch_serialize_for_java.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <fstream>
#include <frequent_items_sketch.hpp>

namespace datasketches {

// Writes one serialized frequent-items sketch of integer items per stream
// length n to "frequent_long_n<n>_cpp.sk" (relative to the working directory).
// Each sketch is fed the distinct items 1..n with the default weight.
TEST_CASE("frequent longs sketch generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    // `long` is only 32 bits wide on LLP64 platforms (e.g. Windows), which would
    // not match a 64-bit Java long; `long long` is at least 64 bits everywhere.
    frequent_items_sketch<long long> sketch(6);
    for (unsigned i = 1; i <= n; ++i) sketch.update(i);
    REQUIRE(sketch.is_empty() == (n == 0));
    // Up to 10 distinct items the sketch is expected to be exact; beyond that
    // it must report a non-zero maximum error.
    if (n > 10) {
      REQUIRE(sketch.get_maximum_error() > 0);
    } else {
      REQUIRE(sketch.get_maximum_error() == 0);
    }
    REQUIRE(sketch.get_total_weight() == n);
    // Enable exceptions before opening so an open/write failure fails the test
    // instead of silently producing no (or a truncated) file.
    std::ofstream os;
    os.exceptions(std::ios::failbit | std::ios::badbit);
    os.open("frequent_long_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
    sketch.serialize(os);
  }
}

// Writes one serialized frequent-items sketch of string items per stream
// length n to "frequent_string_n<n>_cpp.sk" (relative to the working
// directory). Items are the decimal strings "1".."n" with the default weight.
TEST_CASE("frequent strings sketch generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    frequent_items_sketch<std::string> sketch(6);
    for (unsigned i = 1; i <= n; ++i) sketch.update(std::to_string(i));
    REQUIRE(sketch.is_empty() == (n == 0));
    // Up to 10 distinct items the sketch is expected to be exact; beyond that
    // it must report a non-zero maximum error.
    if (n > 10) {
      REQUIRE(sketch.get_maximum_error() > 0);
    } else {
      REQUIRE(sketch.get_maximum_error() == 0);
    }
    REQUIRE(sketch.get_total_weight() == n);
    // Enable exceptions before opening so an open/write failure fails the test
    // instead of silently producing no (or a truncated) file.
    std::ofstream os;
    os.exceptions(std::ios::failbit | std::ios::badbit);
    os.open("frequent_string_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
    sketch.serialize(os);
  }
}

// Writes a frequent-items sketch holding four 29-character ASCII strings with
// weights 1..4 to "frequent_string_ascii_cpp.sk" (relative to the working
// directory).
TEST_CASE("frequent strings sketch ascii", "[serialize_for_java]") {
  frequent_items_sketch<std::string> sketch(6);
  sketch.update("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1);
  sketch.update("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2);
  sketch.update("ccccccccccccccccccccccccccccc", 3);
  sketch.update("ddddddddddddddddddddddddddddd", 4);
  // Enable exceptions before opening so an open/write failure fails the test
  // instead of silently producing no (or a truncated) file.
  std::ofstream os;
  os.exceptions(std::ios::failbit | std::ios::badbit);
  os.open("frequent_string_ascii_cpp.sk", std::ios::binary);
  sketch.serialize(os);
}

// Writes a frequent-items sketch holding seven Cyrillic strings with weights
// 1..7 to "frequent_string_utf8_cpp.sk" (relative to the working directory).
// NOTE(review): the literals are plain narrow strings, so the bytes written are
// UTF-8 only if this file is saved as UTF-8 and the compiler's execution
// character set is UTF-8 -- confirm for every toolchain used to generate files.
TEST_CASE("frequent strings sketch utf8", "[serialize_for_java]") {
  frequent_items_sketch<std::string> sketch(6);
  sketch.update("абвгд", 1);
  sketch.update("еёжзи", 2);
  sketch.update("йклмн", 3);
  sketch.update("опрст", 4);
  sketch.update("уфхцч", 5);
  sketch.update("шщъыь", 6);
  sketch.update("эюя", 7);
  // Enable exceptions before opening so an open/write failure fails the test
  // instead of silently producing no (or a truncated) file.
  std::ofstream os;
  os.exceptions(std::ios::failbit | std::ios::badbit);
  os.open("frequent_string_utf8_cpp.sk", std::ios::binary);
  sketch.serialize(os);
}

} /* namespace datasketches */
7 changes: 7 additions & 0 deletions hll/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,10 @@ target_sources(hll_test
hll_sketch_deserialize_from_java_test.cpp
)
endif()

# When GENERATE is set, also compile the test source that writes serialized
# HLL sketch files (hll_sketch_serialize_for_java.cpp) into the hll_test target.
if (GENERATE)
target_sources(hll_test
PRIVATE
hll_sketch_serialize_for_java.cpp
)
endif()
52 changes: 52 additions & 0 deletions hll/test/hll_sketch_serialize_for_java.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <fstream>
#include <hll.hpp>

namespace datasketches {

// For every stream length n, builds three HLL sketches (HLL_4, HLL_6, HLL_8,
// each constructed with first argument 12), feeds all of them the integers
// 0..n-1 and writes their compact serialization to "hll4_n<n>_cpp.sk",
// "hll6_n<n>_cpp.sk" and "hll8_n<n>_cpp.sk" (relative to the working directory).
TEST_CASE("hll sketch generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    hll_sketch hll4(12, HLL_4);
    hll_sketch hll6(12, HLL_6);
    hll_sketch hll8(12, HLL_8);
    for (unsigned i = 0; i < n; ++i) {
      hll4.update(i);
      hll6.update(i);
      hll8.update(i);
    }
    // Shared writer for the three variants. Stream exceptions are enabled
    // before opening so an open/write failure fails the test instead of
    // silently producing no (or a truncated) file. The stream is closed when
    // the lambda returns.
    const auto write_compact = [n](const hll_sketch& sketch, const char* prefix) {
      std::ofstream os;
      os.exceptions(std::ios::failbit | std::ios::badbit);
      os.open(std::string(prefix) + "_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
      sketch.serialize_compact(os);
    };
    write_compact(hll4, "hll4");
    write_compact(hll6, "hll6");
    write_compact(hll8, "hll8");
  }
}

} /* namespace datasketches */
7 changes: 7 additions & 0 deletions kll/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,10 @@ target_sources(kll_test
kll_sketch_deserialize_from_java_test.cpp
)
endif()

# When GENERATE is set, also compile the test source that writes serialized
# KLL sketch files (kll_sketch_serialize_for_java.cpp) into the kll_test target.
if (GENERATE)
target_sources(kll_test
PRIVATE
kll_sketch_serialize_for_java.cpp
)
endif()
62 changes: 62 additions & 0 deletions kll/test/kll_sketch_serialize_for_java.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <fstream>
#include <kll_sketch.hpp>

namespace datasketches {

// Writes one serialized kll_sketch<float> per stream length n to
// "kll_float_n<n>_cpp.sk" (relative to the working directory). Each sketch is
// fed the values 1..n.
TEST_CASE("kll sketch float generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    kll_sketch<float> sketch;
    for (unsigned i = 1; i <= n; ++i) sketch.update(i);
    // Enable exceptions before opening so an open/write failure fails the test
    // instead of silently producing no (or a truncated) file.
    std::ofstream os;
    os.exceptions(std::ios::failbit | std::ios::badbit);
    os.open("kll_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
    sketch.serialize(os);
  }
}

// Writes one serialized kll_sketch<double> per stream length n to
// "kll_double_n<n>_cpp.sk" (relative to the working directory). Each sketch is
// fed the values 1..n.
TEST_CASE("kll sketch double generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    kll_sketch<double> sketch;
    for (unsigned i = 1; i <= n; ++i) sketch.update(i);
    // Enable exceptions before opening so an open/write failure fails the test
    // instead of silently producing no (or a truncated) file.
    std::ofstream os;
    os.exceptions(std::ios::failbit | std::ios::badbit);
    os.open("kll_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
    sketch.serialize(os);
  }
}

// Strict "less than" ordering for strings that hold decimal integers: both
// operands are parsed with std::stoi and the resulting ints are compared, so
// "2" orders before "10". std::stoi throws if an operand cannot be parsed or
// does not fit in an int.
struct compare_as_number {
  bool operator()(const std::string& a, const std::string& b) const {
    const int lhs = std::stoi(a);
    const int rhs = std::stoi(b);
    return lhs < rhs;
  }
};

// Writes one serialized kll_sketch of strings per stream length n to
// "kll_string_n<n>_cpp.sk" (relative to the working directory). Items are the
// decimal strings "1".."n", ordered by compare_as_number rather than by the
// sketch's default comparator.
TEST_CASE("kll sketch string generate", "[serialize_for_java]") {
  const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: n_arr) {
    kll_sketch<std::string, compare_as_number> sketch;
    for (unsigned i = 1; i <= n; ++i) sketch.update(std::to_string(i));
    // Enable exceptions before opening so an open/write failure fails the test
    // instead of silently producing no (or a truncated) file.
    std::ofstream os;
    os.exceptions(std::ios::failbit | std::ios::badbit);
    os.open("kll_string_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
    sketch.serialize(os);
  }
}

} /* namespace datasketches */

0 comments on commit 060d5f6

Please sign in to comment.