Skip to content

Commit

Permalink
Merge 55722ca into c67d92f
Browse files Browse the repository at this point in the history
  • Loading branch information
jmalkin committed Aug 11, 2020
2 parents c67d92f + 55722ca commit 48f7739
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 131 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_cmake.yml
Expand Up @@ -44,7 +44,7 @@ jobs:
with:
submodules: true
- name: Configure
run: mkdir build && cd build && cmake ..
run: cd build && cmake ..
- name: Build C++ unit tests
run: cmake --build build --config Release
- name: Run C++ tests
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/code_coverage.yml
Expand Up @@ -24,7 +24,7 @@ jobs:
cd "lcov-$VERSION"
sudo make install
- name: Configure
run: mkdir build && cd build && cmake .. -DCOVERAGE=ON
run: cd build && cmake .. -DCOVERAGE=ON
- name: Build unit tests
run: cmake --build build
- name: Run tests
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Expand Up @@ -16,7 +16,6 @@
*.dll
*.dylib
bin/
build/
lib/
Default/

Expand Down
7 changes: 7 additions & 0 deletions build/.gitignore
@@ -0,0 +1,7 @@
# build/ directory for convenience, but should remain empty

# Ignore everything in here
*

# Add an exception for this file
!.gitignore
41 changes: 27 additions & 14 deletions python/src/cpc_wrapper.cpp
Expand Up @@ -53,25 +53,38 @@ void init_cpc(py::module &m) {
using namespace datasketches;

py::class_<cpc_sketch>(m, "cpc_sketch")
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=CPC_DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
.def(py::init<const cpc_sketch&>())
.def("__str__", &cpc_sketch::to_string)
.def("to_string", &cpc_sketch::to_string)
.def("serialize", &dspy::cpc_sketch_serialize)
.def_static("deserialize", &dspy::cpc_sketch_deserialize)
.def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"))
.def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"))
.def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"))
.def("is_empty", &cpc_sketch::is_empty)
.def("get_estimate", &cpc_sketch::get_estimate)
.def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"))
.def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"))
.def("__str__", &cpc_sketch::to_string,
"Produces a string summary of the sketch")
.def("to_string", &cpc_sketch::to_string,
"Produces a string summary of the sketch")
.def("serialize", &dspy::cpc_sketch_serialize,
"Serializes the sketch into a bytes object")
.def_static("deserialize", &dspy::cpc_sketch_deserialize,
"Reads a bytes object and returns the corresponding cpc_sketch")
.def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"),
"Updates the sketch with the given 64-bit integer value")
.def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"),
"Updates the sketch with the given 64-bit floating point")
.def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"),
"Updates the sketch with the given string")
.def("is_empty", &cpc_sketch::is_empty,
"Returns True if the sketch is empty, otherwise False")
.def("get_estimate", &cpc_sketch::get_estimate,
"Estimate of the distinct count of the input stream")
.def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"),
"Returns an approximate lower bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
.def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"),
"Returns an approximate upper bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
;

py::class_<cpc_union>(m, "cpc_union")
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
.def(py::init<const cpc_union&>())
.def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"))
.def("get_result", &dspy::cpc_union_get_result)
.def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"),
"Updates the union with the provided CPC sketch")
.def("get_result", &dspy::cpc_union_get_result,
"Returns a CPC sketch with the result of the union")
;
}
48 changes: 32 additions & 16 deletions python/src/fi_wrapper.cpp
Expand Up @@ -75,23 +75,39 @@ void bind_fi_sketch(py::module &m, const char* name) {

py::class_<frequent_items_sketch<T>>(m, name)
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
.def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false)
.def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false)
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1)
.def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
"Produces a string summary of the sketch")
.def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
"Produces a string summary of the sketch")
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
"Updates the sketch with the given string and, optionally, a weight")
.def("get_frequent_items", &dspy::fi_sketch_get_frequent_items<T>, py::arg("err_type"), py::arg("threshold")=0)
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge)
.def("is_empty", &frequent_items_sketch<T>::is_empty)
.def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items)
.def("get_total_weight", &frequent_items_sketch<T>::get_total_weight)
.def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"))
.def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"))
.def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"))
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon)
.def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"))
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"))
.def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes)
.def("serialize", &dspy::fi_sketch_serialize<T>)
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>)
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
"Merges the given sketch into this one")
.def("is_empty", &frequent_items_sketch<T>::is_empty,
"Returns True if the sketch is empty, otherwise False")
.def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
"Returns the number of active items in the sketch")
.def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
"Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
.def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
"Returns the estimate of the weight (frequency) of the given item.\n"
"Note: The true frequency of an item would be the sum of the counts as a result of the "
"two update functions.")
.def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
"Returns the guaranteed lower bound weight (frequency) of the given item.")
.def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
"Returns the guaranteed upper bound weight (frequency) of the given item.")
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
"Returns the epsilon value used by the sketch to compute error")
.def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"),
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
.def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes,
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
;
}

Expand Down
101 changes: 64 additions & 37 deletions python/src/hll_wrapper.cpp
Expand Up @@ -59,51 +59,78 @@ void init_hll(py::module &m) {
.def(py::init<int>(), py::arg("lg_k"))
.def(py::init<int, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
.def(py::init<int, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
.def_static("deserialize", &dspy::hll_sketch_deserialize)
.def("serialize_compact", &dspy::hll_sketch_serialize_compact)
.def("serialize_updatable", &dspy::hll_sketch_serialize_updatable)
.def_static("deserialize", &dspy::hll_sketch_deserialize,
"Reads a bytes object and returns the corresponding hll_sketch")
.def("serialize_compact", &dspy::hll_sketch_serialize_compact,
"Serializes the sketch into a bytes object, compressing the exception table if HLL_4")
.def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
"Serializes the sketch into a bytes object")
.def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false)
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
"Produces a string summary of the sketch")
.def("to_string", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false)
.def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k)
.def_property_readonly("tgt_type", &hll_sketch::get_target_type)
.def("get_estimate", &hll_sketch::get_estimate)
.def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"))
.def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"))
.def("is_compact", &hll_sketch::is_compact)
.def("is_empty", &hll_sketch::is_empty)
.def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes)
.def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes)
.def("reset", &hll_sketch::reset)
.def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"))
.def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"))
.def("update", (void (hll_sketch::*)(const std::string&)) &hll_sketch::update, py::arg("datum"))
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
"Produces a string summary of the sketch")
.def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k, "Configured lg_k value for the sketch")
.def_property_readonly("tgt_type", &hll_sketch::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
.def("get_estimate", &hll_sketch::get_estimate,
"Estimate of the distinct count of the input stream")
.def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"),
"Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
.def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"),
"Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
.def("is_compact", &hll_sketch::is_compact,
"True if the sketch is compact, otherwise False")
.def("is_empty", &hll_sketch::is_empty,
"True if the sketch is empty, otherwise False")
.def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes,
"Returns the size of the serialized sketch")
.def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
"Returns the size of the serialized sketch when compressing the exception table if HLL_4")
.def("reset", &hll_sketch::reset,
"Resets the sketch to the empty state in coupon collection mode")
.def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
"Updates the sketch with the given integral value")
.def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
"Updates the sketch with the given floating point value")
.def("update", (void (hll_sketch::*)(const std::string&)) &hll_sketch::update, py::arg("datum"),
"Updates the sketch with the given string value")
.def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
py::arg("lg_k"), py::arg("tgt_type"))
py::arg("lg_k"), py::arg("tgt_type"),
"Provides a likely upper bound on serialization size for the given parameters")
.def_static("get_rel_err", &hll_sketch::get_rel_err,
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"))
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
"Returns the a priori relative error bound for the given parameters")
;

py::class_<hll_union>(m, "hll_union")
.def(py::init<int>(), py::arg("lg_max_k"))
.def_property_readonly("lg_config_k", &hll_union::get_lg_config_k)
.def_property_readonly("tgt_type", &hll_union::get_target_type)
.def("get_estimate", &hll_union::get_estimate)
.def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"))
.def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"))
.def("is_compact", &hll_union::is_compact)
.def("is_empty", &hll_union::is_empty)
.def("get_updatable_serialization_bytes", &hll_union::get_updatable_serialization_bytes)
.def("get_compact_serialization_bytes", &hll_union::get_compact_serialization_bytes)
.def("reset", &hll_union::reset)
.def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4)
.def<void (hll_union::*)(const hll_sketch&)>("update", &hll_union::update, py::arg("sketch"))
.def<void (hll_union::*)(int64_t)>("update", &hll_union::update, py::arg("datum"))
.def<void (hll_union::*)(double)>("update", &hll_union::update, py::arg("datum"))
.def<void (hll_union::*)(const std::string&)>("update", &hll_union::update, py::arg("datum"))
.def_static("get_max_serialization_bytes", &hll_union::get_max_serialization_bytes, py::arg("lg_k"))
.def_property_readonly("lg_config_k", &hll_union::get_lg_config_k, "Configured lg_k value for the union")
.def_property_readonly("tgt_type", &hll_union::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
.def("get_estimate", &hll_union::get_estimate,
"Estimate of the distinct count of the input stream")
.def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"),
"Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
.def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"),
"Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
.def("is_compact", &hll_union::is_compact,
"True if the union is compact, otherwise False")
.def("is_empty", &hll_union::is_empty,
"True if the union is empty, otherwise False")
.def("reset", &hll_union::reset,
"Resets the union to the empty state")
.def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4,
"Returns a sketch of the target type representing the current union state")
.def<void (hll_union::*)(const hll_sketch&)>("update", &hll_union::update, py::arg("sketch"),
"Updates the union with the given HLL sketch")
.def<void (hll_union::*)(int64_t)>("update", &hll_union::update, py::arg("datum"),
"Updates the union with the given integral value")
.def<void (hll_union::*)(double)>("update", &hll_union::update, py::arg("datum"),
"Updates the union with the given floating point value")
.def<void (hll_union::*)(const std::string&)>("update", &hll_union::update, py::arg("datum"),
"Updates the union with the given string value")
.def_static("get_rel_err", &hll_union::get_rel_err,
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"))
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
"Returns the a priori relative error bound for the given parameters")
;
}

0 comments on commit 48f7739

Please sign in to comment.