diff --git a/common/include/memory_operations.hpp b/common/include/memory_operations.hpp index 986b2b06..ae08232a 100644 --- a/common/include/memory_operations.hpp +++ b/common/include/memory_operations.hpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace datasketches { diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 26028bbd..bc850928 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -50,7 +50,13 @@ target_link_libraries(python set_target_properties(python PROPERTIES PREFIX "" - OUTPUT_NAME datasketches + OUTPUT_NAME _datasketches +) + +target_include_directories(python + PUBLIC + $/include> + $ ) # ensure we make a .so on Mac rather than .dylib @@ -71,4 +77,5 @@ target_sources(python src/quantiles_wrapper.cpp src/ks_wrapper.cpp src/vector_of_kll.cpp + src/py_serde.cpp ) diff --git a/python/datasketches/PySerDe.py b/python/datasketches/PySerDe.py new file mode 100644 index 00000000..384f3de3 --- /dev/null +++ b/python/datasketches/PySerDe.py @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from _datasketches import PyObjectSerDe + +import struct + +# This file provides several Python SerDe implementation examples. +# +# Each implementation must extend the PyObjectSerDe class and define +# three methods: +# * get_size(item) returns an int of the number of bytes needed to +# serialize the given item +# * to_bytes(item) returns a bytes object representing a serialized +# version of the given item +# * from_bytes(data, offset) takes a bytes object (data) and an offset +# indicating where in the data array to start reading. The method +# returns a tuple with the newly reconstructed object and the +# total number of bytes beyond the offset read from the input data. + +# Implements a simple string-encoding scheme where a string is +# written as , with no null termination. +# This format allows pre-allocating each string, at the cost of +# additional storage. Using this format, the serialized string consumes +# 4 + len(item) bytes. +class PyStringsSerDe(PyObjectSerDe): + def get_size(self, item): + return int(4 + len(item)) + + def to_bytes(self, item: str): + b = bytearray() + b.extend(len(item).to_bytes(4, 'little')) + b.extend(map(ord,item)) + return bytes(b) + + def from_bytes(self, data: bytes, offset: int): + num_chars = int.from_bytes(data[offset:offset+3], 'little') + if (num_chars < 0 or num_chars > offset + len(data)): + raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}') + str = data[offset+4:offset+4+num_chars].decode() + return (str, 4+num_chars) + +# Implements an integer-encoding scheme where each integer is written +# as a 32-bit (4 byte) little-endian value. +class PyIntsSerDe(PyObjectSerDe): + def get_size(self, item): + return int(4) + + def to_bytes(self, item): + return struct.pack('i', item) + + def from_bytes(self, data: bytes, offset: int): + val = struct.unpack_from('i', data, offset)[0] + return (val, 4) + + +class PyLongsSerDe(PyObjectSerDe): + def get_size(self, item): + return int(8) + + def to_bytes(self, item): + return struct.pack('l', item) + + def from_bytes(self, data: bytes, offset: int): + val = struct.unpack_from('l', data, offset)[0] + return (val, 8) + + +class PyFloatsSerDe(PyObjectSerDe): + def get_size(self, item): + return int(4) + + def to_bytes(self, item): + return struct.pack('f', item) + + def from_bytes(self, data: bytes, offset: int): + val = struct.unpack_from('f', data, offset)[0] + return (val, 4) + + +class PyDoublesSerDe(PyObjectSerDe): + def get_size(self, item): + return int(8) + + def to_bytes(self, item): + return struct.pack('d', item) + + def from_bytes(self, data: bytes, offset: int): + val = struct.unpack_from('d', data, offset)[0] + return (val, 8) diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py new file mode 100644 index 00000000..6d645c05 --- /dev/null +++ b/python/datasketches/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name = 'datasketches' + +from .PySerDe import * + +from _datasketches import * diff --git a/python/include/py_serde.hpp b/python/include/py_serde.hpp new file mode 100644 index 00000000..3a5bb749 --- /dev/null +++ b/python/include/py_serde.hpp @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include + +#ifndef _PY_SERDE_HPP_ +#define _PY_SERDE_HPP_ + +namespace py = pybind11; + +namespace datasketches { + +/** + * @brief The py_object_serde is an abstract class that implements the + * datasketches serde interface, and is used to allow custom Python + * serialization of items wrapped as generic py::object types. The actual + * Python implementation classes must extend the PyObjectSerDe class. + */ +struct py_object_serde { + /** + * @brief Get the serialized size of an object, in bytes + * + * @param item A provided item + * @return int64_t The serialized size of the item, in bytes + */ + virtual int64_t get_size(const py::object& item) const = 0; + + /** + * @brief Serializes an item to a bytes object + * + * @param item A provided item + * @return The serialized image of the item as a Python bytes object + */ + virtual py::bytes to_bytes(const py::object& item) const = 0; + + /** + * @brief Constructs an object from a serialized image, reading the + * incoming buffer starting at the specified offset. + * + * @param bytes A buffer containing items from a serialized sketch + * @param offset The starting offset into the bytes buffer + * @return A Python tuple of the reconstructed item and the total number of bytes read + */ + virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0; + + virtual ~py_object_serde() = default; + + // these methods are required by the serde interface; see common/include/serde.hpp for + // default implementations for C++ std::string and numeric types. + size_t size_of_item(const py::object& item) const; + size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const; + size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const; +}; + +/** + * @brief The PyObjectSerDe class provides a concrete base class + * that pybind11 uses as a "trampoline" to pass calls through to + * the abstract py_object_serde class. Custom Python serde implementations + * must extend this class. + */ +struct PyObjectSerDe : public py_object_serde { + using py_object_serde::py_object_serde; + + // trampoline definitions -- need one for each virtual function + int64_t get_size(const py::object& item) const override { + PYBIND11_OVERRIDE_PURE( + int64_t, // Return type + py_object_serde, // Parent class + get_size, // Name of function in C++ (must match Python name) + item // Argument(s) + ); + } + + py::bytes to_bytes(const py::object& item) const override { + PYBIND11_OVERRIDE_PURE( + py::bytes, // Return type + py_object_serde, // Parent class + to_bytes, // Name of function in C++ (must match Python name) + item // Argument(s) + ); + } + + py::tuple from_bytes(py::bytes& bytes, size_t offset) const override { + PYBIND11_OVERRIDE_PURE( + py::tuple, // Return type + py_object_serde, // Parent class + from_bytes, // Name of function in C++ (must match Python name) + bytes, offset // Argument(s) + ); + } +}; + +} + +#endif // _PY_SERDE_HPP_ diff --git a/python/pybind11Path.cmd b/python/pybind11Path.cmd index e07f0b56..354c88d3 100644 --- a/python/pybind11Path.cmd +++ b/python/pybind11Path.cmd @@ -1,3 +1,21 @@ +:: Licensed to the Apache Software Foundation (ASF) under one +:: or more contributor license agreements. See the NOTICE file +:: distributed with this work for additional information +:: regarding copyright ownership. The ASF licenses this file +:: to you under the Apache License, Version 2.0 (the +:: "License"); you may not use this file except in compliance +:: with the License. You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, +:: software distributed under the License is distributed on an +:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +:: KIND, either express or implied. See the License for the +:: specific language governing permissions and limitations +:: under the License. + + @echo off :: Takes path to the Python interpreter and returns the path to pybind11 %1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())" \ No newline at end of file diff --git a/python/src/__init__.py b/python/src/__init__.py index 1e1acc20..756e6938 100644 --- a/python/src/__init__.py +++ b/python/src/__init__.py @@ -1,2 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + name = "datasketches" - \ No newline at end of file diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp index 8a39a0c2..b34eea7a 100644 --- a/python/src/datasketches.cpp +++ b/python/src/datasketches.cpp @@ -21,6 +21,7 @@ namespace py = pybind11; +// sketches void init_hll(py::module& m); void init_kll(py::module& m); void init_fi(py::module& m); @@ -29,10 +30,13 @@ void init_theta(py::module& m); void init_vo(py::module& m); void init_req(py::module& m); void init_quantiles(py::module& m); -void init_kolmogorov_smirnov(py::module& m); void init_vector_of_kll(py::module& m); -PYBIND11_MODULE(datasketches, m) { +// supporting objects +void init_kolmogorov_smirnov(py::module& m); +void init_serde(py::module& m); + +PYBIND11_MODULE(_datasketches, m) { init_hll(m); init_kll(m); init_fi(m); @@ -41,6 +45,8 @@ PYBIND11_MODULE(datasketches, m) { init_vo(m); init_req(m); init_quantiles(m); - init_kolmogorov_smirnov(m); init_vector_of_kll(m); + + init_kolmogorov_smirnov(m); + init_serde(m); } diff --git a/python/src/py_serde.cpp b/python/src/py_serde.cpp new file mode 100644 index 00000000..f6e4ef4c --- /dev/null +++ b/python/src/py_serde.cpp @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include "memory_operations.hpp" + +#include "py_serde.hpp" + +#include + +namespace py = pybind11; + +void init_serde(py::module& m) { + py::class_(m, "PyObjectSerDe") + .def(py::init<>()) + .def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"), + "Returns the size in bytes of an item") + .def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"), + "Retuns a bytes object with a serialized version of an item") + .def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"), + "Reads a bytes object starting from the given offest and returns a tuple of the reconstructed " + "object and the number of additional bytes read") + ; +} + +namespace datasketches { + size_t py_object_serde::size_of_item(const py::object& item) const { + return get_size(item); + } + + size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const { + size_t bytes_written = 0; + py::gil_scoped_acquire acquire; + for (unsigned i = 0; i < num; ++i) { + std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes + check_memory_size(bytes_written + bytes.size(), capacity); + memcpy(ptr, bytes.c_str(), bytes.size()); + ptr = static_cast(ptr) + bytes.size(); + bytes_written += bytes.size(); + } + py::gil_scoped_release release; + return bytes_written; + } + + size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const { + size_t bytes_read = 0; + unsigned i = 0; + bool failure = false; + bool error_from_python = false; + py::gil_scoped_acquire acquire; + + // copy data into bytes only once + py::bytes bytes(static_cast(ptr), capacity); + for (; i < num && !failure; ++i) { + py::tuple bytes_and_len; + try { + bytes_and_len = from_bytes(bytes, bytes_read); + } catch (py::error_already_set &e) { + failure = true; + error_from_python = true; + break; + } + + size_t length = py::cast(bytes_and_len[1]); + if (bytes_read + length > capacity) { + bytes_read += length; // use this value to report the error + failure = true; + break; + } + + new (&items[i]) py::object(py::cast(bytes_and_len[0])); + ptr = static_cast(ptr) + length; + bytes_read += length; + } + + if (failure) { + // clean up what we've allocated + for (unsigned j = 0; j < i; ++j) { + items[j].dec_ref(); + } + + if (error_from_python) { + throw py::value_error("Error reading value in from_bytes"); + } else { + // this next call will throw + check_memory_size(bytes_read, capacity); + } + } + + py::gil_scoped_release release; + return bytes_read; + } + + +} // namespace datasketches \ No newline at end of file diff --git a/python/src/vo_wrapper.cpp b/python/src/vo_wrapper.cpp index a8eb0f51..32dfb6c3 100644 --- a/python/src/vo_wrapper.cpp +++ b/python/src/vo_wrapper.cpp @@ -19,16 +19,50 @@ #include "var_opt_sketch.hpp" #include "var_opt_union.hpp" +#include "py_serde.hpp" #include -#include -#include namespace py = pybind11; namespace datasketches { + namespace python { +template +var_opt_sketch vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) { + std::string skStr = skBytes; // implicit cast + return var_opt_sketch::deserialize(skStr.c_str(), skStr.length(), sd); +} + +template +py::object vo_sketch_serialize(const var_opt_sketch& sk, py_object_serde& sd) { + auto serResult = sk.serialize(0, sd); + return py::bytes((char*)serResult.data(), serResult.size()); +} + +template +size_t vo_sketch_size_bytes(const var_opt_sketch& sk, py_object_serde& sd) { + return sk.get_serialized_size_bytes(sd); +} + +template +var_opt_union vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) { + std::string uStr = uBytes; // implicit cast + return var_opt_union::deserialize(uStr.c_str(), uStr.length(), sd); +} + +template +py::object vo_union_serialize(const var_opt_union& u, py_object_serde& sd) { + auto serResult = u.serialize(0, sd); + return py::bytes((char*)serResult.data(), serResult.size()); +} + +template +size_t vo_union_size_bytes(const var_opt_union& u, py_object_serde& sd) { + return u.get_serialized_size_bytes(sd); +} + template py::list vo_sketch_get_samples(const var_opt_sketch& sk) { py::list list; @@ -63,7 +97,6 @@ std::string vo_sketch_to_string(const var_opt_sketch& sk, bool print_items) { // using internal str() method then casting to C++ std::string py::str item_pystr(item.first); std::string item_str = py::cast(item_pystr); - // item.second is guaranteed to be a double ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl; } return ss.str(); @@ -96,17 +129,17 @@ void bind_vo_sketch(py::module &m, const char* name) { .def_property_readonly("num_samples", &var_opt_sketch::get_num_samples, "Returns the number of samples currently in the sketch") .def("get_samples", &dspy::vo_sketch_get_samples, - "Retyrns the set of samples in the sketch") + "Returns the set of samples in the sketch") .def("is_empty", &var_opt_sketch::is_empty, "Returns True if the sketch is empty, otherwise False") .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum, "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well " "as upper and lower bounds on the estimate and the total weight processed by the sketch") - // As of writing, not yet clear how to serialize arbitrary python objects, - // especially in any sort of language-portable way - //.def("get_serialized_size_bytes", &var_opt_sketch::get_serialized_size_bytes) - //.def("serialize", &dspy::vo_sketch_serialize) - //.def_static("deserialize", &dspy::vo_sketch_deserialize) + .def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes, py::arg("serde"), + "Computes the size in bytes needed to serialize the current sketch") + .def("serialize", &dspy::vo_sketch_serialize, py::arg("serde"), "Serialize the var opt sketch using the provided serde") + .def_static("deserialize", &dspy::vo_sketch_deserialize, py::arg("bytes"), py::arg("serde"), + "Constructs a var opt sketch from the given bytes using the provided serde") ; } @@ -126,11 +159,11 @@ void bind_vo_union(py::module &m, const char* name) { "Returns a sketch corresponding to the union result") .def("reset", &var_opt_union::reset, "Resets the union to the empty state") - // As of writing, not yet clear how to serialize arbitrary python objects, - // especially in any sort of language-portable way - //.def("get_serialized_size_bytes", &var_opt_sketch::get_serialized_size_bytes) - //.def("serialize", &dspy::vo_union_serialize) - //.def_static("deserialize", &dspy::vo_union_deserialize) + .def("get_serialized_size_bytes", &dspy::vo_union_size_bytes, py::arg("serde"), + "Computes the size in bytes needed to serialize the current sketch") + .def("serialize", &dspy::vo_union_serialize, py::arg("serde"), "Serialize the var opt union using the provided serde") + .def_static("deserialize", &dspy::vo_union_deserialize, py::arg("bytes"), py::arg("serde"), + "Constructs a var opt union from the given bytes using the provided serde") ; } diff --git a/python/tests/__init__.py b/python/tests/__init__.py index e69de29b..13a83393 100644 --- a/python/tests/__init__.py +++ b/python/tests/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tests/vo_test.py b/python/tests/vo_test.py index 1cc3e3ab..7e71d354 100644 --- a/python/tests/vo_test.py +++ b/python/tests/vo_test.py @@ -16,7 +16,7 @@ # under the License. import unittest -from datasketches import var_opt_sketch, var_opt_union +from datasketches import var_opt_sketch, var_opt_union, PyIntsSerDe, PyStringsSerDe class VoTest(unittest.TestCase): def test_vo_example(self): @@ -97,5 +97,29 @@ def geq_zero(x): # calls to __str__() with parameters. print(result.to_string(True)) + # finally, we can serialize the sketch by providing an + # appropriate serde class. + expected_size = result.get_serialized_size_bytes(PyIntsSerDe()) + b = result.serialize(PyIntsSerDe()) + self.assertEqual(expected_size, len(b)) + + # if we try to deserialize with the wrong serde, things break + try: + var_opt_sketch.deserialize(b, PyStringsSerDe()) + self.fail() + except: + # expected; do nothing + self.assertTrue(True) + + # using the correct serde gives us back a copy of the original + rebuilt = var_opt_sketch.deserialize(b, PyIntsSerDe()) + self.assertEqual(result.k, rebuilt.k) + self.assertEqual(result.num_samples, rebuilt.num_samples) + self.assertEqual(result.n, rebuilt.n) + summary1 = result.estimate_subset_sum(geq_zero) + summary2 = rebuilt.estimate_subset_sum(geq_zero) + self.assertEqual(summary1['estimate'], summary2['estimate']) + self.assertEqual(summary1['total_sketch_weight'], summary2['total_sketch_weight']) + if __name__ == '__main__': unittest.main() diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp index 88d5d6dc..65d43f13 100644 --- a/sampling/include/var_opt_sketch.hpp +++ b/sampling/include/var_opt_sketch.hpp @@ -305,8 +305,7 @@ class var_opt_sketch { friend class var_opt_union; var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n); - var_opt_sketch(T* data, double* weights, size_t len, uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r, const A& allocator); - + string items_to_string(bool print_gap) const; // internal-use-only update diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp index 7ad13396..d245baa9 100644 --- a/sampling/include/var_opt_sketch_impl.hpp +++ b/sampling/include/var_opt_sketch_impl.hpp @@ -120,25 +120,6 @@ var_opt_sketch::var_opt_sketch(const var_opt_sketch& other, bool as_sketc } } -template -var_opt_sketch::var_opt_sketch(T* data, double* weights, size_t len, - uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count, double total_wt_r, const A& allocator) : - k_(k), - h_(h_count), - m_(0), - r_(r_count), - n_(n), - total_wt_r_(total_wt_r), - rf_(var_opt_constants::DEFAULT_RESIZE_FACTOR), - curr_items_alloc_(len), - filled_data_(n > k), - allocator_(allocator), - data_(data), - weights_(weights), - num_marks_in_h_(0), - marks_(nullptr) - {} - template var_opt_sketch::var_opt_sketch(var_opt_sketch&& other) noexcept : k_(other.k_), @@ -875,7 +856,8 @@ void var_opt_sketch::update_light(O&& item, double weight, bool mark) { const uint32_t m_slot = h_; // index of the gap, which becomes the M region if (filled_data_) { - data_[m_slot] = std::forward(item); + if (&data_[m_slot] != &item) + data_[m_slot] = std::forward(item); } else { new (&data_[m_slot]) T(std::forward(item)); filled_data_ = true; @@ -952,9 +934,10 @@ void var_opt_sketch::decrease_k_by_1() { // first, slide the R zone to the left by 1, temporarily filling the gap const uint32_t old_gap_idx = h_; const uint32_t old_final_r_idx = (h_ + 1 + r_) - 1; - //if (old_final_r_idx != k_) throw std::logic_error("gadget in invalid state"); + if (old_final_r_idx != k_) throw std::logic_error("gadget in invalid state"); swap_values(old_final_r_idx, old_gap_idx); + filled_data_ = true; // we just filled the gap, and no need to check previous state // now we pull an item out of H; any item is ok, but if we grab the rightmost and then // reduce h_, the heap invariant will be preserved (and the gap will be restored), plus @@ -1124,7 +1107,8 @@ template template void var_opt_sketch::push(O&& item, double wt, bool mark) { if (filled_data_) { - data_[h_] = std::forward(item); + if (&data_[h_] != &item) + data_[h_] = std::forward(item); } else { new (&data_[h_]) T(std::forward(item)); filled_data_ = true; @@ -1225,9 +1209,8 @@ void var_opt_sketch::downsample_candidate_set(double wt_cands, uint32_t n weights_[j] = -1.0; } - // The next two lines work even when delete_slot == leftmost_cand_slot + // The next line works even when delete_slot == leftmost_cand_slot data_[delete_slot] = std::move(data_[leftmost_cand_slot]); - // cannot set data_[leftmost_cand_slot] to null since not uisng T* m_ = 0; r_ = num_cands - 1; @@ -1704,7 +1687,7 @@ bool var_opt_sketch::iterator::get_mark() const { */ template uint32_t var_opt_sketch::get_adjusted_size(uint32_t max_size, uint32_t resize_target) { - if (max_size - (resize_target << 1) < 0L) { + if (max_size < (resize_target << 1)) { return max_size; } return resize_target; diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp index 0a55cba4..eafeda08 100644 --- a/sampling/include/var_opt_union.hpp +++ b/sampling/include/var_opt_union.hpp @@ -171,7 +171,6 @@ class var_opt_union { */ string to_string() const; - private: typedef typename std::allocator_traits::template rebind_alloc> AllocSketch;