Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/include/memory_operations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <memory>
#include <exception>
#include <iostream>
#include <string>

namespace datasketches {

Expand Down
9 changes: 8 additions & 1 deletion python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ target_link_libraries(python

set_target_properties(python PROPERTIES
PREFIX ""
OUTPUT_NAME datasketches
OUTPUT_NAME _datasketches
)

target_include_directories(python
PUBLIC
$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)

# ensure we make a .so on Mac rather than .dylib
Expand All @@ -71,4 +77,5 @@ target_sources(python
src/quantiles_wrapper.cpp
src/ks_wrapper.cpp
src/vector_of_kll.cpp
src/py_serde.cpp
)
104 changes: 104 additions & 0 deletions python/datasketches/PySerDe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from _datasketches import PyObjectSerDe

import struct

# This file provides several Python SerDe implementation examples.
#
# Each implementation must extend the PyObjectSerDe class and define
# three methods:
# * get_size(item) returns an int of the number of bytes needed to
# serialize the given item
# * to_bytes(item) returns a bytes object representing a serialized
# version of the given item
# * from_bytes(data, offset) takes a bytes object (data) and an offset
# indicating where in the data array to start reading. The method
# returns a tuple with the newly reconstructed object and the
# total number of bytes beyond the offset read from the input data.

# Implements a simple string-encoding scheme where a string is
# written as <num_bytes> <string_contents>, with no null termination.
# This format allows pre-allocating each string, at the cost of
# additional storage. Using this format, the serialized string consumes
# 4 + len(item) bytes.
class PyStringsSerDe(PyObjectSerDe):
  """SerDe for str items.

  Each string is stored as a 4-byte little-endian unsigned length followed
  by that many bytes of UTF-8 encoded text, with no null terminator.
  """

  def get_size(self, item):
    """Return the number of bytes needed to serialize item.

    The size is measured on the UTF-8 encoding, so it matches what
    to_bytes() emits even when item contains non-ASCII characters.
    """
    return int(4 + len(item.encode('utf-8')))

  def to_bytes(self, item: str):
    """Return item serialized as <4-byte length><UTF-8 payload>."""
    payload = item.encode('utf-8')
    # The prefix counts payload bytes (not characters) so that
    # from_bytes() can slice the buffer directly.
    return len(payload).to_bytes(4, 'little') + payload

  def from_bytes(self, data: bytes, offset: int):
    """Rebuild a string from data, starting at offset.

    Returns a tuple (string, bytes_consumed), where bytes_consumed counts
    the 4-byte length prefix plus the payload.

    Raises IndexError if the buffer is too short to hold the length
    prefix or the payload that the prefix announces.
    """
    payload_start = offset + 4
    if payload_start > len(data):
      raise IndexError(f'buffer too short to read a 4-byte length at offset {offset}')
    # Read all four bytes of the prefix written by to_bytes().
    num_chars = int.from_bytes(data[offset:payload_start], 'little')
    # The payload must fit in what remains after the prefix.
    if num_chars > len(data) - payload_start:
      raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}')
    value = data[payload_start:payload_start+num_chars].decode('utf-8')
    return (value, 4+num_chars)

# Implements an integer-encoding scheme where each integer is written
# as a 32-bit (4 byte) little-endian value.
class PyIntsSerDe(PyObjectSerDe):
  """SerDe for integers stored as signed 32-bit little-endian values."""

  def get_size(self, item):
    """Return the fixed serialized size of an item: 4 bytes."""
    return int(4)

  def to_bytes(self, item):
    """Return item packed as a 4-byte little-endian signed integer.

    struct.pack raises struct.error if item does not fit in 32 bits.
    """
    # '<' forces little-endian, standard-size packing on every platform;
    # a bare 'i' would follow the host's native byte order.
    return struct.pack('<i', item)

  def from_bytes(self, data: bytes, offset: int):
    """Read one integer from data at offset.

    Returns a tuple (value, 4), where 4 is the number of bytes consumed.
    """
    val = struct.unpack_from('<i', data, offset)[0]
    return (val, 4)


class PyLongsSerDe(PyObjectSerDe):
  """SerDe for integers stored as signed 64-bit little-endian values."""

  def get_size(self, item):
    """Return the fixed serialized size of an item: 8 bytes."""
    return int(8)

  def to_bytes(self, item):
    """Return item packed as an 8-byte little-endian signed integer.

    struct.pack raises struct.error if item does not fit in 64 bits.
    """
    # '<q' is always 8 bytes. The native 'l' code is only 4 bytes on
    # some platforms (e.g. Windows), which would disagree with get_size().
    return struct.pack('<q', item)

  def from_bytes(self, data: bytes, offset: int):
    """Read one integer from data at offset.

    Returns a tuple (value, 8), where 8 is the number of bytes consumed.
    """
    val = struct.unpack_from('<q', data, offset)[0]
    return (val, 8)


class PyFloatsSerDe(PyObjectSerDe):
  """SerDe for floats stored as 32-bit little-endian IEEE 754 values."""

  def get_size(self, item):
    """Return the fixed serialized size of an item: 4 bytes."""
    return int(4)

  def to_bytes(self, item):
    """Return item packed as a 4-byte little-endian single-precision float.

    Python floats are double precision, so values are rounded to the
    nearest representable 32-bit float when packed.
    """
    # '<' pins the byte order so the image does not depend on the host.
    return struct.pack('<f', item)

  def from_bytes(self, data: bytes, offset: int):
    """Read one float from data at offset.

    Returns a tuple (value, 4), where 4 is the number of bytes consumed.
    """
    val = struct.unpack_from('<f', data, offset)[0]
    return (val, 4)


class PyDoublesSerDe(PyObjectSerDe):
  """SerDe for floats stored as 64-bit little-endian IEEE 754 values."""

  def get_size(self, item):
    """Return the fixed serialized size of an item: 8 bytes."""
    return int(8)

  def to_bytes(self, item):
    """Return item packed as an 8-byte little-endian double-precision float."""
    # '<' pins the byte order so the image does not depend on the host.
    return struct.pack('<d', item)

  def from_bytes(self, data: bytes, offset: int):
    """Read one double from data at offset.

    Returns a tuple (value, 8), where 8 is the number of bytes consumed.
    """
    val = struct.unpack_from('<d', data, offset)[0]
    return (val, 8)
22 changes: 22 additions & 0 deletions python/datasketches/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

name = 'datasketches'

from .PySerDe import *

from _datasketches import *
113 changes: 113 additions & 0 deletions python/include/py_serde.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <pybind11/pybind11.h>
#include <pybind11/functional.h>
#include <sstream>

#ifndef _PY_SERDE_HPP_
#define _PY_SERDE_HPP_

namespace py = pybind11;

namespace datasketches {

/**
* @brief The py_object_serde is an abstract class that implements the
* datasketches serde interface, and is used to allow custom Python
* serialization of items wrapped as generic py::object types. The actual
* Python implementation classes must extend the PyObjectSerDe class.
*/
struct py_object_serde {
  // Virtual destructor: this type is used as a polymorphic base.
  virtual ~py_object_serde() = default;

  /**
   * @brief Reports how many bytes an item occupies once serialized.
   *
   * @param item The item to measure
   * @return int64_t Size of the item's serialized form, in bytes
   */
  virtual int64_t get_size(const py::object& item) const = 0;

  /**
   * @brief Produces the serialized image of a single item.
   *
   * @param item The item to serialize
   * @return A Python bytes object holding the serialized item
   */
  virtual py::bytes to_bytes(const py::object& item) const = 0;

  /**
   * @brief Rebuilds one item from a serialized buffer, beginning at
   * the given position in that buffer.
   *
   * @param bytes Buffer holding serialized items from a sketch
   * @param offset Position in bytes at which reading starts
   * @return A Python tuple: (reconstructed item, number of bytes read)
   */
  virtual py::tuple from_bytes(py::bytes& bytes, size_t offset) const = 0;

  // Entry points demanded by the datasketches serde interface. Only the
  // declarations live in this header. For the default C++ std::string and
  // numeric implementations of the same interface, see
  // common/include/serde.hpp.
  size_t size_of_item(const py::object& item) const;
  size_t serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const;
  size_t deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const;
};

/**
* @brief The PyObjectSerDe class provides a concrete base class
* that pybind11 uses as a "trampoline" to pass calls through to
* the abstract py_object_serde class. Custom Python serde implementations
* must extend this class.
*/
struct PyObjectSerDe : public py_object_serde {
  using py_object_serde::py_object_serde;

  // One trampoline per pure virtual function in py_object_serde. The
  // arguments to PYBIND11_OVERRIDE_PURE are, in order: return type,
  // parent class, function name (must match the Python method name),
  // and then the call arguments.

  // Dispatches get_size(item) to the Python override.
  int64_t get_size(const py::object& item) const override {
    PYBIND11_OVERRIDE_PURE(int64_t, py_object_serde, get_size, item);
  }

  // Dispatches to_bytes(item) to the Python override.
  py::bytes to_bytes(const py::object& item) const override {
    PYBIND11_OVERRIDE_PURE(py::bytes, py_object_serde, to_bytes, item);
  }

  // Dispatches from_bytes(bytes, offset) to the Python override.
  py::tuple from_bytes(py::bytes& bytes, size_t offset) const override {
    PYBIND11_OVERRIDE_PURE(py::tuple, py_object_serde, from_bytes, bytes, offset);
  }
};

}

#endif // _PY_SERDE_HPP_
18 changes: 18 additions & 0 deletions python/pybind11Path.cmd
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
:: Licensed to the Apache Software Foundation (ASF) under one
:: or more contributor license agreements. See the NOTICE file
:: distributed with this work for additional information
:: regarding copyright ownership. The ASF licenses this file
:: to you under the Apache License, Version 2.0 (the
:: "License"); you may not use this file except in compliance
:: with the License. You may obtain a copy of the License at
::
:: http://www.apache.org/licenses/LICENSE-2.0
::
:: Unless required by applicable law or agreed to in writing,
:: software distributed under the License is distributed on an
:: "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
:: KIND, either express or implied. See the License for the
:: specific language governing permissions and limitations
:: under the License.


@echo off
:: Takes path to the Python interpreter and returns the path to pybind11
:: Usage: pybind11Path.cmd <python-executable>
:: %1 is the first command-line argument. It is run with an inline script that
:: imports pybind11 and writes pybind11.get_cmake_dir() to stdout without a
:: trailing newline.
%1 -c "import pybind11,sys;sys.stdout.write(pybind11.get_cmake_dir())"
18 changes: 17 additions & 1 deletion python/src/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

name = "datasketches"

12 changes: 9 additions & 3 deletions python/src/datasketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

namespace py = pybind11;

// sketches
void init_hll(py::module& m);
void init_kll(py::module& m);
void init_fi(py::module& m);
Expand All @@ -29,10 +30,13 @@ void init_theta(py::module& m);
void init_vo(py::module& m);
void init_req(py::module& m);
void init_quantiles(py::module& m);
void init_kolmogorov_smirnov(py::module& m);
void init_vector_of_kll(py::module& m);

PYBIND11_MODULE(datasketches, m) {
// supporting objects
void init_kolmogorov_smirnov(py::module& m);
void init_serde(py::module& m);

PYBIND11_MODULE(_datasketches, m) {
init_hll(m);
init_kll(m);
init_fi(m);
Expand All @@ -41,6 +45,8 @@ PYBIND11_MODULE(datasketches, m) {
init_vo(m);
init_req(m);
init_quantiles(m);
init_kolmogorov_smirnov(m);
init_vector_of_kll(m);

init_kolmogorov_smirnov(m);
init_serde(m);
}
Loading