Skip to content
Permalink
Browse files
clone and minorly adapt python kll testing to classic quantiles
  • Loading branch information
jmalkin committed Apr 26, 2022
1 parent 9e224d7 commit 64272ddfbca72dd608b945859cab5254fe4a7c2d
Showing 3 changed files with 137 additions and 10 deletions.
@@ -28,6 +28,7 @@ void init_cpc(py::module& m);
void init_theta(py::module& m);
void init_vo(py::module& m);
void init_req(py::module& m);
void init_quantiles(py::module& m);
void init_vector_of_kll(py::module& m);

PYBIND11_MODULE(datasketches, m) {
@@ -38,5 +39,6 @@ PYBIND11_MODULE(datasketches, m) {
init_theta(m);
init_vo(m);
init_req(m);
init_quantiles(m);
init_vector_of_kll(m);
}
@@ -51,8 +51,8 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {

template<typename T>
double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
const T& item,
bool inclusive) {
const T& item,
bool inclusive) {
if (inclusive)
return sk.template get_rank<true>(item);
else
@@ -61,8 +61,8 @@ double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,

template<typename T>
T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
double rank,
bool inclusive) {
double rank,
bool inclusive) {
if (inclusive)
return T(sk.template get_quantile<true>(rank));
else
@@ -71,8 +71,8 @@ T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,

template<typename T>
py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
std::vector<double>& fractions,
bool inclusive) {
std::vector<double>& fractions,
bool inclusive) {
size_t n_quantiles = fractions.size();
auto result = inclusive
? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
@@ -89,8 +89,8 @@ py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,

template<typename T>
py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
std::vector<T>& split_points,
bool inclusive) {
std::vector<T>& split_points,
bool inclusive) {
size_t n_points = split_points.size();
auto result = inclusive
? sk.template get_PMF<true>(&split_points[0], n_points)
@@ -106,8 +106,8 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,

template<typename T>
py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
std::vector<T>& split_points,
bool inclusive) {
std::vector<T>& split_points,
bool inclusive) {
size_t n_points = split_points.size();
auto result = inclusive
? sk.template get_CDF<true>(&split_points[0], n_points)
@@ -0,0 +1,125 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import unittest
from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
import numpy as np

class QuantilesTest(unittest.TestCase):
def test_quantiles_example(self):
k = 128
n = 2 ** 20

# create a sketch and inject ~1 million N(0,1) points as an array and as a single item
quantiles = quantiles_floats_sketch(k)
quantiles.update(np.random.normal(size=n-1))
quantiles.update(0.0)

# 0 should be near the median
self.assertAlmostEqual(0.5, quantiles.get_rank(0.0), delta=0.035)

# the median should be near 0
self.assertAlmostEqual(0.0, quantiles.get_quantile(0.5), delta=0.035)

# we also track the min/max independently from the rest of the data
# which lets us know the full observed data range
self.assertLessEqual(quantiles.get_min_value(), quantiles.get_quantile(0.01))
self.assertLessEqual(0.0, quantiles.get_rank(quantiles.get_min_value()))
self.assertGreaterEqual(quantiles.get_max_value(), quantiles.get_quantile(0.99))
self.assertGreaterEqual(1.0, quantiles.get_rank(quantiles.get_max_value()))

# we can also extract a list of values at a time,
# here the values should give us something close to [-2, -1, 0, 1, 2].
# then get the CDF, which will return something close to
# the original values used in get_quantiles()
# finally, can check the normalized rank error bound
pts = quantiles.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
cdf = quantiles.get_cdf(pts) # include 1.0 at end to account for all probability mass
self.assertEqual(len(cdf), len(pts)+1)
err = quantiles.normalized_rank_error(False)
self.assertEqual(err, quantiles_floats_sketch.get_normalized_rank_error(k, False))

# and a few basic queries about the sketch
self.assertFalse(quantiles.is_empty())
self.assertTrue(quantiles.is_estimation_mode())
self.assertEqual(quantiles.get_n(), n)
self.assertEqual(quantiles.get_k(), k)
self.assertLess(quantiles.get_num_retained(), n)

# merging itself will double the number of items the sketch has seen
quantiles.merge(quantiles)
self.assertEqual(quantiles.get_n(), 2*n)

# we can then serialize and reconstruct the sketch
quantiles_bytes = quantiles.serialize()
new_quantiles = quantiles.deserialize(quantiles_bytes)
self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0))

def test_quantiles_ints_sketch(self):
k = 128
n = 10
quantiles = quantiles_ints_sketch(k)
for i in range(0, n):
quantiles.update(i)

self.assertEqual(quantiles.get_min_value(), 0)
self.assertEqual(quantiles.get_max_value(), n-1)
self.assertEqual(quantiles.get_n(), n)
self.assertFalse(quantiles.is_empty())
self.assertFalse(quantiles.is_estimation_mode()) # n < k
self.assertEqual(quantiles.get_k(), k)

pmf = quantiles.get_pmf([round(n/2)])
self.assertIsNotNone(pmf)
self.assertEqual(len(pmf), 2)

cdf = quantiles.get_cdf([round(n/2)])
self.assertIsNotNone(cdf)
self.assertEqual(len(cdf), 2)

self.assertEqual(quantiles.get_quantile(0.5), round(n/2))
quants = quantiles.get_quantiles([0.25, 0.5, 0.75])
self.assertIsNotNone(quants)
self.assertEqual(len(quants), 3)

self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)

# merge self
quantiles.merge(quantiles)
self.assertEqual(quantiles.get_n(), 2 * n)

sk_bytes = quantiles.serialize()
self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch))

def test_quantiles_floats_sketch(self):
# already tested ints and it's templatized, so just make sure it instantiates properly
k = 256
quantiles = quantiles_floats_sketch(k)
self.assertTrue(quantiles.is_empty())

def test_quantiles_doubles_sketch(self):
# already tested ints and it's templatized, so just make sure it instantiates properly
k = 128
quantiles = quantiles_doubles_sketch(k)
self.assertTrue(quantiles.is_empty())

if __name__ == '__main__':
unittest.main()

0 comments on commit 64272dd

Please sign in to comment.