Skip to content
Permalink
Browse files
Merge pull request #275 from apache/ks_python
Add KS Test to python, updating docs where appropriate
  • Loading branch information
jmalkin committed May 10, 2022
2 parents b78b208 + 8d8ef2b commit 38d50385af1b3bfa48ad36ebb2db41de76d06a65
Showing 11 changed files with 120 additions and 17 deletions.
@@ -25,7 +25,8 @@ namespace datasketches {
class kolmogorov_smirnov {
public:
/**
* Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
* Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test.
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* @param sketch1 first sketch (KLL or Quantiles)
* @param sketch2 second sketch, of the same type as sketch1
* @return the raw delta between the two quantile sketches
@@ -37,6 +38,7 @@ class kolmogorov_smirnov {
* Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
* Adjusts the computed threshold by the error epsilons of the two given sketches.
* See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* @param sketch1 first sketch (KLL or Quantiles)
* @param sketch2 second sketch, of the same type as sketch1
* @param p Target p-value. Typically .001 to .1, e.g., .05.
@@ -46,7 +48,8 @@ class kolmogorov_smirnov {
static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);

/**
* Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
* Performs the Kolmogorov-Smirnov Test between two quantile sketches.
* Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T.
* Note: if the given sketches have insufficient data or if the sketch sizes are too small,
* this will return false.
* @param sketch1 first sketch (KLL or Quantiles)
@@ -57,7 +60,6 @@ class kolmogorov_smirnov {
*/
template<typename Sketch>
static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);

};

} /* namespace datasketches */
@@ -20,6 +20,9 @@
#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_

#include <cmath>
#include <algorithm>

namespace datasketches {

template<typename Sketch>
@@ -69,5 +69,6 @@ target_sources(python
src/vo_wrapper.cpp
src/req_wrapper.cpp
src/quantiles_wrapper.cpp
src/ks_wrapper.cpp
src/vector_of_kll.cpp
)
@@ -27,6 +27,11 @@ Having installed the library, loading the Apache Datasketches Library in Python
- KLL (Absolute Error Quantiles)
- `kll_ints_sketch`
- `kll_floats_sketch`
- `kll_doubles_sketch`
- Quantiles (Absolute Error Quantiles, inferior algorithm)
- `quantiles_ints_sketch`
- `quantiles_floats_sketch`
- `quantiles_doubles_sketch`
- REQ (Relative Error Quantiles)
- `req_ints_sketch`
- `req_floats_sketch`
@@ -52,6 +57,8 @@ Having installed the library, loading the Apache Datasketches Library in Python
- Vector of KLL
- `vector_of_kll_ints_sketches`
- `vector_of_kll_floats_sketches`
- Kolmogorov-Smirnov Test
- `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches

## Known Differences from C++

@@ -29,6 +29,7 @@ void init_theta(py::module& m);
void init_vo(py::module& m);
void init_req(py::module& m);
void init_quantiles(py::module& m);
void init_kolmogorov_smirnov(py::module& m);
void init_vector_of_kll(py::module& m);

PYBIND11_MODULE(datasketches, m) {
@@ -40,5 +41,6 @@ PYBIND11_MODULE(datasketches, m) {
init_vo(m);
init_req(m);
init_quantiles(m);
init_kolmogorov_smirnov(m);
init_vector_of_kll(m);
}
@@ -242,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
// Entry point for the KLL bindings: instantiates bind_kll_sketch once per
// supported item type (int, float, double), passing the module and the name
// under which each sketch class is exposed to Python.
void init_kll(py::module &m) {
bind_kll_sketch<int>(m, "kll_ints_sketch");
bind_kll_sketch<float>(m, "kll_floats_sketch");
bind_kll_sketch<double>(m, "kll_doubles_sketch");
}
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "kolmogorov_smirnov.hpp"
#include "kll_sketch.hpp"
#include "quantiles_sketch.hpp"

#include <pybind11/pybind11.h>

namespace py = pybind11;

void init_kolmogorov_smirnov(py::module &m) {
using namespace datasketches;

m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
"Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
"this will return false.\n"
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
"distribution) using the provided p-value, otherwise False.");
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
"Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
"this will return false.\n"
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
"distribution) using the provided p-value, otherwise False.");
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
"Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
"this will return false.\n"
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
"distribution) using the provided p-value, otherwise False.");

m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
"Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
"this will return false.\n"
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
"distribution) using the provided p-value, otherwise False.");
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
"Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
"this will return false.\n"
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
"distribution) using the provided p-value, otherwise False.");
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
"Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
"this will return false.\n"
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
"distribution) using the provided p-value, otherwise False.");
}
@@ -16,7 +16,7 @@
# under the License.

import unittest
from datasketches import kll_ints_sketch, kll_floats_sketch
from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch, ks_test
import numpy as np

class KllTest(unittest.TestCase):
@@ -73,6 +73,12 @@ def test_kll_example(self):
self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0))

# A Kolmogorov-Smirnov Test of kll and new_kll should match, even for
# a fairly small p-value -- cannot reject the null hypothesis that
# they come from the same distribution (since they do)
self.assertFalse(ks_test(kll, new_kll, 0.001))


def test_kll_ints_sketch(self):
k = 100
n = 10
@@ -109,10 +115,10 @@ def test_kll_ints_sketch(self):
sk_bytes = kll.serialize()
self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))

def test_kll_floats_sketch(self):
# already tested ints and it's templatized, so just make sure it instantiates properly
def test_kll_doubles_sketch(self):
# already tested float and ints and it's templatized, so just make sure it instantiates properly
k = 75
kll = kll_floats_sketch(k)
kll = kll_doubles_sketch(k)
self.assertTrue(kll.is_empty())

if __name__ == '__main__':
@@ -16,7 +16,7 @@
# under the License.

import unittest
from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch, ks_test
import numpy as np

class QuantilesTest(unittest.TestCase):
@@ -73,6 +73,13 @@ def test_quantiles_example(self):
self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0))

# If we create a new sketch with a very different distribution, a Kolmogorov-Smirnov Test
# of the two should return True: we can reject the null hypothesis that the sketches
# come from the same distribution.
unif_quantiles = quantiles_floats_sketch(k)
unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))

def test_quantiles_ints_sketch(self):
k = 128
n = 10
@@ -109,14 +116,8 @@ def test_quantiles_ints_sketch(self):
sk_bytes = quantiles.serialize()
self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch))

def test_quantiles_floats_sketch(self):
# already tested ints and it's templatized, so just make sure it instantiates properly
k = 256
quantiles = quantiles_floats_sketch(k)
self.assertTrue(quantiles.is_empty())

def test_quantiles_doubles_sketch(self):
# already tested ints and it's templatized, so just make sure it instantiates properly
# already tested floats and ints and it's templatized, so just make sure it instantiates properly
k = 128
quantiles = quantiles_doubles_sketch(k)
self.assertTrue(quantiles.is_empty())
@@ -36,10 +36,11 @@ template<
>
class req_sketch {
public:
using value_type = T;
using comparator = Comparator;
using Compactor = req_compactor<T, Comparator, Allocator>;
using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
using AllocDouble = typename std::allocator_traits<Allocator>::template rebind_alloc<double>;
using vector_double = std::vector<double, AllocDouble>;
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;

/**
* Constructor
@@ -115,6 +116,12 @@ class req_sketch {
*/
const T& get_max_value() const;

/**
* Returns an instance of the comparator for this sketch.
* @return comparator
*/
Comparator get_comparator() const;

/**
* Returns an approximation to the normalized (fractional) rank of the given item from 0 to 1 inclusive.
* With the template parameter inclusive=true the weight of the given item is included into the rank.
@@ -196,6 +196,11 @@ const T& req_sketch<T, C, S, A>::get_max_value() const {
return *max_value_;
}

// Returns a default-constructed instance of the comparator type C.
// NOTE(review): no sketch state is read here, so the result is always C();
// if stateful comparators are ever supported, confirm that this is still
// equivalent to the comparator the sketch actually uses.
template<typename T, typename C, typename S, typename A>
C req_sketch<T, C, S, A>::get_comparator() const {
return C();
}

template<typename T, typename C, typename S, typename A>
template<bool inclusive>
double req_sketch<T, C, S, A>::get_rank(const T& item) const {

0 comments on commit 38d5038

Please sign in to comment.