Skip to content
Permalink
Browse files
common Kolmogorov-Smirnov test, some minor fixes
  • Loading branch information
AlexanderSaydakov committed May 6, 2022
1 parent c7155bb commit 92431faaa4fa0c533a6682b68a30fca840043460
Showing 8 changed files with 185 additions and 46 deletions.
File renamed without changes.
@@ -24,7 +24,7 @@ namespace datasketches {

template<typename Sketch>
double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
using Comparator = typename Sketch::comparator;
auto comparator = sketch1.get_comparator(); // assuming the same comparator in sketch2
auto view1 = sketch1.get_sorted_view(true);
auto view2 = sketch2.get_sorted_view(true);
auto it1 = view1.begin();
@@ -36,9 +36,9 @@ double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
if (Comparator()((*it1).first, (*it2).first)) {
if (comparator((*it1).first, (*it2).first)) {
++it1;
} else if (Comparator()((*it2).first, (*it1).first)) {
} else if (comparator((*it2).first, (*it1).first)) {
++it2;
} else {
++it1;
@@ -242,6 +242,12 @@ class kll_sketch {
*/
T get_max_value() const;

/**
* Returns an instance of the comparator for this sketch.
* The comparator is returned by value, so the caller receives its own copy.
* @return comparator object of type C (the comparator template parameter of the sketch)
*/
C get_comparator() const;

/**
* Returns an approximation to the value of the data item
* that would be preceded by the given fraction of a hypothetical sorted
@@ -384,6 +390,7 @@ class kll_sketch {
/**
* Computes size needed to serialize the current state of the sketch.
* This version is for fixed-size arithmetic types (integral and floating point).
* @param instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
@@ -429,7 +436,6 @@ class kll_sketch {
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
* @param sd instance of a SerDe
*
*/
template<typename SerDe = S>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
@@ -445,6 +451,7 @@ class kll_sketch {
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
* @param sd instance of a SerDe
* @return serialized sketch as a vector of bytes
*/
template<typename SerDe = S>
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
@@ -457,7 +464,7 @@ class kll_sketch {
*
* Deprecated, to be removed in the next major version
*/
static kll_sketch<T, C, S, A> deserialize(std::istream& is, const A& allocator = A());
static kll_sketch deserialize(std::istream& is, const A& allocator = A());

/**
* This method deserializes a sketch from a given stream.
@@ -467,7 +474,7 @@ class kll_sketch {
* @return an instance of a sketch
*/
template<typename SerDe = S>
static kll_sketch<T, C, S, A> deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());

/**
* This method deserializes a sketch from a given array of bytes.
@@ -478,7 +485,7 @@ class kll_sketch {
*
* Deprecated, to be removed in the next major version
*/
static kll_sketch<T, C, S, A> deserialize(const void* bytes, size_t size, const A& allocator = A());
static kll_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());

/**
* This method deserializes a sketch from a given array of bytes.
@@ -489,7 +496,7 @@ class kll_sketch {
* @return an instance of a sketch
*/
template<typename SerDe = S>
static kll_sketch<T, C, S, A> deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());

/*
* Gets the normalized rank error given k and pmf.
@@ -522,7 +529,7 @@ class kll_sketch {

private:
/* Serialized sketch layout:
* Adr:
* Addr:
* || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
* 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts |
* || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
@@ -237,6 +237,11 @@ T kll_sketch<T, C, S, A>::get_max_value() const {
return *max_value_;
}

// Returns a comparator object for this sketch.
// The result is a freshly value-initialized C; no state of the sketch is read here.
template<typename T, typename C, typename S, typename A>
C kll_sketch<T, C, S, A>::get_comparator() const {
return C();
}

template<typename T, typename C, typename S, typename A>
template<bool inclusive>
auto kll_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
@@ -1013,7 +1018,6 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
os << " Capacity items : " << items_size_ << std::endl;
os << " Retained items : " << get_num_retained() << std::endl;
os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
if (!is_empty()) {
os << " Min value : " << *min_value_ << std::endl;
os << " Max value : " << *max_value_ << std::endl;
@@ -146,14 +146,13 @@ namespace quantiles_constants {
}

template <typename T,
typename Comparator = std::less<T>,
typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
typename Allocator = std::allocator<T>>
class quantiles_sketch {
public:
using C = Comparator;
using A = Allocator;
using AllocDouble = typename std::allocator_traits<Allocator>::template rebind_alloc<double>;
using vector_double = std::vector<double, AllocDouble>;
using value_type = T;
using comparator = Comparator;
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;

explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K, const Allocator& allocator = Allocator());
quantiles_sketch(const quantiles_sketch& other);
@@ -222,6 +221,12 @@ class quantiles_sketch {
*/
const T& get_max_value() const;

/**
* Returns an instance of the comparator for this sketch.
* The comparator is returned by value, so the caller receives its own copy.
* @return comparator object of type Comparator (the comparator template parameter of the sketch)
*/
Comparator get_comparator() const;

/**
* Returns an approximation to the value of the data item
* that would be preceded by the given fraction of a hypothetical sorted
@@ -241,7 +246,7 @@ class quantiles_sketch {
*
* @return the approximation to the value at the given rank
*/
using quantile_return_type = typename quantile_sketch_sorted_view<T, C, A>::quantile_return_type;
using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
template<bool inclusive = false>
quantile_return_type get_quantile(double rank) const;

@@ -361,14 +366,16 @@ class quantiles_sketch {
/**
* Computes size needed to serialize the current state of the sketch.
* This version is for fixed-size arithmetic types (integral and floating point).
* @param serde instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
size_t get_serialized_size_bytes() const;
template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;

/**
* Computes size needed to serialize the current state of the sketch.
* This version is for all other types and can be expensive since every item needs to be looked at.
* @param instance of a SerDe
* @return size in bytes needed to serialize this sketch
*/
template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
@@ -377,6 +384,7 @@ class quantiles_sketch {
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
* @param serde instance of a SerDe
*/
template<typename SerDe = serde<T>>
void serialize(std::ostream& os, const SerDe& serde = SerDe()) const;
@@ -391,26 +399,32 @@ class quantiles_sketch {
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
* @param serde instance of a SerDe
* @return serialized sketch as a vector of bytes
*/
template<typename SerDe = serde<T>>
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& serde = SerDe()) const;

/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param serde instance of a SerDe
* @param allocator instance of an Allocator
* @return an instance of a sketch
*/
template<typename SerDe = serde<T>>
static quantiles_sketch<T, C, A> deserialize(std::istream& is, const SerDe& serde = SerDe(), const A& allocator = A());
static quantiles_sketch deserialize(std::istream& is, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());

/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param serde instance of a SerDe
* @param allocator instance of an Allocator
* @return an instance of a sketch
*/
template<typename SerDe = serde<T>>
static quantiles_sketch<T, C, A> deserialize(const void* bytes, size_t size, const SerDe& serde = SerDe(), const A& allocator = A());
static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());

/**
* Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile
@@ -436,23 +450,22 @@ class quantiles_sketch {
* @param print_levels if true include information about levels
* @param print_items if true include sketch data
*/
string<A> to_string(bool print_levels = false, bool print_items = false) const;
string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;

class const_iterator;
const_iterator begin() const;
const_iterator end() const;

template<bool inclusive = false>
quantile_sketch_sorted_view<T, C, A> get_sorted_view(bool cumulative) const;
quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;

private:
using Level = std::vector<T, Allocator>;
using AllocLevel = typename std::allocator_traits<Allocator>::template rebind_alloc<Level>;
using VectorLevels = std::vector<Level, AllocLevel>;
using VectorLevels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;

/* Serialized sketch layout:
* Long || Start Byte Adr:
* Adr:
* Long || Start Byte Addr:
* Addr:
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
* 0 || Preamble_Longs | SerVer | FamID | Flags |----- K ---------|---- unused -----|
*
@@ -503,7 +516,7 @@ class quantiles_sketch {
template<typename FwdV>
static void in_place_propagate_carry(uint8_t starting_level, FwdV&& buf_size_k,
Level& buf_size_2k, bool apply_as_update,
quantiles_sketch<T,C,A>& sketch);
quantiles_sketch& sketch);
static void zip_buffer(Level& buf_in, Level& buf_out);
static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out);

@@ -529,7 +542,7 @@ class quantiles_sketch {
* src is modified only if elements can be moved out of it.
*/
template<typename FwdSk>
static void standard_merge(quantiles_sketch<T,C,A>& tgt, FwdSk&& src);
static void standard_merge(quantiles_sketch& tgt, FwdSk&& src);

/**
* Merges the src sketch into the tgt sketch with a smaller value of K.
@@ -538,7 +551,7 @@ class quantiles_sketch {
* src is modified only if elements can be moved out of it.
*/
template<typename FwdSk>
static void downsampling_merge(quantiles_sketch<T,C,A>& tgt, FwdSk&& src);
static void downsampling_merge(quantiles_sketch& tgt, FwdSk&& src);

template<typename FwdV>
static void zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride);

0 comments on commit 92431fa

Please sign in to comment.