-
Notifications
You must be signed in to change notification settings - Fork 71
/
frequent_items_sketch.hpp
374 lines (328 loc) · 13.8 KB
/
frequent_items_sketch.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef FREQUENT_ITEMS_SKETCH_HPP_
#define FREQUENT_ITEMS_SKETCH_HPP_
#include <memory>
#include <vector>
#include <iostream>
#include <functional>
#include <type_traits>
#include "reverse_purge_hash_map.hpp"
#include "common_defs.hpp"
#include "serde.hpp"
namespace datasketches {
/*
* Based on Java implementation here:
* https://github.com/DataSketches/sketches-core/blob/master/src/main/java/com/yahoo/sketches/frequencies/ItemsSketch.java
* author Alexander Saydakov
*/
enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
// type W for weight must be an arithmetic type (integral or floating point)
template<
typename T,
typename W = uint64_t,
typename H = std::hash<T>,
typename E = std::equal_to<T>,
typename S = serde<T>, // deprecated, to be removed in the next major version
typename A = std::allocator<T>
>
class frequent_items_sketch {
public:
static const uint8_t LG_MIN_MAP_SIZE = 3;
/**
* Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
*
* @param lg_max_map_size Log2 of the physical size of the internal hash map managed by this
* sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size.
* Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size.
*
* @param lg_start_map_size Log2 of the starting physical size of the internal hash
* map managed by this sketch.
*/
explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE, const A& allocator = A());
/**
* Update this sketch with an item and a positive weight (frequency count).
* @param item for which the weight should be increased (lvalue)
* @param weight the amount by which the weight of the item should be increased
* A count of zero is a no-op, and a negative count will throw an exception.
*/
void update(const T& item, W weight = 1);
/**
* Update this sketch with an item and a positive weight (frequency count).
* @param item for which the weight should be increased (rvalue)
* @param weight the amount by which the weight of the item should be increased
* A count of zero is a no-op, and a negative count will throw an exception.
*/
void update(T&& item, W weight = 1);
/**
* This function merges the other sketch into this one.
* The other sketch may be of a different size.
* @param other sketch to be merged into this (lvalue)
*/
void merge(const frequent_items_sketch& other);
/**
* This function merges the other sketch into this one.
* The other sketch may be of a different size.
* @param other sketch to be merged into this (rvalue)
*/
void merge(frequent_items_sketch&& other);
/**
* @return true if this sketch is empty
*/
bool is_empty() const;
/**
* @return the number of active items in the sketch
*/
uint32_t get_num_active_items() const;
/**
* Returns the sum of the weights (frequencies) in the stream seen so far by the sketch
*
* @return the total weight of all items in the stream seen so far by the sketch
*/
W get_total_weight() const;
/**
* Returns the estimate of the weight (frequency) of the given item.
* Note: The true frequency of a item would be the sum of the counts as a result of the
* two update functions.
*
* @param item the given item
* @return the estimate of the weight (frequency) of the given item
*/
W get_estimate(const T& item) const;
/**
* Returns the guaranteed lower bound weight (frequency) of the given item.
*
* @param item the given item.
* @return the guaranteed lower bound weight of the given item. That is, a number which
* is guaranteed to be no larger than the real weight.
*/
W get_lower_bound(const T& item) const;
/**
* Returns the guaranteed upper bound weight (frequency) of the given item.
*
* @param item the given item
* @return the guaranteed upper bound weight of the given item. That is, a number which
* is guaranteed to be no smaller than the real frequency.
*/
W get_upper_bound(const T& item) const;
/**
* @return An upper bound on the maximum error of get_estimate(item) for any item.
* This is equivalent to the maximum distance between the upper bound and the lower bound
* for any item.
*/
W get_maximum_error() const;
/**
* Returns epsilon value of this sketch.
* This is just the value <i>3.5 / max_map_size</i>.
* @return epsilon used by the sketch to compute error.
*/
double get_epsilon() const;
/**
* Returns epsilon used to compute <i>a priori</i> error.
* This is just the value <i>3.5 / maxMapSize</i>.
* @param maxMapSize the planned map size to be used when constructing this sketch.
* @return epsilon used to compute <i>a priori</i> error.
*/
static double get_epsilon(uint8_t lg_max_map_size);
/**
* Returns the estimated <i>a priori</i> error given the max_map_size for the sketch and the
* estimated_total_stream_weight.
* @param lg_max_map_size the planned map size to be used when constructing this sketch.
* @param estimated_total_stream_weight the estimated total stream weight.
* @return the estimated <i>a priori</i> error.
*/
static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
class row;
typedef typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>> vector_row; // alias for users
/**
* Returns an array of rows that include frequent items, estimates, upper and lower bounds
* given an error_type and using get_maximum_error() as a threshold.
*
* <p>The method first examines all active items in the sketch (items that have a counter).
*
* <p>If <i>error_type = NO_FALSE_NEGATIVES</i>, this will include an item in the result
* list if get_upper_bound(item) > threshold.
* There will be no false negatives, i.e., no Type II error.
* There may be items in the set with true frequencies less than the threshold
* (false positives).</p>
*
* <p>If <i>error_type = NO_FALSE_POSITIVES</i>, this will include an item in the result
* list if get_lower_bound(item) > threshold.
* There will be no false positives, i.e., no Type I error.
* There may be items omitted from the set with true frequencies greater than the
* threshold (false negatives).</p>
*
* @param error_type determines whether no false positives or no false negatives are desired.
* @return an array of frequent items
*/
vector_row get_frequent_items(frequent_items_error_type err_type) const;
/**
* Returns an array of rows that include frequent items, estimates, upper and lower bounds
* given an error_type and a threshold.
*
* <p>The method first examines all active items in the sketch (items that have a counter).
*
* <p>If <i>error_type = NO_FALSE_NEGATIVES</i>, this will include an item in the result
* list if get_upper_bound(item) > threshold.
* There will be no false negatives, i.e., no Type II error.
* There may be items in the set with true frequencies less than the threshold
* (false positives).</p>
*
* <p>If <i>error_type = NO_FALSE_POSITIVES</i>, this will include an item in the result
* list if get_lower_bound(item) > threshold.
* There will be no false positives, i.e., no Type I error.
* There may be items omitted from the set with true frequencies greater than the
* threshold (false negatives).</p>
*
* @param error_type determines whether no false positives or no false negatives are desired.
* @param threshold to include items in the result list
* @return an array of frequent items
*/
vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const;
/**
* Computes size needed to serialize the current state of the sketch.
* This can be expensive since every item needs to be looked at.
* @return size in bytes needed to serialize this sketch
*/
size_t get_serialized_size_bytes() const;
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
*
* Deprecated, to be removed in the next major version
*/
void serialize(std::ostream& os) const;
/**
* This method serializes the sketch into a given stream in a binary form
* @param os output stream
* @param instance of a SerDe
*/
template<typename SerDe = S>
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
// This is a convenience alias for users
// The type returned by the following serialize method
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
/**
* This method serializes the sketch as a vector of bytes.
* An optional header can be reserved in front of the sketch.
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
* @return serialized sketch as a vector of bytes
*
* Deprecated, to be removed in the next major version
*/
vector_bytes serialize(unsigned header_size_bytes = 0) const;
/**
* This method serializes the sketch as a vector of bytes.
* An optional header can be reserved in front of the sketch.
* It is a blank space of a given size.
* This header is used in Datasketches PostgreSQL extension.
* @param header_size_bytes space to reserve in front of the sketch
* @param instance of a SerDe
* @return serialized sketch as a vector of bytes
*/
template<typename SerDe = S>
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param instance of an Allocator
* @return an instance of the sketch
*
* Deprecated, to be removed in the next major version
*/
static frequent_items_sketch deserialize(std::istream& is, const A& allocator = A());
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param instance of a SerDe
* @param instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = S>
static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param instance of an Allocator
* @return an instance of the sketch
*
* Deprecated, to be removed in the next major version
*/
static frequent_items_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
/**
* This method deserializes a sketch from a given array of bytes.
* @param bytes pointer to the array of bytes
* @param size the size of the array
* @param instance of a SerDe
* @param instance of an Allocator
* @return an instance of the sketch
*/
template<typename SerDe = S>
static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
/**
* Returns a human readable summary of this sketch
* @param print_items if true include the list of items retained by the sketch
*/
string<A> to_string(bool print_items = false) const;
private:
static const uint8_t SERIAL_VERSION = 1;
static const uint8_t FAMILY_ID = 10;
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
static constexpr double EPSILON_FACTOR = 3.5;
enum flags { IS_EMPTY };
W total_weight;
W offset;
reverse_purge_hash_map<T, W, H, E, A> map;
static void check_preamble_longs(uint8_t preamble_longs, bool is_empty);
static void check_serial_version(uint8_t serial_version);
static void check_family_id(uint8_t family_id);
static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size);
// version for integral signed type
template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type = 0>
static inline void check_weight(WW weight);
// version for integral unsigned type
template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type = 0>
static inline void check_weight(WW weight);
// version for floating point type
template<typename WW = W, typename std::enable_if<std::is_floating_point<WW>::value, int>::type = 0>
static inline void check_weight(WW weight);
// for deserialize
class items_deleter;
};
template<typename T, typename W, typename H, typename E, typename S, typename A>
class frequent_items_sketch<T, W, H, E, S, A>::row {
public:
row(const T* item, W weight, W offset):
item(item), weight(weight), offset(offset) {}
const T& get_item() const { return *item; }
W get_estimate() const { return weight + offset; }
W get_lower_bound() const { return weight; }
W get_upper_bound() const { return weight + offset; }
private:
const T* item;
W weight;
W offset;
};
}
#include "frequent_items_sketch_impl.hpp"
# endif