/
FdtSketch.java
168 lines (151 loc) · 6.27 KB
/
FdtSketch.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.fdt;
import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS;
import java.util.List;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.tuple.strings.ArrayOfStringsSketch;
/**
* A Frequent Distinct Tuples sketch.
*
* <p>Suppose our data is a stream of pairs {IP address, User ID} and we want to identify the
* IP addresses that have the most distinct User IDs. Or conversely, we would like to identify
* the User IDs that have the most distinct IP addresses. This is a common challenge in the
* analysis of big data and the FDT sketch helps solve this problem using probabilistic techniques.
*
* <p>More generally, given a multiset of tuples with dimensions <i>{d1,d2, d3, ..., dN}</i>,
* and a primary subset of dimensions <i>M < N</i>, our task is to identify the combinations of
* <i>M</i> subset dimensions that have the most frequent number of distinct combinations of
* the <i>N-M</i> non-primary dimensions.
*
* <p>Please refer to the web page
* <a href="https://datasketches.apache.org/docs/Frequency/FrequentDistinctTuplesSketch.html">
* https://datasketches.apache.org/docs/Frequency/FrequentDistinctTuplesSketch.html</a> for a more
* complete discussion about this sketch.
*
* @author Lee Rhodes
*/
public class FdtSketch extends ArrayOfStringsSketch {
/**
* Create new instance of Frequent Distinct Tuples sketch with the given
* Log-base2 of required nominal entries.
* @param lgK Log-base2 of required nominal entries.
*/
public FdtSketch(final int lgK) {
super(lgK);
}
/**
* Used by deserialization.
* @param mem the image of a FdtSketch
* @deprecated As of 3.0.0, heapifying an UpdatableSketch is deprecated.
* This capability will be removed in a future release.
* Heapifying a CompactSketch is not deprecated.
*/
@Deprecated
FdtSketch(final Memory mem) {
super(mem);
}
/**
* Create a new instance of Frequent Distinct Tuples sketch with a size determined by the given
* threshold and rse.
* @param threshold : the fraction, between zero and 1.0, of the total distinct stream length
* that defines a "Frequent" (or heavy) item.
* @param rse the maximum Relative Standard Error for the estimate of the distinct population of a
* reported tuple (selected with a primary key) at the threshold.
*/
public FdtSketch(final double threshold, final double rse) {
super(computeLgK(threshold, rse));
}
/**
* Copy Constructor
* @param sketch the sketch to copy
*/
public FdtSketch(final FdtSketch sketch) {
super(sketch);
}
/**
* @return a deep copy of this sketch
*/
@SuppressWarnings("unchecked")
@Override
public FdtSketch copy() {
return new FdtSketch(this);
}
/**
* Update the sketch with the given string array tuple.
* @param tuple the given string array tuple.
*/
public void update(final String[] tuple) {
super.update(tuple, tuple);
}
/**
* Returns an ordered List of Groups of the most frequent distinct population of subset tuples
* represented by the count of entries of each group.
* @param priKeyIndices these indices define the dimensions used for the Primary Keys.
* @param limit the maximum number of groups to return. If this value is ≤ 0, all
* groups will be returned.
* @param numStdDev the number of standard deviations for the upper and lower error bounds,
* this value is an integer and must be one of 1, 2, or 3.
* <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of Standard Deviations</a>
* @param sep the separator character
* @return an ordered List of Groups of the most frequent distinct population of subset tuples
* represented by the count of entries of each group.
*/
public List<Group> getResult(final int[] priKeyIndices, final int limit, final int numStdDev,
final char sep) {
final PostProcessor proc = new PostProcessor(this, new Group(), sep);
return proc.getGroupList(priKeyIndices, numStdDev, limit);
}
/**
* Returns the PostProcessor that enables multiple queries against the sketch results.
* This assumes the default Group and the default separator character '|'.
* @return the PostProcessor
*/
public PostProcessor getPostProcessor() {
return getPostProcessor(new Group(), '|');
}
/**
* Returns the PostProcessor that enables multiple queries against the sketch results.
* @param group the Group class to use during post processing.
* @param sep the separator character.
* @return the PostProcessor
*/
public PostProcessor getPostProcessor(final Group group, final char sep) {
return new PostProcessor(this, group, sep);
}
// Restricted
/**
* Computes LgK given the threshold and RSE.
* @param threshold the fraction, between zero and 1.0, of the total stream length that defines
* a "Frequent" (or heavy) tuple.
* @param rse the maximum Relative Standard Error for the estimate of the distinct population of a
* reported tuple (selected with a primary key) at the threshold.
* @return LgK
*/
static int computeLgK(final double threshold, final double rse) {
final double v = Math.ceil(1.0 / (threshold * rse * rse));
final int lgK = (int) Math.ceil(Math.log(v) / Math.log(2));
if (lgK > MAX_LG_NOM_LONGS) {
throw new SketchesArgumentException("Requested Sketch (LgK = " + lgK + " > 2^26), "
+ "either increase the threshold, the rse or both.");
}
return lgK;
}
}