/
utils.go
195 lines (170 loc) · 6.43 KB
/
utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package hll
import (
"fmt"
"math"
"github.com/apache/datasketches-go/internal"
)
const (
defaultLgK = 12
lgInitListSize = 3
lgInitSetSize = 5
)
const (
minLogK = 4
maxLogK = 21
empty = 0
keyBits26 = 26
valBits6 = 6
keyMask26 = (1 << keyBits26) - 1
valMask6 = (1 << valBits6) - 1
resizeNumber = 3
resizeDenom = 4
couponRSEFactor = .409 //at transition point not the asymptote
couponRSE = couponRSEFactor / (1 << 13)
hiNibbleMask = 0xf0
loNibbleMask = 0x0f
auxToken = 0xf
)
var (
hllNonHipRSEFactor = math.Sqrt((3.0 * math.Log(2.0)) - 1.0) //1.03896
hllHipRSEFActor = math.Sqrt(math.Log(2.0)) //.8325546
)
type TgtHllType int
type curMode int
const (
curModeList curMode = 0
curModeSet curMode = 1
curModeHll curMode = 2
)
// Specifies the target type of HLL sketch to be created. It is a target in that the actual
// allocation of the HLL array is deferred until sufficient number of items have been received by
// the warm-up phases.
//
// These three target types are isomorphic representations of the same underlying HLL algorithm.
// Thus, given the same value of <i>lgConfigK</i> and the same input, all three HLL target types
// will produce identical estimates and have identical error distributions.
//
// The memory (and also the serialization) of the sketch during this early warmup phase starts
// out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required
// until the full HLL array is allocated. This transition point occurs at about 10% of K for
// sketches where lgConfigK is > 8.
//
// - Hll 8 This uses an 8-bit byte per HLL bucket. It is generally the
// fastest in terms of update time, but has the largest storage footprint of about
// K bytes.
//
// - Hll 6 This uses a 6-bit field per HLL bucket. It is the generally the next fastest
// in terms of update time with a storage footprint of about 3/4 * K bytes.
//
// - Hll 4 This uses a 4-bit field per HLL bucket and for large counts may require
// the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
// For the values of lgConfigK > 13 (K = 8192),
// this additional array adds about 3% to the overall storage. It is generally the slowest in
// terms of update time, but has the smallest storage footprint of about
// K/2 * 1.03 bytes.
const (
TgtHllTypeHll4 = TgtHllType(0)
TgtHllTypeHll6 = TgtHllType(1)
TgtHllTypeHll8 = TgtHllType(2)
TgtHllTypeDefault = TgtHllTypeHll4
)
var (
// lgAuxArrInts is the Log2 table sizes for exceptions based on lgK from 0 to 26.
//However, only lgK from 4 to 21 are used.
lgAuxArrInts = []int{
0, 2, 2, 2, 2, 2, 2, 3, 3, 3, //0 - 9
4, 4, 5, 5, 6, 7, 8, 9, 10, 11, //10 - 19
12, 13, 14, 15, 16, 17, 18, //20 - 26
}
)
// CheckLgK checks the given lgK and returns it if it is valid and return an error otherwise.
func checkLgK(lgK int) (int, error) {
if lgK >= minLogK && lgK <= maxLogK {
return lgK, nil
}
return 0, fmt.Errorf("log K must be between 4 and 21, inclusive: %d", lgK)
}
// pair returns a value where the lower 26 bits are the slotNo and the upper 6 bits are the value.
func pair(slotNo int, value int) int {
return (value << keyBits26) | (slotNo & keyMask26)
}
// pairString returns a string representation of the pair.
func pairString(pair int) string {
return fmt.Sprintf("SlotNo: %d, Value: %d", getPairLow26(pair), getPairValue(pair))
}
// getPairLow26 returns the pair, the lower 26 bits of the pair.
func getPairLow26(pair int) int {
return pair & keyMask26
}
// getPairValue returns the value of the pair.
// The value is the upper 6 bits of the pair.
func getPairValue(pair int) int {
return pair >> keyBits26
}
func checkNumStdDev(numStdDev int) error {
if numStdDev < 1 || numStdDev > 3 {
return fmt.Errorf("NumStdDev may not be less than 1 or greater than 3: %d", numStdDev)
}
return nil
}
// checkPreamble checks the given preamble and returns the curMode if it is valid and return an error otherwise.
func checkPreamble(preamble []byte) (curMode, error) {
if len(preamble) == 0 {
return 0, fmt.Errorf("preamble cannot be nil or empty")
}
preInts := extractPreInts(preamble)
if len(preamble) < (preInts * 4) {
return 0, fmt.Errorf("preamble length mismatch: %d, %d", len(preamble), preInts)
}
serVer := extractSerVer(preamble)
famId := extractFamilyID(preamble)
curMode := extractCurMode(preamble)
if famId != internal.FamilyEnum.HLL.Id {
return 0, fmt.Errorf("possible Corruption: Invalid Family: %d", famId)
}
if serVer != 1 {
return 0, fmt.Errorf("possible Corruption: Invalid Serialization Version: %d", serVer)
}
if preInts != listPreInts && preInts != hashSetPreInts && preInts != hllPreInts {
return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts)
}
if curMode == curModeList && preInts != listPreInts {
return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts)
}
if curMode == curModeSet && preInts != hashSetPreInts {
return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts)
}
if curMode == curModeHll && preInts != hllPreInts {
return 0, fmt.Errorf("possible Corruption: Invalid Preamble Ints: %d", preInts)
}
return curMode, nil
}
func getMaxUpdatableSerializationBytes(lgConfigK int, tgtHllType TgtHllType) int {
var arrBytes int
if tgtHllType == TgtHllTypeHll4 {
auxBytes := 4 << lgAuxArrInts[lgConfigK]
arrBytes = (1 << (lgConfigK - 1)) + auxBytes
} else if tgtHllType == TgtHllTypeHll6 {
numSlots := 1 << lgConfigK
arrBytes = ((numSlots * 3) >> 2) + 1
} else {
arrBytes = 1 << lgConfigK
}
return hllByteArrStart + arrBytes
}