/
bit_set_run_reader.go
361 lines (319 loc) · 10.2 KB
/
bit_set_run_reader.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bitutils
import (
"encoding/binary"
"math/bits"
"github.com/apache/arrow/go/v15/arrow/bitutil"
"github.com/apache/arrow/go/v15/internal/utils"
)
// IsMultipleOf64 returns whether v is a multiple of 64.
func IsMultipleOf64(v int64) bool { return v&63 == 0 }
// LeastSignificantBitMask returns a bit mask to return the least significant
// bits for a value starting from the bit index passed in. ie: if you want a
// mask for the 4 least significant bits, you call LeastSignificantBitMask(4)
func LeastSignificantBitMask(index int64) uint64 {
return (uint64(1) << index) - 1
}
// SetBitRun describes a run of contiguous set bits in a bitmap with Pos being
// the starting position of the run and Length being the number of bits.
type SetBitRun struct {
Pos int64
Length int64
}
// AtEnd returns true if this bit run is the end of the set by checking
// that the length is 0.
func (s SetBitRun) AtEnd() bool {
return s.Length == 0
}
// Equal returns whether rhs is the same run as s
func (s SetBitRun) Equal(rhs SetBitRun) bool {
return s.Pos == rhs.Pos && s.Length == rhs.Length
}
// SetBitRunReader is an interface for reading groups of contiguous set bits
// from a bitmap. The interface allows us to create different reader implementations
// that share the same interface easily such as a reverse set reader.
type SetBitRunReader interface {
// NextRun will return the next run of contiguous set bits in the bitmap
NextRun() SetBitRun
// Reset allows re-using the reader by providing a new bitmap, offset and length. The arguments
// match the New function for the reader being used.
Reset([]byte, int64, int64)
// VisitSetBitRuns calls visitFn for each set in a loop starting from the current position
// it's roughly equivalent to simply looping, calling NextRun and calling visitFn on the run
// for each run.
VisitSetBitRuns(visitFn VisitFn) error
}
type baseSetBitRunReader struct {
bitmap []byte
pos int64
length int64
remaining int64
curWord uint64
curNumBits int32
reversed bool
firstBit uint64
}
// NewSetBitRunReader returns a SetBitRunReader for the bitmap starting at startOffset which will read
// numvalues bits.
func NewSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
return newBaseSetBitRunReader(validBits, startOffset, numValues, false)
}
// NewReverseSetBitRunReader returns a SetBitRunReader like NewSetBitRunReader, except it will
// return runs starting from the end of the bitmap until it reaches startOffset rather than starting
// at startOffset and reading from there. The SetBitRuns will still operate the same, so Pos
// will still be the position of the "left-most" bit of the run or the "start" of the run. It
// just returns runs starting from the end instead of starting from the beginning.
func NewReverseSetBitRunReader(validBits []byte, startOffset, numValues int64) SetBitRunReader {
return newBaseSetBitRunReader(validBits, startOffset, numValues, true)
}
func newBaseSetBitRunReader(bitmap []byte, startOffset, length int64, reverse bool) *baseSetBitRunReader {
ret := &baseSetBitRunReader{reversed: reverse}
ret.Reset(bitmap, startOffset, length)
return ret
}
func (br *baseSetBitRunReader) Reset(bitmap []byte, startOffset, length int64) {
br.bitmap = bitmap
br.length = length
br.remaining = length
br.curNumBits = 0
br.curWord = 0
if !br.reversed {
br.pos = startOffset / 8
br.firstBit = 1
bitOffset := int8(startOffset % 8)
if length > 0 && bitOffset != 0 {
br.curNumBits = int32(utils.Min(int(length), int(8-bitOffset)))
br.curWord = br.loadPartial(bitOffset, int64(br.curNumBits))
}
return
}
br.pos = (startOffset + length) / 8
br.firstBit = uint64(0x8000000000000000)
endBitOffset := int8((startOffset + length) % 8)
if length > 0 && endBitOffset != 0 {
br.pos++
br.curNumBits = int32(utils.Min(int(length), int(endBitOffset)))
br.curWord = br.loadPartial(8-endBitOffset, int64(br.curNumBits))
}
}
func (br *baseSetBitRunReader) consumeBits(word uint64, nbits int32) uint64 {
if br.reversed {
return word << nbits
}
return word >> nbits
}
func (br *baseSetBitRunReader) countFirstZeros(word uint64) int32 {
if br.reversed {
return int32(bits.LeadingZeros64(word))
}
return int32(bits.TrailingZeros64(word))
}
func (br *baseSetBitRunReader) loadPartial(bitOffset int8, numBits int64) uint64 {
var word [8]byte
nbytes := bitutil.BytesForBits(numBits)
if br.reversed {
br.pos -= nbytes
copy(word[8-nbytes:], br.bitmap[br.pos:br.pos+nbytes])
return (binary.LittleEndian.Uint64(word[:]) << bitOffset) &^ LeastSignificantBitMask(64-numBits)
}
copy(word[:], br.bitmap[br.pos:br.pos+nbytes])
br.pos += nbytes
return (binary.LittleEndian.Uint64(word[:]) >> bitOffset) & LeastSignificantBitMask(numBits)
}
func (br *baseSetBitRunReader) findCurrentRun() SetBitRun {
nzeros := br.countFirstZeros(br.curWord)
if nzeros >= br.curNumBits {
br.remaining -= int64(br.curNumBits)
br.curWord = 0
br.curNumBits = 0
return SetBitRun{0, 0}
}
br.curWord = br.consumeBits(br.curWord, nzeros)
br.curNumBits -= nzeros
br.remaining -= int64(nzeros)
pos := br.position()
numOnes := br.countFirstZeros(^br.curWord)
br.curWord = br.consumeBits(br.curWord, numOnes)
br.curNumBits -= numOnes
br.remaining -= int64(numOnes)
return SetBitRun{pos, int64(numOnes)}
}
func (br *baseSetBitRunReader) position() int64 {
if br.reversed {
return br.remaining
}
return br.length - br.remaining
}
func (br *baseSetBitRunReader) adjustRun(run SetBitRun) SetBitRun {
if br.reversed {
run.Pos -= run.Length
}
return run
}
func (br *baseSetBitRunReader) loadFull() (ret uint64) {
if br.reversed {
br.pos -= 8
}
ret = binary.LittleEndian.Uint64(br.bitmap[br.pos : br.pos+8])
if !br.reversed {
br.pos += 8
}
return
}
func (br *baseSetBitRunReader) skipNextZeros() {
for br.remaining >= 64 {
br.curWord = br.loadFull()
nzeros := br.countFirstZeros(br.curWord)
if nzeros < 64 {
br.curWord = br.consumeBits(br.curWord, nzeros)
br.curNumBits = 64 - nzeros
br.remaining -= int64(nzeros)
return
}
br.remaining -= 64
}
// run of zeros continues in last bitmap word
if br.remaining > 0 {
br.curWord = br.loadPartial(0, br.remaining)
br.curNumBits = int32(br.remaining)
nzeros := int32(utils.Min(int(br.curNumBits), int(br.countFirstZeros(br.curWord))))
br.curWord = br.consumeBits(br.curWord, nzeros)
br.curNumBits -= nzeros
br.remaining -= int64(nzeros)
}
}
func (br *baseSetBitRunReader) countNextOnes() int64 {
var length int64
if ^br.curWord != 0 {
numOnes := br.countFirstZeros(^br.curWord)
br.remaining -= int64(numOnes)
br.curWord = br.consumeBits(br.curWord, numOnes)
br.curNumBits -= numOnes
if br.curNumBits != 0 {
return int64(numOnes)
}
length = int64(numOnes)
} else {
br.remaining -= 64
br.curNumBits = 0
length = 64
}
for br.remaining >= 64 {
br.curWord = br.loadFull()
numOnes := br.countFirstZeros(^br.curWord)
length += int64(numOnes)
br.remaining -= int64(numOnes)
if numOnes < 64 {
br.curWord = br.consumeBits(br.curWord, numOnes)
br.curNumBits = 64 - numOnes
return length
}
}
if br.remaining > 0 {
br.curWord = br.loadPartial(0, br.remaining)
br.curNumBits = int32(br.remaining)
numOnes := br.countFirstZeros(^br.curWord)
br.curWord = br.consumeBits(br.curWord, numOnes)
br.curNumBits -= numOnes
br.remaining -= int64(numOnes)
length += int64(numOnes)
}
return length
}
func (br *baseSetBitRunReader) NextRun() SetBitRun {
var (
pos int64 = 0
length int64 = 0
)
if br.curNumBits != 0 {
run := br.findCurrentRun()
if run.Length != 0 && br.curNumBits != 0 {
return br.adjustRun(run)
}
pos = run.Pos
length = run.Length
}
if length == 0 {
// we didn't get any ones in curWord, so we can skip any zeros
// in the following words
br.skipNextZeros()
if br.remaining == 0 {
return SetBitRun{0, 0}
}
pos = br.position()
} else if br.curNumBits == 0 {
if br.remaining >= 64 {
br.curWord = br.loadFull()
br.curNumBits = 64
} else if br.remaining > 0 {
br.curWord = br.loadPartial(0, br.remaining)
br.curNumBits = int32(br.remaining)
} else {
return br.adjustRun(SetBitRun{pos, length})
}
if (br.curWord & br.firstBit) == 0 {
return br.adjustRun(SetBitRun{pos, length})
}
}
length += br.countNextOnes()
return br.adjustRun(SetBitRun{pos, length})
}
// VisitFn is a callback function for visiting runs of contiguous bits
type VisitFn func(pos int64, length int64) error
func (br *baseSetBitRunReader) VisitSetBitRuns(visitFn VisitFn) error {
for {
run := br.NextRun()
if run.Length == 0 {
break
}
if err := visitFn(run.Pos, run.Length); err != nil {
return err
}
}
return nil
}
// VisitSetBitRuns is just a convenience function for calling NewSetBitRunReader and then VisitSetBitRuns
func VisitSetBitRuns(bitmap []byte, bitmapOffset int64, length int64, visitFn VisitFn) error {
if bitmap == nil {
return visitFn(0, length)
}
rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
for {
run := rdr.NextRun()
if run.Length == 0 {
break
}
if err := visitFn(run.Pos, run.Length); err != nil {
return err
}
}
return nil
}
func VisitSetBitRunsNoErr(bitmap []byte, bitmapOffset int64, length int64, visitFn func(pos, length int64)) {
if bitmap == nil {
visitFn(0, length)
return
}
rdr := NewSetBitRunReader(bitmap, bitmapOffset, length)
for {
run := rdr.NextRun()
if run.Length == 0 {
break
}
visitFn(run.Pos, run.Length)
}
}