Skip to content

Commit

Permalink
Clean up new bloom filter implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
dbaggerman committed Mar 22, 2021
1 parent 95d766e commit b24b909
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 12 deletions.
29 changes: 21 additions & 8 deletions processor/bloom.go
@@ -1,7 +1,6 @@
package processor

// Prime number less than 256
const BloomPrime = 251
import "math/rand"

var BloomTable [256]uint64

Expand All @@ -12,13 +11,27 @@ func init() {
}

func BloomHash(b byte) uint64 {
i := uint64(b)
// Since our input is based on ASCII characters (mostly lower-case ones),
// the values are not well distributed through the 0-255 byte range.
// math/rand gives us a way to derive a better-distributed value from
// each byte.
k := rand.New(rand.NewSource(int64(b))).Uint64()

k := (i^BloomPrime) * i
// Mask to slice out a 0-63 value
var mask64 uint64 = 0b00111111

k1 := k & 0x3f
k2 := k >> 1 & 0x3f
k3 := k >> 2 & 0x3f
// For a bloom filter we only want a few bits set, but distributed
// through the 64 bit space.
// The logic here is to slice a value between 0 and 63 from k, and set a
// single bit in the output hash based on that.
// Setting three bits this way seems to give the best results. Fewer bits
// make the hash not unique enough; more bits lead to overcrowding the
// bloom filter.
var hash uint64
for i := uint64(0); i < 3; i++ {
n := k >> (i*8) & mask64
hash |= 1 << n
}

return (1 << k1) | (1 << k2) | (1 << k3)
return hash
}
8 changes: 4 additions & 4 deletions processor/processor.go
Expand Up @@ -357,7 +357,7 @@ func processLanguageFeature(name string, value Language) {
var processMask uint64

for _, v := range value.ComplexityChecks {
complexityMask |= BloomHash(v[0])
complexityMask |= BloomTable[v[0]]
complexityTrie.Insert(TComplexity, []byte(v))
if !Complexity {
tokenTrie.Insert(TComplexity, []byte(v))
Expand All @@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
}

for _, v := range value.LineComment {
singleLineCommentMask |= BloomHash(v[0])
singleLineCommentMask |= BloomTable[v[0]]
slCommentTrie.Insert(TSlcomment, []byte(v))
tokenTrie.Insert(TSlcomment, []byte(v))
}
processMask |= singleLineCommentMask

for _, v := range value.MultiLine {
multiLineCommentMask |= BloomHash(v[0][0])
multiLineCommentMask |= BloomTable[v[0][0]]
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
}
processMask |= multiLineCommentMask

for _, v := range value.Quotes {
stringMask |= BloomHash(v.Start[0])
stringMask |= BloomTable[v.Start[0]]
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
}
Expand Down

0 comments on commit b24b909

Please sign in to comment.