-
Notifications
You must be signed in to change notification settings - Fork 11
/
mph.go
96 lines (86 loc) · 2.37 KB
/
mph.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
// Package mph implements a minimal perfect hash table over strings.
package mph
import "sort"
// A Table is an immutable hash table that provides constant-time lookups of key
// indices using a minimal perfect hash.
type Table struct {
keys []string
level0 []uint32 // power of 2 size
level0Mask int // len(Level0) - 1
level1 []uint32 // power of 2 size >= len(keys)
level1Mask int // len(Level1) - 1
}
// Build builds a Table from keys using the "Hash, displace, and compress"
// algorithm described in http://cmph.sourceforge.net/papers/esa09.pdf.
func Build(keys []string) *Table {
var (
level0 = make([]uint32, nextPow2(len(keys)/4))
level0Mask = len(level0) - 1
level1 = make([]uint32, nextPow2(len(keys)))
level1Mask = len(level1) - 1
sparseBuckets = make([][]int, len(level0))
zeroSeed = murmurSeed(0)
)
for i, s := range keys {
n := int(zeroSeed.hash(s)) & level0Mask
sparseBuckets[n] = append(sparseBuckets[n], i)
}
var buckets []indexBucket
for n, vals := range sparseBuckets {
if len(vals) > 0 {
buckets = append(buckets, indexBucket{n, vals})
}
}
sort.Sort(bySize(buckets))
occ := make([]bool, len(level1))
var tmpOcc []int
for _, bucket := range buckets {
var seed murmurSeed
trySeed:
tmpOcc = tmpOcc[:0]
for _, i := range bucket.vals {
n := int(seed.hash(keys[i])) & level1Mask
if occ[n] {
for _, n := range tmpOcc {
occ[n] = false
}
seed++
goto trySeed
}
occ[n] = true
tmpOcc = append(tmpOcc, n)
level1[n] = uint32(i)
}
level0[int(bucket.n)] = uint32(seed)
}
return &Table{
keys: keys,
level0: level0,
level0Mask: level0Mask,
level1: level1,
level1Mask: level1Mask,
}
}
func nextPow2(n int) int {
for i := 1; ; i *= 2 {
if i >= n {
return i
}
}
}
// Lookup searches for s in t and returns its index and whether it was found.
func (t *Table) Lookup(s string) (n uint32, ok bool) {
i0 := int(murmurSeed(0).hash(s)) & t.level0Mask
seed := t.level0[i0]
i1 := int(murmurSeed(seed).hash(s)) & t.level1Mask
n = t.level1[i1]
return n, s == t.keys[int(n)]
}
type indexBucket struct {
n int
vals []int
}
type bySize []indexBucket
func (s bySize) Len() int { return len(s) }
func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) }
func (s bySize) Swap(i, j int) { s[i], s[j] = s[j], s[i] }