forked from HouzuoGuo/tiedot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hashtable.go
238 lines (224 loc) · 7.21 KB
/
hashtable.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
/* A static hash table made of uint64 key-value pairs. */
package chunkfile
import (
"encoding/binary"
"github.com/HouzuoGuo/tiedot/commonfile"
"github.com/HouzuoGuo/tiedot/tdlog"
"math"
"sync"
)
// File layout and hashing parameters. All sizes are in bytes.
// Key and value fields are written with binary.PutUvarint; 10 bytes is the
// maximum varint length of a uint64, so each field reserves 10 bytes.
const (
	HT_FILE_SIZE       = uint64(1024 * 1024 * 8) // Size of a hash table, it may grow by this size
	ENTRY_VALID        = byte(1)                 // Entry valid flag
	ENTRY_INVALID      = byte(0)                 // Entry invalid flag
	ENTRY_SIZE         = uint64(1 + 10 + 10)     // Size of an entry - validity flag (1 byte), hash key (varint uint64, 10 bytes) and value (varint uint64, 10 bytes)
	BUCKET_HEADER_SIZE = uint64(10)              // Size of bucket header - number of the next bucket in chain (varint uint64)
	// Hash table configuration
	PER_BUCKET  = uint64(15)                                        // Number of entries per bucket
	BUCKET_SIZE = uint64(PER_BUCKET*ENTRY_SIZE + BUCKET_HEADER_SIZE) // Total bucket size: header followed by entries
	HASH_BITS   = uint64(14)                                        // Number of key bits used to select a head bucket
	// INITIAL_BUCKETS = 2 to the power of HASH_BITS (one head bucket per possible hash key)
	INITIAL_BUCKETS = uint64(16384)
)
// HashTable is a static hash table of uint64 key-value pairs stored in a
// bucket-chained file. Head buckets 0..INITIAL_BUCKETS-1 are addressed by
// hashed key; overflow buckets are chained after them.
type HashTable struct {
	Path       []string          // Index path components - presumably the indexed document attribute path; confirm against callers
	File       *commonfile.File  // Backing file whose Buf holds all bucket data
	Mutex      *sync.RWMutex     // Lock for callers to coordinate concurrent access (not taken by methods here)
	NumBuckets uint64            // Total number of buckets
}
// OpenHash opens the hash table file under the given name and recalculates
// its size information, returning a ready-to-use table.
func OpenHash(name string, path []string) (ht *HashTable, err error) {
	var f *commonfile.File
	if f, err = commonfile.Open(name, HT_FILE_SIZE); err != nil {
		return
	}
	ht = &HashTable{File: f, Path: path, Mutex: new(sync.RWMutex)}
	ht.calculateSizeInfo()
	return ht, nil
}
// Calculate used size, total size, total number of buckets.
// The file stores no bucket count, so it is reconstructed by walking every
// head bucket's chain and taking the highest reachable bucket number.
func (ht *HashTable) calculateSizeInfo() {
	// Find out how many buckets there are in table - hence the amount of used space
	// .. assume the entire file is Full of buckets, so that lastBucket/NextBucket
	// accept any bucket number inside the file during the scan below
	ht.File.UsedSize = ht.File.Size
	ht.NumBuckets = ht.File.Size / BUCKET_SIZE
	// .. starting from every head bucket, find the longest chain
	longestBucketChain := INITIAL_BUCKETS
	for i := uint64(0); i < INITIAL_BUCKETS; i++ {
		lastBucket := ht.lastBucket(i)
		// Only accept chain ends that lie inside the file
		if lastBucket+1 > longestBucketChain && lastBucket+1 <= ht.NumBuckets {
			longestBucketChain = lastBucket + 1
		}
	}
	// .. the longest chain tells amount of used space
	ht.NumBuckets = longestBucketChain
	usedSize := ht.NumBuckets * BUCKET_SIZE
	// Grow the file, if it is not yet large enough for all the buckets used
	if usedSize > ht.File.Size {
		ht.File.UsedSize = ht.File.Size
		// Round the growth up to a whole number of buckets
		ht.File.CheckSizeAndEnsure(((usedSize-ht.File.Size)/BUCKET_SIZE + 1) * BUCKET_SIZE)
	}
	ht.File.UsedSize = usedSize
	tdlog.Printf("%s has %d buckets, and %d bytes out of %d bytes in-use", ht.File.Name, ht.NumBuckets, ht.File.UsedSize, ht.File.Size)
}
// NextBucket returns the number (not address) of the next chained bucket,
// or 0 if the chain ends here. A reference that loops backwards or points
// outside the valid overflow range is logged and treated as end-of-chain,
// so corruption cannot cause an infinite traversal.
func (ht *HashTable) NextBucket(bucket uint64) uint64 {
	if bucket >= ht.NumBuckets {
		return 0
	}
	bucketAddr := bucket * BUCKET_SIZE
	next, _ := binary.Uvarint(ht.File.Buf[bucketAddr : bucketAddr+BUCKET_HEADER_SIZE])
	switch {
	case next == 0:
		// Zeroed header - no next bucket.
		return 0
	case next <= bucket:
		// A chain must only move forward; going backwards would loop forever.
		tdlog.Errorf("ERROR: Bucket loop in hash table %s at bucket no.%d, address %d", ht.File.Name, bucket, bucketAddr)
		return 0
	case next >= ht.NumBuckets || next < INITIAL_BUCKETS:
		// Overflow buckets live in the range [INITIAL_BUCKETS, NumBuckets).
		// (Fixes typo "refernece" in the original error message.)
		tdlog.Errorf("ERROR: Bad bucket reference (%d is out of range %d - %d) in %s", next, INITIAL_BUCKETS, ht.NumBuckets, ht.File.Name)
		return 0
	default:
		return next
	}
}
// lastBucket follows the chain beginning at the given bucket and returns the
// number (not address) of its final bucket.
func (ht *HashTable) lastBucket(bucket uint64) uint64 {
	for {
		next := ht.NextBucket(bucket)
		if next == 0 {
			// End of chain reached.
			return bucket
		}
		bucket = next
	}
}
// grow appends one more overflow bucket to the chain that the given bucket
// belongs to, updating file usage and the bucket count.
func (ht *HashTable) grow(bucket uint64) {
	// Make sure the file has room for one more bucket.
	ht.File.CheckSizeAndEnsure(BUCKET_SIZE)
	// Link the tail of the chain to the bucket about to come into existence
	// (its number is the current total bucket count).
	tailAddr := ht.lastBucket(bucket) * BUCKET_SIZE
	binary.PutUvarint(ht.File.Buf[tailAddr:tailAddr+10], ht.NumBuckets)
	ht.File.UsedSize += BUCKET_SIZE
	ht.NumBuckets++
}
// HashKey masks away all but the low HASH_BITS bits of key, yielding the
// head bucket number the key hashes to.
func (ht *HashTable) HashKey(key uint64) uint64 {
	const mask = (uint64(1) << HASH_BITS) - 1
	return key & mask
}
// Clear all index entries, return to the initial size as well.
func (ht *HashTable) Clear() {
	// Reset the backing file (see commonfile.File.Clear for exact semantics).
	ht.File.Clear()
	// Recalculate size information against the now-cleared file.
	ht.calculateSizeInfo()
}
// Put stores a new key-value pair, walking the key's bucket chain for a free
// slot and growing the chain when every slot is taken.
func (ht *HashTable) Put(key, val uint64) {
	slot := uint64(0)
	curBucket := ht.HashKey(key)
	for {
		slotAddr := curBucket*BUCKET_SIZE + BUCKET_HEADER_SIZE + slot*ENTRY_SIZE
		if ht.File.Buf[slotAddr] != ENTRY_VALID {
			// Free (or previously invalidated) slot - claim it.
			ht.File.Buf[slotAddr] = ENTRY_VALID
			binary.PutUvarint(ht.File.Buf[slotAddr+1:slotAddr+11], key)
			binary.PutUvarint(ht.File.Buf[slotAddr+11:slotAddr+21], val)
			return
		}
		slot++
		if slot == PER_BUCKET {
			slot = 0
			curBucket = ht.NextBucket(curBucket)
			if curBucket == 0 {
				// Chain exhausted - append a bucket and retry from the head.
				ht.grow(ht.HashKey(key))
				ht.Put(key, val)
				return
			}
		}
	}
}
// Get collects up to limit values stored under key; a limit of 0 collects
// every match in the key's bucket chain.
func (ht *HashTable) Get(key, limit uint64) (keys, vals []uint64) {
	// This function is partially inlined in chunkcol.go
	prealloc := limit
	if prealloc == 0 {
		prealloc = 10
	}
	keys = make([]uint64, 0, prealloc)
	vals = make([]uint64, 0, prealloc)
	var found, slot uint64
	curBucket := ht.HashKey(key)
	for {
		slotAddr := curBucket*BUCKET_SIZE + BUCKET_HEADER_SIZE + slot*ENTRY_SIZE
		slotKey, _ := binary.Uvarint(ht.File.Buf[slotAddr+1 : slotAddr+11])
		slotVal, _ := binary.Uvarint(ht.File.Buf[slotAddr+11 : slotAddr+21])
		if ht.File.Buf[slotAddr] == ENTRY_VALID {
			if slotKey == key {
				keys = append(keys, slotKey)
				vals = append(vals, slotVal)
				if found++; found == limit {
					return
				}
			}
		} else if slotKey == 0 && slotVal == 0 {
			// Invalid slot with zeroed key and value was never written -
			// everything beyond it is empty.
			return
		}
		if slot++; slot == PER_BUCKET {
			slot = 0
			if curBucket = ht.NextBucket(curBucket); curBucket == 0 {
				return
			}
		}
	}
}
// Remove invalidates the first entry matching the exact key-value pair,
// leaving the slot available for reuse by Put.
func (ht *HashTable) Remove(key, val uint64) {
	var slot uint64
	curBucket := ht.HashKey(key)
	for {
		slotAddr := curBucket*BUCKET_SIZE + BUCKET_HEADER_SIZE + slot*ENTRY_SIZE
		slotKey, _ := binary.Uvarint(ht.File.Buf[slotAddr+1 : slotAddr+11])
		slotVal, _ := binary.Uvarint(ht.File.Buf[slotAddr+11 : slotAddr+21])
		if ht.File.Buf[slotAddr] == ENTRY_VALID {
			if slotKey == key && slotVal == val {
				// Flip only the validity flag; key and value bytes remain.
				ht.File.Buf[slotAddr] = ENTRY_INVALID
				return
			}
		} else if slotKey == 0 && slotVal == 0 {
			// Never-written slot - the pair does not exist beyond here.
			return
		}
		if slot++; slot == PER_BUCKET {
			slot = 0
			if curBucket = ht.NextBucket(curBucket); curBucket == 0 {
				return
			}
		}
	}
}
// GetAll returns up to limit entries from the entire table; a limit of 0
// returns every valid entry.
func (ht *HashTable) GetAll(limit uint64) (keys, vals []uint64) {
	initialCap := limit
	if initialCap == 0 {
		// No limit given - guess half-full head buckets as starting capacity.
		initialCap = INITIAL_BUCKETS * PER_BUCKET / 2
	}
	keys = make([]uint64, 0, initialCap)
	vals = make([]uint64, 0, initialCap)
	collected := uint64(0)
	for head := uint64(0); head < uint64(math.Pow(2, float64(HASH_BITS))); head++ {
		slot := uint64(0)
		curBucket := head
		for {
			slotAddr := curBucket*BUCKET_SIZE + BUCKET_HEADER_SIZE + slot*ENTRY_SIZE
			slotKey, _ := binary.Uvarint(ht.File.Buf[slotAddr+1 : slotAddr+11])
			slotVal, _ := binary.Uvarint(ht.File.Buf[slotAddr+11 : slotAddr+21])
			if ht.File.Buf[slotAddr] == ENTRY_VALID {
				collected++
				keys = append(keys, slotKey)
				vals = append(vals, slotVal)
				if collected == limit {
					return
				}
			} else if slotKey == 0 && slotVal == 0 {
				// Never-written slot: rest of this chain is empty, move on.
				break
			}
			if slot++; slot == PER_BUCKET {
				slot = 0
				if curBucket = ht.NextBucket(curBucket); curBucket == 0 {
					// NOTE(review): this ends the whole scan, not just this
					// chain - it appears to rely on a chain's last bucket
					// never being completely full (Put grows the chain before
					// overfilling); confirm that invariant holds.
					return
				}
			}
		}
	}
	return
}