-
Notifications
You must be signed in to change notification settings - Fork 5
/
bloom.nim
260 lines (221 loc) · 9.47 KB
/
bloom.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
from math import ceil, ln, pow, round
import hashes
import strutils
import private/probabilities
# Import MurmurHash3 code and compile at the same time as Nim code
{.compile: "murmur3.c".}
type
BloomFilterError = object of CatchableError
MurmurHashes = array[0..1, int]
BloomFilter = object
capacity*: int
error_rate*: float
k_hashes*: int
m_bits*: int
int_array: seq[int]
n_bits_per_elem*: int
use_murmur_hash*: bool
proc raw_murmur_hash(key: cstring, len: int, seed: uint32,
out_hashes: var MurmurHashes): void {.
importc: "MurmurHash3_x64_128".}
proc murmur_hash(key: string, seed: uint32 = 0'u32): MurmurHashes =
result = [0, 0]
raw_murmur_hash(key = key, len = key.len, seed = seed, out_hashes = result)
proc hash_a(item: string, max_value: int): int =
result = hash(item) mod max_value
proc hash_b(item: string, max_value: int): int =
result = hash(item & " b") mod max_value
proc hash_n(item: string, n: int, max_value: int): int =
## Get the nth hash of a string using the formula hash_a + n * hash_b
## which uses 2 hash functions vs. k and has comparable properties
## See Kirsch and Mitzenmacher, 2008:
## http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/rsa.pdf
result = abs((hash_a(item, max_value) + n * hash_b(item,
max_value))) mod max_value
proc get_m_over_n_bits_for_k(k: int, target_error: float,
probability_table: TAllErrorRates = k_errors): int =
## Returns the optimal number of m/n bits for a given k.
if k > 12:
raise newException(BloomFilterError,
"K must be <= 12 if force_n_bits_per_elem is not also specified.")
var
searching_for_m_over_n = true
m_over_n = 2
while searching_for_m_over_n:
try:
if probability_table[k][m_over_n] < target_error:
searching_for_m_over_n = false
result = m_over_n
return result
else:
m_over_n += 1
except IndexError:
raise newException(BloomFilterError,
"Specified value of k and error rate for which is not achievable using less than 4 bytes / element.")
proc initialize_bloom_filter*(capacity: int, error_rate: float, k: int = 0,
force_n_bits_per_elem: int = 0,
use_murmur_hash: bool = true): BloomFilter =
## Initializes a Bloom filter, using a specified ``capacity``,
## ``error_rate``, and – optionally – specific number of k hash functions.
## If ``k_hashes`` is < 1 (default argument is 0), ``k_hashes`` will be
## optimally calculated on the fly. Otherwise, ``k_hashes`` will be set to
## the passed integer, which requires that ``force_n_bits_per_elem`` is
## also set to be greater than 0. Otherwise a ``BloomFilterError``
## exception is raised.
## See http://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for
## useful tables on k and m/n (n bits per element) combinations.
##
## The Bloom filter uses the MurmurHash3 implementation by default,
## though it can fall back to using the built-in nim ``hash`` function
## if ``use_murmur_hash = false``. This is compiled alongside the Nim
## code using the ``{.compile.}`` pragma.
var
k_hashes: int
bits_per_elem: float
n_bits_per_elem: int
if k < 1: # Calculate optimal k and use that
bits_per_elem = ceil(-1.0 * (ln(error_rate) / (pow(ln(2.float), 2))))
k_hashes = round(ln(2.float) * bits_per_elem).int
n_bits_per_elem = round(bits_per_elem).int
else: # Use specified k if possible
if force_n_bits_per_elem < 1: # Use lookup table
n_bits_per_elem = get_m_over_n_bits_for_k(k = k,
target_error = error_rate)
else:
n_bits_per_elem = force_n_bits_per_elem
k_hashes = k
let m_bits = capacity * n_bits_per_elem
let m_ints = 1 + m_bits div (sizeof(int) * 8)
result = BloomFilter(capacity: capacity, error_rate: error_rate,
k_hashes: k_hashes, m_bits: m_bits,
int_array: newSeq[int](m_ints),
n_bits_per_elem: n_bits_per_elem,
use_murmur_hash: use_murmur_hash)
proc `$`*(bf: BloomFilter): string =
## Prints the capacity, set error rate, number of k hash functions,
## and total bits of memory allocated by the Bloom filter.
result = ("Bloom filter with $1 capacity, $2 error rate, $3 hash functions, and requiring $4 bits per stored element." %
[$bf.capacity, formatFloat(bf.error_rate, format = ffScientific,
precision = 1), $bf.k_hashes, $bf.n_bits_per_elem])
{.push overflowChecks: off.}
proc hash_murmur(bf: BloomFilter, item: string): seq[int] =
result = newSeq[int](bf.k_hashes)
let murmur_hashes = murmur_hash(key = item, seed = 0'u32)
for i in 0..(bf.k_hashes - 1):
result[i] = abs(murmur_hashes[0] + i * murmur_hashes[1]) mod bf.m_bits
return result
{.pop.}
proc hash_nimrod(bf: BloomFilter, item: string): seq[int] =
newSeq(result, bf.k_hashes)
for i in 0..(bf.k_hashes - 1):
result[i] = hash_n(item, i, bf.m_bits)
return result
proc hash(bf: BloomFilter, item: string): seq[int] =
if bf.use_murmur_hash:
result = bf.hash_murmur(item = item)
else:
result = bf.hash_nimrod(item = item)
return result
proc insert*(bf: var BloomFilter, item: string) =
## Insert an item (string) into the Bloom filter. Can be called with
## method style syntax like ``bf.insert("test item")``.
var hash_set = bf.hash(item)
for h in hash_set:
let int_address = h div (sizeof(int) * 8)
let bit_offset = h mod (sizeof(int) * 8)
bf.int_array[int_address] = bf.int_array[int_address] or (1 shl bit_offset)
proc lookup*(bf: BloomFilter, item: string): bool =
## Lookup an item (string) into the Bloom filter. Can be called with
## method style syntax like ``bf.lookup("test item")``.
## If the item is present, ``lookup`` is guaranteed to return ``true``.
## If the item is not present, ``lookup`` will return ``false``
## with a probability 1 - ``bf.error_rate``.
var hash_set = bf.hash(item)
for h in hash_set:
let int_address = h div (sizeof(int) * 8)
let bit_offset = h mod (sizeof(int) * 8)
let current_int = bf.int_array[int_address]
if (current_int) != (current_int or (1 shl bit_offset)):
return false
return true
when isMainModule:
from random import rand, randomize
import times
# Test murmurhash 3
echo("Testing MurmurHash3 code...")
var hash_outputs: MurmurHashes
hash_outputs = [0, 0]
raw_murmur_hash("hello", 5, 0, hash_outputs)
assert int(hash_outputs[0]) == -3758069500696749310 # Correct murmur outputs (cast to int64)
assert int(hash_outputs[1]) == 6565844092913065241
let hash_outputs2 = murmur_hash("hello", 0)
assert hash_outputs2[0] == hash_outputs[0]
assert hash_outputs2[1] == hash_outputs[1]
let hash_outputs3 = murmur_hash("hello", 10)
assert hash_outputs3[0] != hash_outputs[0]
assert hash_outputs3[1] != hash_outputs[1]
# Some quick and dirty tests (not complete)
var n_elements_to_test = 100000
var bf = initialize_bloom_filter(n_elements_to_test, 0.001)
assert(bf of BloomFilter)
echo(bf)
var bf2 = initialize_bloom_filter(10000, 0.001, k = 4,
force_n_bits_per_elem = 20)
assert(bf2 of BloomFilter)
echo(bf2)
echo("Testing insertions and lookups...")
echo("Test element in BF2?: ", bf2.lookup("testing"))
echo("Inserting element.")
bf2.insert("testing")
echo("Test element in BF2?: ", bf2.lookup("testing"))
assert(bf2.lookup("testing"))
# Now test for speed with bf
randomize(2882) # Seed the RNG
var
sample_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
k_test_elements, sample_letters: seq[string]
k_test_elements = newSeq[string](n_elements_to_test)
sample_letters = newSeq[string](62)
for i in 0..(n_elements_to_test - 1):
var new_string = ""
for j in 0..7:
new_string.add(sample_chars[rand(51)])
k_test_elements[i] = new_string
var start_time, end_time: float
start_time = cpuTime()
for i in 0..(n_elements_to_test - 1):
bf.insert(k_test_elements[i])
end_time = cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal,
precision = 4), " seconds to insert ", n_elements_to_test, " items.")
var false_positives = 0
for i in 0..(n_elements_to_test - 1):
var false_positive_string = ""
for j in 0..8: # By definition not in bf as 9 chars not 8
false_positive_string.add(sample_chars[rand(51)])
if bf.lookup(false_positive_string):
false_positives += 1
echo("N false positives (of ", n_elements_to_test, " lookups): ", false_positives)
echo("False positive rate ", formatFloat(false_positives / n_elements_to_test,
format = ffDecimal, precision = 4))
var lookup_errors = 0
start_time = cpuTime()
for i in 0..(n_elements_to_test - 1):
if not bf.lookup(k_test_elements[i]):
lookup_errors += 1
end_time = cpuTime()
echo("Took ", formatFloat(end_time - start_time, format = ffDecimal,
precision = 4), " seconds to lookup ", n_elements_to_test, " items.")
echo("N lookup errors (should be 0): ", lookup_errors)
# Finally test correct k / m_over_n specification,
# first case raises an error, second works
try:
discard get_m_over_n_bits_for_k(k = 2, target_error = 0.00001)
assert false
except BloomFilterError:
assert true
assert get_m_over_n_bits_for_k(k = 2, target_error = 0.1) == 6
assert get_m_over_n_bits_for_k(k = 7, target_error = 0.01) == 10
assert get_m_over_n_bits_for_k(k = 7, target_error = 0.001) == 16
var bf3 = initialize_bloom_filter(1000, 0.01, k = 4)
assert bf3.n_bits_per_elem == 11