Skip to content

Commit

Permalink
first pass at an expanding bloom filter!
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust committed Nov 10, 2018
1 parent 9ad336c commit 84ee18c
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 5 deletions.
6 changes: 4 additions & 2 deletions probables/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
''' pyprobables module '''
from __future__ import (unicode_literals, absolute_import, print_function)
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter)
from . blooms import (BloomFilter, BloomFilterOnDisk, CountingBloomFilter,
ExpandingBloomFilter)
from . countminsketch import (CountMinSketch, HeavyHitters, StreamThreshold,
CountMeanSketch, CountMeanMinSketch)
from . cuckoo import (CuckooFilter, CountingCuckooFilter)
Expand All @@ -20,4 +21,5 @@
'CountMinSketch', 'CountMeanSketch', 'CountMeanMinSketch',
'HeavyHitters', 'StreamThreshold', 'CuckooFilter',
'CountingCuckooFilter', 'InitializationError', 'NotSupportedError',
'ProbablesBaseException', 'CuckooFilterFullError']
'ProbablesBaseException', 'CuckooFilterFullError',
'ExpandingBloomFilter']
4 changes: 3 additions & 1 deletion probables/blooms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@

from . bloom import (BloomFilter, BloomFilterOnDisk)
from . countingbloom import (CountingBloomFilter)
from . expandingbloom import (ExpandingBloomFilter)

__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter']
__all__ = ['BloomFilter', 'BloomFilterOnDisk', 'CountingBloomFilter',
'ExpandingBloomFilter']
4 changes: 2 additions & 2 deletions probables/blooms/basebloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ def __init__(self, blm_type, est_elements=None, false_positive_rate=None,
self._els_added = 0
self._on_disk = False # not on disk
self.__blm_type = blm_type
if self.__blm_type in ['regular', 'reg-ondisk']:
if self.__blm_type in ['regular', 'reg-ondisk', 'expanding']:
self.__impt_type = 'B'
else:
self.__impt_type = 'I'

if blm_type in ['regular', 'reg-ondisk']:
if blm_type in ['regular', 'reg-ondisk', 'expanding']:
msg = ('Insufecient parameters to set up the Bloom Filter')
else:
msg = ('Insufecient parameters to set up the Counting Bloom '
Expand Down
74 changes: 74 additions & 0 deletions probables/blooms/expandingbloom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
''' BloomFilter, python implementation
License: MIT
Author: Tyler Barrus (barrust@gmail.com)
URL: https://github.com/barrust/pyprobables
'''
from __future__ import (unicode_literals, absolute_import, print_function)

from . bloom import (BloomFilter)


class ExpandingBloomFilter(object):
    ''' Bloom filter that expands by chaining extra BloomFilter instances

        Args:
            est_elements (int): Passed to every underlying BloomFilter; \
            also the per-filter threshold that triggers adding a new one
            false_positive_rate (float): Passed to every underlying \
            BloomFilter
            hash_function (function): Passed to every underlying \
            BloomFilter
        Note:
            All arguments are handed to `BloomFilter` unchanged, so any \
            validation of them is done there '''

    def __init__(self, est_elements=None, false_positive_rate=None,
                 hash_function=None):
        ''' store the settings and build the first underlying filter '''
        self.__est_elements = est_elements
        self.__fpr = false_positive_rate
        self.__hash_func = hash_function
        # counts every call to add / add_alt (see add_alt)
        self.__added_elements = 0
        self._blooms = []
        # there must always be at least one filter available
        self.__add_bloom_filter()

    def __contains__(self, key):
        ''' support the `in` operator by delegating to `check` '''
        return self.check(key)

    @property
    def blooms(self):
        ''' list: The underlying filters, oldest first

            Note:
                This is the internal list itself, not a copy '''
        return self._blooms

    @property
    def false_positive_rate(self):
        ''' The false positive rate given at construction '''
        return self.__fpr

    @property
    def estimated_elements(self):
        ''' The estimated number of elements given at construction '''
        return self.__est_elements

    @property
    def elements_added(self):
        ''' int: Number of times `add` / `add_alt` has been called

            Note:
                Calls that did not insert (key already reported as \
                present and `force` not set) are counted too '''
        return self.__added_elements

    def __add_bloom_filter(self):
        ''' append a new BloomFilter built from the stored settings '''
        self._blooms.append(
            BloomFilter(self.__est_elements, self.__fpr, self.__hash_func)
        )

    def __check_for_growth(self):
        ''' append a new filter once the newest one has reached the
            estimated number of elements '''
        newest = self._blooms[-1]
        if newest.elements_added >= self.__est_elements:
            self.__add_bloom_filter()

    def check(self, key):
        ''' Check to see if the key is in any of the underlying filters

            Args:
                key (str): The element to look up
            Returns:
                bool: `True` if any underlying filter reports the key, \
                otherwise `False`
            Note:
                The hashes are calculated by the first underlying \
                filter and reused against all of them '''
        return self.check_alt(self._blooms[0].hashes(key))

    def check_alt(self, hashes):
        ''' Check the underlying filters using pre-calculated hashes

            Args:
                hashes (list): The hash representation of the element
            Returns:
                bool: `True` if any underlying filter reports the \
                hashes, otherwise `False` '''
        # any() stops at the first filter that reports a hit
        return any(blm.check_alt(hashes) for blm in self._blooms)

    def add(self, key, force=False):
        ''' Add the key unless it is already reported as present

            Args:
                key (str): The element to be inserted
                force (bool): `True` will insert without first \
                checking whether the key is already present '''
        self.add_alt(self._blooms[0].hashes(key), force)

    def add_alt(self, hashes, force=False):
        ''' Add an element using its pre-calculated hashes

            Args:
                hashes (list): The hash representation of the element
                force (bool): `True` will insert without first \
                checking whether the hashes are already present
            Note:
                Insertion always goes into the newest filter; a new \
                filter is appended first if the newest has reached the \
                estimated number of elements '''
        # counted on every call, even when nothing is inserted below
        self.__added_elements += 1
        if not force and self.check_alt(hashes):
            return
        self.__check_for_growth()
        self._blooms[-1].add_alt(hashes)
56 changes: 56 additions & 0 deletions tests/expandingbloom_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
''' Unittest class '''
from __future__ import (unicode_literals, absolute_import, print_function)
import unittest
from probables import (ExpandingBloomFilter)

class TestExpandingBloomFilter(unittest.TestCase):
    ''' Tests for the ExpandingBloomFilter '''

    @staticmethod
    def _build(est_elements, num_keys, force=False):
        ''' create a filter (false positive rate 0.05) and add the
            string form of each number in range(num_keys) to it '''
        ebf = ExpandingBloomFilter(est_elements=est_elements,
                                   false_positive_rate=0.05)
        for num in range(num_keys):
            ebf.add("{}".format(num), force)
        return ebf

    def test_ebf_init(self):
        ''' a new expanding bloom filter holds exactly one bloom '''
        ebf = self._build(10, 0)
        self.assertEqual(len(ebf.blooms), 1)

    def test_ebf_add_lots(self):
        ''' forcing in 100 elements with est_elements=10 expands the
            filter to 10 blooms '''
        ebf = self._build(10, 100, force=True)
        self.assertEqual(len(ebf.blooms), 10)

    def test_ebf_add_lots_without_force(self):
        ''' adding 120 elements without force: every add is counted
            while the number of blooms is 10 '''
        # simulate false positives... notice it didn't grow a few...
        ebf = self._build(10, 120)
        self.assertEqual(len(ebf.blooms), 10)
        self.assertEqual(ebf.elements_added, 120)

    def test_ebf_check(self):
        ''' `check` finds added keys after the filter has expanded '''
        # expand it out some first!
        ebf = self._build(25, 100)
        ebf.add('this is a test')
        ebf.add('this is another test')
        self.assertGreater(len(ebf.blooms), 2)
        self.assertEqual(ebf.check('this is a test'), True)
        self.assertEqual(ebf.check('this is another test'), True)
        self.assertEqual(ebf.check('this is yet another test'), False)
        self.assertEqual(ebf.check('this is not another test'), False)

    def test_ebf_contains(self):
        ''' the `in` operator finds added keys after the filter has
            expanded '''
        # expand it out some first!
        ebf = self._build(25, 100)
        ebf.add('this is a test')
        ebf.add('this is another test')
        self.assertGreater(len(ebf.blooms), 2)
        self.assertEqual('this is a test' in ebf, True)
        self.assertEqual('this is another test' in ebf, True)
        self.assertEqual('this is yet another test' in ebf, False)
        self.assertEqual('this is not another test' in ebf, False)

0 comments on commit 84ee18c

Please sign in to comment.