Skip to content

Commit

Permalink
lots of changes; fix some issues; better init
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust committed Jul 13, 2017
1 parent cc9f6b2 commit b3ba754
Show file tree
Hide file tree
Showing 10 changed files with 197 additions and 56 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# PyProbables Changelog

### Initial Version:
* Probabilistic data structures:
* Bloom Filter
* Bloom Filter (on disk)
* Count-Min Sketch
* Heavy Hitters
* Stream Threshold
* Import and export of each
2 changes: 0 additions & 2 deletions README.md

This file was deleted.

86 changes: 86 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
PyProbables
===========

**pyprobables** is a python library for probabilistic data structures. The goal
is to provide the developer with a pure-python implementation of common
probabilistic data-structures to use in their work.

Installation
------------------

Pip Installation: **coming soon**

::

$ pip install pyprobables

To install from source:

To install `pyprobables`, simply clone the `repository on GitHub
<https://github.com/barrust/pyprobables>`__, then run from the folder:

::

$ python setup.py install

`pyprobables` supports python versions 2.7 and 3.3 - 3.6

Documentation
-------------

Documentation is currently under development. The documentation of
the latest release will be hosted on
`readthedocs.io <http://pyprobables.readthedocs.io/en/stable/?>`__

Once completed, you can build the documentation yourself by running:

::

$ pip install sphinx
$ cd docs/
$ make html



Automated Tests
------------------

To run automated tests, one must simply run the following command from the
downloaded folder:

::

$ python setup.py test


Quickstart
------------------

Import pyprobables and set up a Bloom Filter:

.. code:: python

    >>> from probables import (BloomFilter)
    >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05)
    >>> blm.add('google.com')
    >>> blm.check('facebook.com')  # should return False
    >>> blm.check('google.com')  # should return True


Import pyprobables and set up a Count-Min Sketch:

.. code:: python

    >>> from probables import (CountMinSketch)
    >>> cms = CountMinSketch(width=1000, depth=5)
    >>> cms.add('google.com')  # should return 1
    >>> cms.add('facebook.com', 25)  # insert 25 at once; should return 25

See the documentation for other data structures available and for further
details.

Changelog
------------------

Please see the `changelog
<https://github.com/barrust/pyprobables/blob/master/CHANGELOG.md>`__ for a list
of all changes.
38 changes: 9 additions & 29 deletions probables/blooms/bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from shutil import (copyfile)
from binascii import (hexlify, unhexlify)
from .. exceptions import (InitializationError, NotSupportedError)
from .. hashes import (default_fnv_1a)
from .. utilities import (is_hex_string, is_valid_file)


class BloomFilter(object):
Expand All @@ -33,13 +35,13 @@ def __init__(self, est_elements=None, false_positive_rate=None,
self._fpr = 0.0
self.__number_hashes = 0
self.__bloom_length = self.number_bits // 8
self.__hash_func = self._default_hash
self.__hash_func = default_fnv_1a
self.__els_added = 0
self._on_disk = False # not on disk

if filepath is not None: # TODO: check that it is a real file?
if is_valid_file(filepath):
self._load(filepath, hash_function)
elif hex_string is not None: # TODO: verify it is actually hex?
elif is_hex_string(hex_string):
self._load_hex(hex_string, hash_function)
elif est_elements is not None and false_positive_rate is not None:
self._set_optimized_params(est_elements, false_positive_rate, 0,
Expand Down Expand Up @@ -283,7 +285,7 @@ def _set_optimized_params(self, estimated_elements, false_positive_rate,
elements_added, hash_function):
''' set the parameters to the optimal sizes '''
if hash_function is None:
self.__hash_func = self._default_hash
self.__hash_func = default_fnv_1a
else:
self.__hash_func = hash_function
self._est_elements = estimated_elements
Expand Down Expand Up @@ -323,29 +325,6 @@ def __cnt_number_bits_set(self):
setbits += self.__cnt_set_bits(self.get_element(i))
return setbits

def _default_hash(self, key, depth):
''' the default fnv-1a hashing routine '''
res = list()
tmp = key
for _ in list(range(0, depth)):
if tmp != key:
tmp = self.__fnv_1a("{0:x}".format(tmp))
else:
tmp = self.__fnv_1a(key)
res.append(tmp)
return res

@staticmethod
def __fnv_1a(key):
''' 64 bit fnv-1a hash '''
hval = 14695981039346656073
fnv_64_prime = 1099511628211
uint64_max = 2 ** 64
for tmp_s in key:
hval = hval ^ ord(tmp_s)
hval = (hval * fnv_64_prime) % uint64_max
return hval

@staticmethod
def get_set_element(tmp_bit):
''' wrappper to use similar functions always! '''
Expand Down Expand Up @@ -377,6 +356,7 @@ def __init__(self, filepath, est_elements=None, false_positive_rate=None,
self._on_disk = True

if est_elements is not None and false_positive_rate is not None:
# no need to check the file since this will over write it
fpr = false_positive_rate
super(BloomFilterOnDisk,
self)._set_optimized_params(est_elements, fpr, 0,
Expand All @@ -389,9 +369,9 @@ def __init__(self, filepath, est_elements=None, false_positive_rate=None,
false_positive_rate))
filepointer.flush()
self._load(filepath, hash_function)
elif hex_string is not None: # TODO: check to see if is hex?
elif hex_string is not None and is_hex_string(hex_string):
self._load_hex(hex_string, hash_function)
elif filepath is not None: # TODO: should we check if file exists?
elif is_valid_file(filepath):
self._load(filepath, hash_function)
else:
msg = ('Insufecient parameters to set up the Bloom Filter')
Expand Down
29 changes: 5 additions & 24 deletions probables/countminsketch/countminsketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import math
from struct import (pack, unpack, calcsize)
from .. exceptions import (InitializationError, NotSupportedError)
from .. hashes import (default_fnv_1a)
from .. utilities import (is_hex_string, is_valid_file)


class CountMinSketch(object):
Expand All @@ -30,7 +32,7 @@ def __init__(self, width=None, depth=None, confidence=None,
self.__int64_t_max = 9223372036854775807
self.__uint64_t_max = 2 ** 64

if filepath is not None:
if is_valid_file(filepath):
self.__load(filepath, hash_function)
elif width is not None and depth is not None:
self.__width = width
Expand All @@ -54,7 +56,7 @@ def __init__(self, width=None, depth=None, confidence=None,
raise InitializationError(msg)

if hash_function is None:
self._hash_function = self.__default_hash
self._hash_function = default_fnv_1a
else:
self._hash_function = hash_function

Expand Down Expand Up @@ -205,31 +207,10 @@ def __load(self, filepath, hash_function=None):
self._bins = list(unpack(rep, filepointer.read(offset)))

if hash_function is None:
self._hash_function = self.__default_hash
self._hash_function = default_fnv_1a
else:
self._hash_function = hash_function

def __default_hash(self, key, depth):
''' the default fnv-1a hashing routine '''
res = list()
tmp = key
for _ in range(0, depth):
if tmp != key:
tmp = self.__fnv_1a("{0:x}".format(tmp))
else:
tmp = self.__fnv_1a(key)
res.append(tmp)
return res

def __fnv_1a(self, key):
''' 64 bit fnv-1a hash '''
hval = 14695981039346656073
fnv_64_prime = 1099511628211
for t_str in key:
hval = hval ^ ord(t_str)
hval = (hval * fnv_64_prime) % self.__uint64_t_max
return hval

def __get_values_sorted(self, hashes):
''' get the values sorted '''
bins = list()
Expand Down
50 changes: 50 additions & 0 deletions probables/hashes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
''' Probables Hashing library '''
from __future__ import (unicode_literals, absolute_import, print_function)
from hashlib import (md5, sha256)
from struct import (unpack) # needed to turn digests into numbers


# Modulus (2 ** 64) used to wrap hash values into the unsigned 64-bit range;
# note that this is one more than the largest value a uint64 can hold.
UIN64_MAX = 2 ** 64


def default_fnv_1a(key, depth):
    ''' build a chain of `depth` fnv-1a hashes for `key`

        The first hash is computed from the key itself; every following hash
        is computed from the lowercase hex representation of the hash that
        came before it.

        Args:
            key: the value to hash
            depth: the number of hashes to generate
        Returns:
            list: `depth` hash values, in the order they were generated
    '''
    hashes = []
    previous = key
    for _ in range(depth):
        # until a hash has been produced `previous` is still the key itself
        if previous == key:
            source = key
        else:
            source = "{0:x}".format(previous)
        previous = fnv_1a(source)
        hashes.append(previous)
    return hashes


def fnv_1a(key):
''' 64 bit fnv-1a hash '''
hval = 14695981039346656073
fnv_64_prime = 1099511628211
for t_str in key:
hval = hval ^ ord(t_str)
hval = (hval * fnv_64_prime) % UIN64_MAX
return hval


def default_md5(key, depth):
    ''' the default md5 hashing routine

        Args:
            key: the text (or bytes) to hash; text is encoded as utf-8
            depth: the number of hashes to generate
        Returns:
            list: `depth` integers, each taken from the first 8 bytes of \
            the corresponding md5 digest
        Note:
            Each round hashes the raw digest produced by the previous round.
            The values are returned as integers (not strings) so that they
            have the same type as the results of `default_fnv_1a`.
    '''
    res = list()
    # hashlib only accepts bytes, so text keys must be encoded first
    tmp = key if isinstance(key, bytes) else key.encode('utf-8')
    for _ in range(depth):
        tmp = md5(tmp).digest()
        res.append(unpack('Q', tmp[:8])[0])  # turn into 64 bit number
    return res


def default_sha256(key, depth):
    ''' the default sha256 hashing routine

        Args:
            key: the text (or bytes) to hash; text is encoded as utf-8
            depth: the number of hashes to generate
        Returns:
            list: `depth` integers, each taken from the first 8 bytes of \
            the corresponding sha256 digest
        Note:
            Each round hashes the raw digest produced by the previous round.
            The values are returned as integers (not strings) so that they
            have the same type as the results of `default_fnv_1a`.
    '''
    res = list()
    # hashlib only accepts bytes, so text keys must be encoded first
    tmp = key if isinstance(key, bytes) else key.encode('utf-8')
    for _ in range(depth):
        tmp = sha256(tmp).digest()
        res.append(unpack('Q', tmp[:8])[0])  # turn into 64 bit number
    return res
18 changes: 18 additions & 0 deletions probables/utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
''' some utility functions '''
from __future__ import (unicode_literals, absolute_import, print_function)
import string
import os


def is_hex_string(hex_string):
    ''' check if the passed in string is really hex

        Args:
            hex_string: the value to check
        Returns:
            bool: True only if the value is non-empty and every character \
            in it is a hexadecimal digit
        Note:
            None, the empty string and values that cannot be iterated over
            character by character are reported as not hex instead of
            raising an exception.
    '''
    if not hex_string:
        # covers None and ''; `all` over nothing would otherwise be True
        return False
    hex_chars = set(string.hexdigits)
    try:
        return all(char in hex_chars for char in hex_string)
    except TypeError:
        # not iterable (or holds unhashable items): cannot be a hex string
        return False


def is_valid_file(filepath):
    ''' report whether `filepath` refers to an existing regular file

        Args:
            filepath: the path to check; may be None
        Returns:
            bool: False when no path was given, otherwise the result of \
            `os.path.isfile` for that path
    '''
    return filepath is not None and os.path.isfile(filepath)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def read_file(filepath):
download_url = '{0}/tarball/v{1}'.format(__url__, __version__),
install_requires = read_file('./requirements/python').splitlines(),
packages = ['probables'],
long_description = read_file('README.md'),
long_description = read_file('README.rst'),
classifiers = [
'Development Status :: 5 - Production/Stable',
'Intended Audience :: Developers',
Expand Down
12 changes: 12 additions & 0 deletions tests/bloom_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,12 @@ def test_bf_load_hex(self):
self.assertEqual('this is a test 11' in blm, False)
self.assertEqual('this is a test 12' in blm, False)

def test_bf_load_invalid_hex(self):
    ''' a bloom filter must refuse to load from a string that is not hex '''
    bad_hex = '85f240623b6d9459000000000000000a000000000000000a3d4ccccQ'
    with self.assertRaises(InitializationError):
        BloomFilter(hex_string=bad_hex)

def test_bf_export_file(self):
''' test exporting bloom filter to file '''
filename = 'test.blm'
Expand All @@ -234,6 +240,12 @@ def test_bf_load_file(self):
self.assertEqual('this is not a test' in blm2, False)
os.remove(filename)

def test_bf_load_invalid_file(self):
    ''' a bloom filter must refuse to load from an invalid filepath '''
    missing_path = 'invalid.blm'
    with self.assertRaises(InitializationError):
        BloomFilter(filepath=missing_path)

def test_bf_clear(self):
''' test clearing out the bloom filter '''
blm = BloomFilter(est_elements=10, false_positive_rate=0.05)
Expand Down
6 changes: 6 additions & 0 deletions tests/countminsketch_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,12 @@ def test_cms_load_diff_hash(self):
cms2.hashes('this is a test'))
os.remove(filename)

def test_cms_load_invalid_file(self):
    ''' a count-min sketch must refuse to load from an invalid filepath '''
    missing_path = 'invalid.cms'
    with self.assertRaises(InitializationError):
        CountMinSketch(filepath=missing_path)

def test_cms_different_hash(self):
''' test using a different hash function '''
cms = CountMinSketch(width=1000, depth=5)
Expand Down

0 comments on commit b3ba754

Please sign in to comment.