lots of changes; fix some issues; better init

barrust · Jul 13, 2017 · b3ba754 · b3ba754
1 parent cc9f6b2
commit b3ba754
Show file tree

Hide file tree

Showing 10 changed files with 197 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,10 @@
+# PyProbables Changelog
+
+### Initial Version:
+* Probabilistic data structures:
+    * Bloom Filter
+    * Bloom Filter (on disk)
+    * Count-Min Sketch
+    * Heavy Hitters
+    * Stream Threshold
+* Import and export of each
diff --git a/README.md b/README.md
diff --git a/README.rst b/README.rst
@@ -0,0 +1,86 @@
+PyProbables
+===========
+
+**pyprobables** is a python library for probabilistic data structures. The goal
+is to provide the developer with a pure-python implementation of common
+probabilistic data-structures to use in their work.
+
+Installation
+------------------
+
+Pip Installation: **coming soon**
+
+::
+
+    $ pip install pyprobables
+
+To install from source:
+
+To install `pyprobables`, simply clone the `repository on GitHub
+<https://github.com/barrust/pyprobables>`__, then run from the folder:
+
+::
+
+    $ python setup.py install
+
+`pyprobables` supports python versions 2.7 and 3.3 - 3.6
+
+Documentation
+-------------
+
+Documentation is currently under development. The documentation of
+the latest release will be hosted on
+`readthedocs.io <http://pyprobables.readthedocs.io/en/stable/?>`__
+
+Once completed, you can build the documentation yourself by running:
+
+::
+
+    $ pip install sphinx
+    $ cd docs/
+    $ make html
+
+
+
+Automated Tests
+------------------
+
+To run automated tests, one must simply run the following command from the
+downloaded folder:
+
+::
+
+  $ python setup.py test
+
+
+Quickstart
+------------------
+
+Import pyprobables and set up a Bloom Filter:
+
+.. code:: python
+
+    >>> from probables import (BloomFilter)
+    >>> blm = BloomFilter(est_elements=1000, false_positive_rate=0.05)
+    >>> blm.add('google.com')
+    >>> blm.check('facebook.com')  # should return False
+    >>> blm.check('google.com')  # should return True
+
+
+Import pyprobables and set up a Count-Min Sketch:
+
+.. code:: python
+
+    >>> from probables import (CountMinSketch)
+    >>> cms = CountMinSketch(width=1000, depth=5)
+    >>> cms.add('google.com')  # should return 1
+    >>> cms.add('facebook.com', 25)  # insert 25 at once; should return 25
+
+See the documentation for other data structures available and for further
+details on how to use them.
+
+Changelog
+------------------
+
+Please see the `changelog
+<https://github.com/barrust/pyprobables/blob/master/CHANGELOG.md>`__ for a list
+of all changes.
diff --git a/probables/blooms/bloom.py b/probables/blooms/bloom.py
@@ -13,6 +13,8 @@
 from shutil import (copyfile)
 from binascii import (hexlify, unhexlify)
 from .. exceptions import (InitializationError, NotSupportedError)
+from .. hashes import (default_fnv_1a)
+from .. utilities import (is_hex_string, is_valid_file)
 
 
 class BloomFilter(object):
@@ -33,13 +35,13 @@ def __init__(self, est_elements=None, false_positive_rate=None,
         self._fpr = 0.0
         self.__number_hashes = 0
         self.__bloom_length = self.number_bits // 8
-        self.__hash_func = self._default_hash
+        self.__hash_func = default_fnv_1a
         self.__els_added = 0
         self._on_disk = False  # not on disk
 
-        if filepath is not None:  # TODO: check that it is a real file?
+        if is_valid_file(filepath):
             self._load(filepath, hash_function)
-        elif hex_string is not None:  # TODO: verify it is actually hex?
+        elif is_hex_string(hex_string):
             self._load_hex(hex_string, hash_function)
         elif est_elements is not None and false_positive_rate is not None:
             self._set_optimized_params(est_elements, false_positive_rate, 0,
@@ -283,7 +285,7 @@ def _set_optimized_params(self, estimated_elements, false_positive_rate,
                               elements_added, hash_function):
         ''' set the parameters to the optimal sizes '''
         if hash_function is None:
-            self.__hash_func = self._default_hash
+            self.__hash_func = default_fnv_1a
         else:
             self.__hash_func = hash_function
         self._est_elements = estimated_elements
@@ -323,29 +325,6 @@ def __cnt_number_bits_set(self):
             setbits += self.__cnt_set_bits(self.get_element(i))
         return setbits
 
-    def _default_hash(self, key, depth):
-        ''' the default fnv-1a hashing routine '''
-        res = list()
-        tmp = key
-        for _ in list(range(0, depth)):
-            if tmp != key:
-                tmp = self.__fnv_1a("{0:x}".format(tmp))
-            else:
-                tmp = self.__fnv_1a(key)
-            res.append(tmp)
-        return res
-
-    @staticmethod
-    def __fnv_1a(key):
-        ''' 64 bit fnv-1a hash '''
-        hval = 14695981039346656073
-        fnv_64_prime = 1099511628211
-        uint64_max = 2 ** 64
-        for tmp_s in key:
-            hval = hval ^ ord(tmp_s)
-            hval = (hval * fnv_64_prime) % uint64_max
-        return hval
-
     @staticmethod
     def get_set_element(tmp_bit):
         ''' wrappper to use similar functions always! '''
@@ -377,6 +356,7 @@ def __init__(self, filepath, est_elements=None, false_positive_rate=None,
         self._on_disk = True
 
         if est_elements is not None and false_positive_rate is not None:
+            # no need to check the file since this will over write it
             fpr = false_positive_rate
             super(BloomFilterOnDisk,
                   self)._set_optimized_params(est_elements, fpr, 0,
@@ -389,9 +369,9 @@ def __init__(self, filepath, est_elements=None, false_positive_rate=None,
                                        false_positive_rate))
                 filepointer.flush()
             self._load(filepath, hash_function)
-        elif hex_string is not None:  # TODO: check to see if is hex?
+        elif hex_string is not None and is_hex_string(hex_string):
             self._load_hex(hex_string, hash_function)
-        elif filepath is not None:  # TODO: should we check if file exists?
+        elif is_valid_file(filepath):
             self._load(filepath, hash_function)
         else:
             msg = ('Insufecient parameters to set up the Bloom Filter')

diff --git a/probables/countminsketch/countminsketch.py b/probables/countminsketch/countminsketch.py
@@ -9,6 +9,8 @@
 import math
 from struct import (pack, unpack, calcsize)
 from .. exceptions import (InitializationError, NotSupportedError)
+from .. hashes import (default_fnv_1a)
+from .. utilities import (is_hex_string, is_valid_file)
 
 
 class CountMinSketch(object):
@@ -30,7 +32,7 @@ def __init__(self, width=None, depth=None, confidence=None,
         self.__int64_t_max = 9223372036854775807
         self.__uint64_t_max = 2 ** 64
 
-        if filepath is not None:
+        if is_valid_file(filepath):
             self.__load(filepath, hash_function)
         elif width is not None and depth is not None:
             self.__width = width
@@ -54,7 +56,7 @@ def __init__(self, width=None, depth=None, confidence=None,
             raise InitializationError(msg)
 
         if hash_function is None:
-            self._hash_function = self.__default_hash
+            self._hash_function = default_fnv_1a
         else:
             self._hash_function = hash_function
 
@@ -205,31 +207,10 @@ def __load(self, filepath, hash_function=None):
             self._bins = list(unpack(rep, filepointer.read(offset)))
 
         if hash_function is None:
-            self._hash_function = self.__default_hash
+            self._hash_function = default_fnv_1a
         else:
             self._hash_function = hash_function
 
-    def __default_hash(self, key, depth):
-        ''' the default fnv-1a hashing routine '''
-        res = list()
-        tmp = key
-        for _ in range(0, depth):
-            if tmp != key:
-                tmp = self.__fnv_1a("{0:x}".format(tmp))
-            else:
-                tmp = self.__fnv_1a(key)
-            res.append(tmp)
-        return res
-
-    def __fnv_1a(self, key):
-        ''' 64 bit fnv-1a hash '''
-        hval = 14695981039346656073
-        fnv_64_prime = 1099511628211
-        for t_str in key:
-            hval = hval ^ ord(t_str)
-            hval = (hval * fnv_64_prime) % self.__uint64_t_max
-        return hval
-
     def __get_values_sorted(self, hashes):
         ''' get the values sorted '''
         bins = list()

diff --git a/probables/hashes.py b/probables/hashes.py
@@ -0,0 +1,50 @@
+''' Probables Hashing library '''
+from __future__ import (unicode_literals, absolute_import, print_function)
+from hashlib import (md5, sha256)
+from struct import (unpack)  # needed to turn digests into numbers
+
+
+UIN64_MAX = 2 ** 64
+
+
+def default_fnv_1a(key, depth):
+    ''' build `depth` chained 64 bit fnv-1a hashes of `key`
+
+        The first hash is taken over `key` itself; every later hash is taken
+        over the lowercase hex rendering of the previous hash value.
+    '''
+    hashes = []
+    current = key
+    for _ in range(depth):
+        # `current` still equals `key` only before the first round
+        source = "{0:x}".format(current) if current != key else key
+        current = fnv_1a(source)
+        hashes.append(current)
+    return hashes
+
+
+def fnv_1a(key):
+    ''' 64 bit fnv-1a hash of the characters in `key` '''
+    offset_basis = 14695981039346656073
+    prime = 1099511628211
+    digest = offset_basis
+    for char in key:
+        digest ^= ord(char)  # xor in the character's code point first ...
+        digest = (digest * prime) % UIN64_MAX  # ... then multiply, kept to 64 bits
+    return digest
+
+
+def default_md5(key, depth):
+    ''' the default md5 hashing routine
+
+        Args:
+            key (str or bytes): the element to hash; text is utf-8 encoded
+            depth (int): the number of hashes to generate
+        Returns:
+            list(int): `depth` unsigned 64 bit integers; each round hashes \
+            the full digest of the previous round
+    '''
+    res = list()
+    # hashlib only accepts bytes, so encode text keys before the first round
+    tmp = key if isinstance(key, bytes) else key.encode('utf-8')
+    for _ in range(depth):
+        tmp = md5(tmp).digest()
+        # keep the number itself (not its string form) so callers can do
+        # arithmetic on it, exactly as with default_fnv_1a
+        res.append(unpack('Q', tmp[:8])[0])  # first 8 bytes as 64 bit number
+    return res
+
+
+def default_sha256(key, depth):
+    ''' the default sha256 hashing routine
+
+        Args:
+            key (str or bytes): the element to hash; text is utf-8 encoded
+            depth (int): the number of hashes to generate
+        Returns:
+            list(int): `depth` unsigned 64 bit integers; each round hashes \
+            the full digest of the previous round
+    '''
+    res = list()
+    # hashlib only accepts bytes, so encode text keys before the first round
+    tmp = key if isinstance(key, bytes) else key.encode('utf-8')
+    for _ in range(depth):
+        tmp = sha256(tmp).digest()
+        # keep the number itself (not its string form) so callers can do
+        # arithmetic on it, exactly as with default_fnv_1a
+        res.append(unpack('Q', tmp[:8])[0])  # first 8 bytes as 64 bit number
+    return res
diff --git a/probables/utilities.py b/probables/utilities.py
@@ -0,0 +1,18 @@
+''' some utility functions '''
+from __future__ import (unicode_literals, absolute_import, print_function)
+import string
+import os
+
+
+def is_hex_string(hex_string):
+    ''' check if the passed in string is really hex
+
+        Args:
+            hex_string (str): the candidate string; may be None
+        Returns:
+            bool: True only for a non-empty string made up of hex digits
+    '''
+    # None and '' carry no hex payload; all() on an empty string would
+    # otherwise report it as valid
+    if not hex_string:
+        return False
+    return all(c in string.hexdigits for c in hex_string)
+
+
+def is_valid_file(filepath):
+    ''' report whether `filepath` is set and names an existing regular file '''
+    return filepath is not None and os.path.isfile(filepath)
diff --git a/setup.py b/setup.py
@@ -21,7 +21,7 @@ def read_file(filepath):
     download_url = '{0}/tarball/v{1}'.format(__url__, __version__),
     install_requires = read_file('./requirements/python').splitlines(),
     packages = ['probables'],
-    long_description = read_file('README.md'),
+    long_description = read_file('README.rst'),
     classifiers = [
         'Development Status :: 5 - Production/Stable',
         'Intended Audience :: Developers',

diff --git a/tests/bloom_test.py b/tests/bloom_test.py
@@ -209,6 +209,12 @@ def test_bf_load_hex(self):
         self.assertEqual('this is a test 11' in blm, False)
         self.assertEqual('this is a test 12' in blm, False)
 
+    def test_bf_load_invalid_hex(self):
+        ''' building a bloom filter from a non-hex string must fail '''
+        bad_hex = '85f240623b6d9459000000000000000a000000000000000a3d4ccccQ'
+        with self.assertRaises(InitializationError):
+            BloomFilter(hex_string=bad_hex)
+
     def test_bf_export_file(self):
         ''' test exporting bloom filter to file '''
         filename = 'test.blm'
@@ -234,6 +240,12 @@ def test_bf_load_file(self):
         self.assertEqual('this is not a test' in blm2, False)
         os.remove(filename)
 
+    def test_bf_load_invalid_file(self):
+        ''' building a bloom filter from a missing file must fail '''
+        missing_path = 'invalid.blm'
+        with self.assertRaises(InitializationError):
+            BloomFilter(filepath=missing_path)
+
     def test_bf_clear(self):
         ''' test clearing out the bloom filter '''
         blm = BloomFilter(est_elements=10, false_positive_rate=0.05)

diff --git a/tests/countminsketch_test.py b/tests/countminsketch_test.py
@@ -200,6 +200,12 @@ def test_cms_load_diff_hash(self):
                             cms2.hashes('this is a test'))
         os.remove(filename)
 
+    def test_cms_load_invalid_file(self):
+        ''' building a count-min sketch from a missing file must fail '''
+        missing_path = 'invalid.cms'
+        with self.assertRaises(InitializationError):
+            CountMinSketch(filepath=missing_path)
+
     def test_cms_different_hash(self):
         ''' test using a different hash function '''
         cms = CountMinSketch(width=1000, depth=5)