Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Replaced djb hash with md5 hash after test shows that djb is not suff…

…icient (at least as it was written).
  • Loading branch information...
commit ff9e023bb7e367029800fb77fb8671c0edd1bea9 1 parent 6b8fde0
@axiak authored
View
1  .gitignore
@@ -0,0 +1 @@
+build
View
4 CHANGELOG
@@ -1,3 +1,7 @@
+0.2.20 2011-11-13)
+ - Fixed hashing to use md5 hash and change the bit computation to pass
+ accuracy test. [BUG]
+
0.1.28 2011-03-12)
- Added check to ensure that the required permissions are available when
opening a file. [BUG]
View
2  LICENSE
@@ -1,4 +1,4 @@
- Copyright (c) 2010 Michael Axiak <mike@axiak.net>
+ Copyright (c) 2010-2011 Michael Axiak <mike@axiak.net>
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
View
3  setup.py
@@ -3,6 +3,7 @@
ext_files = ["src/mmapbitarray.c",
"src/bloomfilter.c",
+ "src/md5.c",
"src/primetester.c",
]
@@ -22,7 +23,7 @@
setup(
name = 'pybloomfiltermmap',
- version = "0.2.0",
+ version = "0.2.2",
author = "Michael Axiak, Rob Stacey",
author_email = "mike@axiak.net",
url = "http://github.com/axiak/pybloomfiltermmap/",
View
2  src/Makefile
@@ -1,5 +1,5 @@
bloomfilter: mmapbitarray.* bloomfilter.*
- gcc -lm -O3 mmapbitarray.c bloomfilter.c -o bf
+ gcc -lm -O3 mmapbitarray.c md5.c bloomfilter.c -o bf
mbarray: mmapbitarray.*
gcc -lm -O3 -DMBAQUERY mmapbitarray.c -o mbaquery
View
60 src/bloomfilter.c
@@ -4,6 +4,7 @@
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
+#include "md5.h"
#include "bloomfilter.h"
@@ -112,6 +113,8 @@ BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int
return bf;
}
+/*
+CODE FOR djb HASH...
uint32_t _hash_char(uint32_t hash_seed, Key * key) {
register uint32_t result = 5381 ^ hash_seed;
register unsigned char * ptr = (unsigned char *)key->shash;
@@ -125,6 +128,63 @@ uint32_t _hash_char(uint32_t hash_seed, Key * key) {
uint32_t _hash_long(uint32_t hash_seed, Key * key) {
return 5381 + (33 * (key->nhash ^ hash_seed));
}
+*/
+
+/*
+
+CODE TO USE SHA512..
+
+#include <openssl/evp.h>
+
+uint32_t _hash_char(uint32_t hash_seed, Key * key) {
+ EVP_MD_CTX ctx;
+ unsigned char result_buffer[64];
+ uint32_t result = 0;
+ unsigned int result_size = sizeof(result);
+
+ EVP_MD_CTX_init(&ctx);
+
+ EVP_DigestInit_ex(&ctx, EVP_sha512(), NULL);
+ EVP_DigestUpdate(&ctx, (const unsigned char *)&hash_seed, sizeof(hash_seed));
+ EVP_DigestUpdate(&ctx, (const unsigned char *)key->shash, key->nhash);
+ EVP_DigestFinal_ex(&ctx, &result_buffer, NULL);
+ return *(uint32_t *)result_buffer;
+}
+
+uint32_t _hash_long(uint32_t hash_seed, Key * key) {
+ Key newKey = {
+ .shash = (char *)key->nhash,
+ .nhash = sizeof(key->nhash)
+ };
+
+ return _hash_char(hash_seed, &newKey);
+}
+*/
+
+
+#include "md5.h"
+uint32_t _hash_char(uint32_t hash_seed, Key * key) {
+ md5_state_t state;
+ md5_byte_t result[16];
+ int i;
+
+ md5_init(&state);
+ md5_append(&state, (md5_byte_t *)&hash_seed, sizeof(uint32_t));
+ md5_append(&state, (md5_byte_t *)key->shash, key->nhash);
+ md5_finish(&state, result);
+ return *(uint32_t *)(&result[4]);
+}
+
+uint32_t _hash_long(uint32_t hash_seed, Key * key) {
+ Key newKey = {
+ .shash = (char *)key->nhash,
+ .nhash = sizeof(key->nhash)
+ };
+
+ return _hash_char(hash_seed, &newKey);
+}
+
+
#if 0
View
394 src/md5.c
@@ -0,0 +1,394 @@
+/*
+ Copyright (C) 1999, 2000, 2002 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost@aladdin.com
+
+ */
+/* $Id: md5.c,v 1.6 2002/04/13 19:20:28 lpd Exp $ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321, whose
+ text is available at
+ http://www.ietf.org/rfc/rfc1321.txt
+ The code is derived from the text of the RFC, including the test suite
+ (section A.5) but excluding the rest of Appendix A. It does not include
+ any code or documentation that is identified in the RFC as being
+ copyrighted.
+
+ The original and principal author of md5.c is L. Peter Deutsch
+ <ghost@aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order
+ either statically or dynamically; added missing #include <string.h>
+ in library.
+ 2002-03-11 lpd Corrected argument list for main(), and added int return
+ type, in test program and T value program.
+ 2002-02-21 lpd Added missing #include <stdio.h> in test program.
+ 2000-07-03 lpd Patched to eliminate warnings about "constant is
+ unsigned in ANSI C, signed in traditional"; made test program
+ self-checking.
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5).
+ 1999-05-03 lpd Original version.
+ */
+
+#include "md5.h"
+#include <string.h>
+#include <limits.h>
+
+#undef BYTE_ORDER /* 1 = big-endian, -1 = little-endian, 0 = unknown */
+#ifdef ARCH_IS_BIG_ENDIAN
+# define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1)
+#else
+# define BYTE_ORDER 0
+#endif
+
+#define T_MASK ((md5_word_t)~0)
+#define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87)
+#define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9)
+#define T3 0x242070db
+#define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111)
+#define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050)
+#define T6 0x4787c62a
+#define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec)
+#define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe)
+#define T9 0x698098d8
+#define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850)
+#define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e)
+#define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841)
+#define T13 0x6b901122
+#define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c)
+#define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71)
+#define T16 0x49b40821
+#define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d)
+#define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf)
+#define T19 0x265e5a51
+#define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855)
+#define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2)
+#define T22 0x02441453
+#define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e)
+#define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437)
+#define T25 0x21e1cde6
+#define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829)
+#define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278)
+#define T28 0x455a14ed
+#define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa)
+#define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07)
+#define T31 0x676f02d9
+#define T32 /* 0x8d2a4c8a */ (T_MASK ^ 0x72d5b375)
+#define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd)
+#define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e)
+#define T35 0x6d9d6122
+#define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3)
+#define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb)
+#define T38 0x4bdecfa9
+#define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f)
+#define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f)
+#define T41 0x289b7ec6
+#define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805)
+#define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a)
+#define T44 0x04881d05
+#define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6)
+#define T46 /* 0xe6db99e5 */ (T_MASK ^ 0x1924661a)
+#define T47 0x1fa27cf8
+#define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a)
+#define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb)
+#define T50 0x432aff97
+#define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58)
+#define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6)
+#define T53 0x655b59c3
+#define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d)
+#define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82)
+#define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e)
+#define T57 0x6fa87e4f
+#define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f)
+#define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb)
+#define T60 0x4e0811a1
+#define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d)
+#define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca)
+#define T63 0x2ad7d2bb
+#define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e)
+
+
+static void
+md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/)
+{
+ md5_word_t
+ a = pms->abcd[0], b = pms->abcd[1],
+ c = pms->abcd[2], d = pms->abcd[3];
+ md5_word_t t;
+#if BYTE_ORDER > 0
+ /* Define storage only for big-endian CPUs. */
+ md5_word_t X[16];
+#else
+ /* Define storage for little-endian or both types of CPUs. */
+ md5_word_t xbuf[16];
+ const md5_word_t *X;
+#endif
+
+ {
+#if BYTE_ORDER == 0
+ /*
+ * Determine dynamically whether this is a big-endian or
+ * little-endian machine, since we can use a more efficient
+ * algorithm on the latter.
+ */
+ static const int w = 1;
+
+ if (*((const md5_byte_t *)&w)) /* dynamic little-endian */
+#endif
+#if BYTE_ORDER <= 0 /* little-endian */
+ {
+ /*
+ * On little-endian machines, we can process properly aligned
+ * data without copying it.
+ */
+ if (!((data - (const md5_byte_t *)0) & 3)) {
+ /* data are properly aligned */
+ X = (const md5_word_t *)data;
+ } else {
+ /* not aligned */
+ memcpy(xbuf, data, 64);
+ X = xbuf;
+ }
+ }
+#endif
+#if BYTE_ORDER == 0
+ else /* dynamic big-endian */
+#endif
+#if BYTE_ORDER >= 0 /* big-endian */
+ {
+ /*
+ * On big-endian machines, we must arrange the bytes in the
+ * right order.
+ */
+ const md5_byte_t *xp = data;
+ int i;
+
+# if BYTE_ORDER == 0
+ X = xbuf; /* (dynamic only) */
+# else
+# define xbuf X /* (static only) */
+# endif
+ for (i = 0; i < 16; ++i, xp += 4)
+ xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24);
+ }
+#endif
+ }
+
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+ /* Round 1. */
+ /* Let [abcd k s i] denote the operation
+ a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
+#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + F(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 0, 7, T1);
+ SET(d, a, b, c, 1, 12, T2);
+ SET(c, d, a, b, 2, 17, T3);
+ SET(b, c, d, a, 3, 22, T4);
+ SET(a, b, c, d, 4, 7, T5);
+ SET(d, a, b, c, 5, 12, T6);
+ SET(c, d, a, b, 6, 17, T7);
+ SET(b, c, d, a, 7, 22, T8);
+ SET(a, b, c, d, 8, 7, T9);
+ SET(d, a, b, c, 9, 12, T10);
+ SET(c, d, a, b, 10, 17, T11);
+ SET(b, c, d, a, 11, 22, T12);
+ SET(a, b, c, d, 12, 7, T13);
+ SET(d, a, b, c, 13, 12, T14);
+ SET(c, d, a, b, 14, 17, T15);
+ SET(b, c, d, a, 15, 22, T16);
+#undef SET
+
+ /* Round 2. */
+ /* Let [abcd k s i] denote the operation
+ a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
+#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + G(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 1, 5, T17);
+ SET(d, a, b, c, 6, 9, T18);
+ SET(c, d, a, b, 11, 14, T19);
+ SET(b, c, d, a, 0, 20, T20);
+ SET(a, b, c, d, 5, 5, T21);
+ SET(d, a, b, c, 10, 9, T22);
+ SET(c, d, a, b, 15, 14, T23);
+ SET(b, c, d, a, 4, 20, T24);
+ SET(a, b, c, d, 9, 5, T25);
+ SET(d, a, b, c, 14, 9, T26);
+ SET(c, d, a, b, 3, 14, T27);
+ SET(b, c, d, a, 8, 20, T28);
+ SET(a, b, c, d, 13, 5, T29);
+ SET(d, a, b, c, 2, 9, T30);
+ SET(c, d, a, b, 7, 14, T31);
+ SET(b, c, d, a, 12, 20, T32);
+#undef SET
+
+ /* Round 3. */
+ /* Let [abcd k s t] denote the operation
+ a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + H(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 5, 4, T33);
+ SET(d, a, b, c, 8, 11, T34);
+ SET(c, d, a, b, 11, 16, T35);
+ SET(b, c, d, a, 14, 23, T36);
+ SET(a, b, c, d, 1, 4, T37);
+ SET(d, a, b, c, 4, 11, T38);
+ SET(c, d, a, b, 7, 16, T39);
+ SET(b, c, d, a, 10, 23, T40);
+ SET(a, b, c, d, 13, 4, T41);
+ SET(d, a, b, c, 0, 11, T42);
+ SET(c, d, a, b, 3, 16, T43);
+ SET(b, c, d, a, 6, 23, T44);
+ SET(a, b, c, d, 9, 4, T45);
+ SET(d, a, b, c, 12, 11, T46);
+ SET(c, d, a, b, 15, 16, T47);
+ SET(b, c, d, a, 2, 23, T48);
+#undef SET
+
+ /* Round 4. */
+ /* Let [abcd k s t] denote the operation
+ a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+#define SET(a, b, c, d, k, s, Ti)\
+ t = a + I(b,c,d) + X[k] + Ti;\
+ a = ROTATE_LEFT(t, s) + b
+ /* Do the following 16 operations. */
+ SET(a, b, c, d, 0, 6, T49);
+ SET(d, a, b, c, 7, 10, T50);
+ SET(c, d, a, b, 14, 15, T51);
+ SET(b, c, d, a, 5, 21, T52);
+ SET(a, b, c, d, 12, 6, T53);
+ SET(d, a, b, c, 3, 10, T54);
+ SET(c, d, a, b, 10, 15, T55);
+ SET(b, c, d, a, 1, 21, T56);
+ SET(a, b, c, d, 8, 6, T57);
+ SET(d, a, b, c, 15, 10, T58);
+ SET(c, d, a, b, 6, 15, T59);
+ SET(b, c, d, a, 13, 21, T60);
+ SET(a, b, c, d, 4, 6, T61);
+ SET(d, a, b, c, 11, 10, T62);
+ SET(c, d, a, b, 2, 15, T63);
+ SET(b, c, d, a, 9, 21, T64);
+#undef SET
+
+ /* Then perform the following additions. (That is increment each
+ of the four registers by the value it had before this block
+ was started.) */
+ pms->abcd[0] += a;
+ pms->abcd[1] += b;
+ pms->abcd[2] += c;
+ pms->abcd[3] += d;
+}
+
+void
+md5_init(md5_state_t *pms)
+{
+ pms->count[0] = pms->count[1] = 0;
+ pms->abcd[0] = 0x67452301;
+ pms->abcd[1] = /*0xefcdab89*/ T_MASK ^ 0x10325476;
+ pms->abcd[2] = /*0x98badcfe*/ T_MASK ^ 0x67452301;
+ pms->abcd[3] = 0x10325476;
+}
+
+void
+md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes)
+{
+ const md5_byte_t *p = data;
+ unsigned int left = nbytes;
+ unsigned int offset = (pms->count[0] >> 3) & 63;
+ md5_word_t nbits = (md5_word_t)(nbytes << 3);
+
+ if (nbytes <= 0)
+ return;
+
+ /* this special case is handled recursively */
+ if (nbytes > INT_MAX - offset) {
+ unsigned int overlap;
+
+ /* handle the append in two steps to prevent overflow */
+ overlap = 64 - offset;
+
+ md5_append(pms, data, overlap);
+ md5_append(pms, data + overlap, nbytes - overlap);
+ return;
+ }
+
+ /* Update the message length. */
+ pms->count[1] += nbytes >> 29;
+ pms->count[0] += nbits;
+ if (pms->count[0] < nbits)
+ pms->count[1]++;
+
+ /* Process an initial partial block. */
+ if (offset) {
+ unsigned int copy = (offset + nbytes > 64 ? 64 - offset : nbytes);
+
+ memcpy(pms->buf + offset, p, copy);
+ if (offset + copy < 64)
+ return;
+ p += copy;
+ left -= copy;
+ md5_process(pms, pms->buf);
+ }
+
+ /* Process full blocks. */
+ for (; left >= 64; p += 64, left -= 64)
+ md5_process(pms, p);
+
+ /* Process a final partial block. */
+ if (left)
+ memcpy(pms->buf, p, left);
+}
+
+void
+md5_finish(md5_state_t *pms, md5_byte_t digest[16])
+{
+ static const md5_byte_t pad[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ md5_byte_t data[8];
+ int i;
+
+ /* Save the length before padding. */
+ for (i = 0; i < 8; ++i)
+ data[i] = (md5_byte_t)(pms->count[i >> 2] >> ((i & 3) << 3));
+ /* Pad to 56 bytes mod 64. */
+ md5_append(pms, pad, ((55 - (pms->count[0] >> 3)) & 63) + 1);
+ /* Append the length. */
+ md5_append(pms, data, 8);
+ for (i = 0; i < 16; ++i)
+ digest[i] = (md5_byte_t)(pms->abcd[i >> 2] >> ((i & 3) << 3));
+}
View
91 src/md5.h
@@ -0,0 +1,91 @@
+/*
+ Copyright (C) 1999, 2002 Aladdin Enterprises. All rights reserved.
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ L. Peter Deutsch
+ ghost@aladdin.com
+
+ */
+/* $Id$ */
+/*
+ Independent implementation of MD5 (RFC 1321).
+
+ This code implements the MD5 Algorithm defined in RFC 1321, whose
+ text is available at
+ http://www.ietf.org/rfc/rfc1321.txt
+ The code is derived from the text of the RFC, including the test suite
+ (section A.5) but excluding the rest of Appendix A. It does not include
+ any code or documentation that is identified in the RFC as being
+ copyrighted.
+
+ The original and principal author of md5.h is L. Peter Deutsch
+ <ghost@aladdin.com>. Other authors are noted in the change history
+ that follows (in reverse chronological order):
+
+ 2002-04-13 lpd Removed support for non-ANSI compilers; removed
+ references to Ghostscript; clarified derivation from RFC 1321;
+ now handles byte order either statically or dynamically.
+ 1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+ 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5);
+ added conditionalization for C++ compilation from Martin
+ Purschke <purschke@bnl.gov>.
+ 1999-05-03 lpd Original version.
+ */
+
+#ifndef md5_INCLUDED
+# define md5_INCLUDED
+
+/*
+ * This package supports both compile-time and run-time determination of CPU
+ * byte order. If ARCH_IS_BIG_ENDIAN is defined as 0, the code will be
+ * compiled to run only on little-endian CPUs; if ARCH_IS_BIG_ENDIAN is
+ * defined as non-zero, the code will be compiled to run only on big-endian
+ * CPUs; if ARCH_IS_BIG_ENDIAN is not defined, the code will be compiled to
+ * run on either big- or little-endian CPUs, but will run slightly less
+ * efficiently on either one than if ARCH_IS_BIG_ENDIAN is defined.
+ */
+
+typedef unsigned char md5_byte_t; /* 8-bit byte */
+typedef unsigned int md5_word_t; /* 32-bit word */
+
+/* Define the state of the MD5 Algorithm. */
+typedef struct md5_state_s {
+ md5_word_t count[2]; /* message length in bits, lsw first */
+ md5_word_t abcd[4]; /* digest buffer */
+ md5_byte_t buf[64]; /* accumulate block */
+} md5_state_t;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Initialize the algorithm. */
+void md5_init(md5_state_t *pms);
+
+/* Append a string to the message. */
+void md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes);
+
+/* Finish the message and return the digest. */
+void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif /* md5_INCLUDED */
View
1,312 src/pybloomfilter.c
657 additions, 655 deletions not shown
View
26 src/pybloomfilter.pyx
@@ -43,7 +43,7 @@ class IndeterminateCountError(ValueError):
cdef class BloomFilter:
"""
The BloomFilter class implements a bloom filter that uses mmap'd files.
- For more information on what a bloom filter is, please read the Wikipedia article about it.
+ For more information on what a bloom filter is, please read the Wikipedia article about it.
"""
cdef cbloomfilter.BloomFilter * _bf
cdef int _closed
@@ -84,11 +84,16 @@ cdef class BloomFilter:
else:
if os.path.exists(filename):
os.unlink(filename)
- num_bits = 5 * math.ceil((capacity * math.log(error_rate)) / math.log(1.0 /
- (math.pow(2.0,
- math.log(2.0)))))
- num_bits = cbloomfilter.next_prime(num_bits)
- num_hashes = int(math.ceil(math.log(2.0) * num_bits / capacity))
+
+ num_hashes = int(math.ceil(math.log(1.0 / error_rate, 2.0)))
+ bits_per_hash = int(math.ceil(
+ (2.0 * capacity * abs(math.log(error_rate))) /
+ (num_hashes * (math.log(2) ** 2))))
+
+ num_bits = num_hashes * bits_per_hash
+ #num_bits = cbloomfilter.next_prime(num_bits)
+ #num_hashes = int(math.ceil(math.log(2.0) * num_bits / capacity))
+
hash_seeds = array.array('I')
hash_seeds.extend([random.getrandbits(32) for i in range(num_hashes)])
test = hash_seeds.tostring()
@@ -107,7 +112,7 @@ cdef class BloomFilter:
def __dealloc__(self):
cbloomfilter.bloomfilter_Destroy(self._bf)
- self._bf = NULL
+ self._bf = NULL
property hash_seeds:
def __get__(self):
@@ -131,6 +136,11 @@ cdef class BloomFilter:
self._assert_open()
return self._bf.num_hashes
+ property num_slices:
+ def __get__(self):
+ self._assert_open()
+ return self._bf.num_hashes
+
property num_bits:
def __get__(self):
self._assert_open()
@@ -167,6 +177,8 @@ cdef class BloomFilter:
if isinstance(item, str):
key.shash = item
key.nhash = len(item)
+ #key.shash = NULL
+ #key.nhash = hash(item)
else:
key.shash = NULL
key.nhash = hash(item)
View
13 tests/accuracytest.py
@@ -1,3 +1,4 @@
+import sys
import os
import tempfile
import pybloomfilter
@@ -6,6 +7,11 @@
TEST_WORDS = os.path.join(os.path.dirname(__file__), 'testwords')
def main():
+ global pybloomfilter
+
+ if len(sys.argv) > 1 and sys.argv[1].lower() == '-pybloom':
+ import pybloom
+ pybloomfilter = pybloom
with open(WORDS_FILE) as base_file:
with open(TEST_WORDS) as test_file:
@@ -21,7 +27,10 @@ def main():
def test_errors(error_rate, filter_size, correct_overlap, num_test_words):
bloom_file = tempfile.NamedTemporaryFile()
- bf = pybloomfilter.BloomFilter(filter_size, error_rate, bloom_file.name)
+ try:
+ bf = pybloomfilter.BloomFilter(filter_size, error_rate, bloom_file.name)
+ except TypeError:
+ bf = pybloomfilter.BloomFilter(filter_size, error_rate)
with open(WORDS_FILE) as source_file:
with open(TEST_WORDS) as test_file:
@@ -43,7 +52,7 @@ def run_test(bf, source_file, test_file, correct_overlap, num_test_words, error_
print "Specified: %f; Measured: %f; num_hashes: %d, num_bits: %d" % (
error_rate,
actual_error_rate,
- bf.num_hashes,
+ bf.num_slices,
bf.num_bits,
)
View
52,142 tests/testwords
25,834 additions, 26,308 deletions not shown
Please sign in to comment.
Something went wrong with that request. Please try again.