Skip to content
Browse files

Initial commit with c library and test

  • Loading branch information...
0 parents commit 368ffcdb3c45e9e6538ef60c1d15aed4555c04b2 Justin Hines committed
Showing with 1,222 additions and 0 deletions.
  1. +1 −0 .gitignore
  2. +59 −0 Makefile
  3. +544 −0 src/dablooms.c
  4. +77 −0 src/dablooms.h
  5. +332 −0 src/md5.c
  6. +39 −0 src/md5.h
  7. +170 −0 src/test_dablooms.c
1 .gitignore
@@ -0,0 +1 @@
+/build/
59 Makefile
@@ -0,0 +1,59 @@
+# Build settings; override on the command line, e.g. `make PREFIX=/opt`.
+PREFIX = /usr/local
+LIBDIR = $(PREFIX)/lib
+DESTDIR =
+
+LDFLAGS = -g
+LDLIBS = -lm
+CFLAGS = -g -Wall
+
+INSTALL = install
+CC = gcc
+AR = ar
+
+SRCDIR = src
+BLDDIR = build
+
+SRCS_LIBDABLOOMS = md5.c dablooms.c
+SRCS_TESTS = test_dablooms.c
+
+OBJS_LIBDABLOOMS = $(patsubst %.c, $(BLDDIR)/%.o, $(SRCS_LIBDABLOOMS))
+OBJS_TESTS = $(patsubst %.c, $(BLDDIR)/%.o, $(SRCS_TESTS))
+
+# Default goal. The test binary only has a rule under $(BLDDIR), so it must
+# be named with its full path here.
+all: $(BLDDIR)/test_dablooms libdablooms
+
+DEPS := $(patsubst %.o, %.o.deps, $(OBJS_LIBDABLOOMS) $(OBJS_TESTS))
+# sort removes duplicates
+-include $(sort $(DEPS))
+
+install: install_libdablooms
+
+$(DESTDIR)$(LIBDIR)/libdablooms.a: $(BLDDIR)/libdablooms.a
+	@echo " INSTALL " $@
+	@$(INSTALL) -d $(dir $@)
+	@$(INSTALL) -D $< $@
+
+install_libdablooms: $(DESTDIR)$(LIBDIR)/libdablooms.a
+
+libdablooms: $(BLDDIR)/libdablooms.a
+
+# Depend on the archive file rather than the phony `libdablooms` target, so
+# the test binary is only relinked when something actually changed.
+$(BLDDIR)/test_dablooms: $(OBJS_TESTS) $(BLDDIR)/libdablooms.a
+	@echo " LD " $@
+	@$(CC) -o $@ $(OBJS_TESTS) -L$(BLDDIR) $(LDFLAGS) -l dablooms $(LDLIBS)
+
+test: $(BLDDIR)/test_dablooms
+	$(BLDDIR)/test_dablooms
+
+# Compile one object and emit its header dependencies next to it.
+$(BLDDIR)/%.o: $(SRCDIR)/%.c
+	@echo " CC " $@
+	@mkdir -p $(dir $@)
+	@$(CC) -o $@ -c $< $(CFLAGS) -MMD -MF $@.deps
+
+$(BLDDIR)/libdablooms.a: $(OBJS_LIBDABLOOMS)
+	@echo " AR " $@
+	@rm -f $@
+	@$(AR) rcs $@ $^
+
+# Remove everything the build produces, including the generated .deps files.
+clean:
+	rm -f $(OBJS_LIBDABLOOMS) $(BLDDIR)/libdablooms.a $(OBJS_TESTS) $(BLDDIR)/test_dablooms $(DEPS)
+
+.PHONY: all clean install install_libdablooms libdablooms test
544 src/dablooms.c
@@ -0,0 +1,544 @@
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <math.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "md5.h"
+#include "dablooms.h"
+
+#define HEADER_BYTES (2*sizeof(uint32_t))
+#define SCALE_HEADER_BYTES (2*sizeof(uint64_t))
+#define SALT_SIZE 16
+
+/* Release a bitmap: unmap its memory, close its file descriptor and free the
+ * struct itself.
+ *
+ * A NULL bitmap is accepted and ignored. munmap() is skipped when there is no
+ * valid mapping (array is NULL or MAP_FAILED), which is the state a bitmap is
+ * in when bitmap_resize() destroys it after a failed lseek/write/mmap.
+ */
+void bitmap_destroy(bitmap_t *bitmap)
+{
+    if (bitmap == NULL) {
+        return;
+    }
+    if (bitmap->array != NULL && (void *)bitmap->array != MAP_FAILED) {
+        if (munmap(bitmap->array, bitmap->bytes) < 0) {
+            perror("Error unmapping memory");
+        }
+    }
+    close(bitmap->fd);
+    free(bitmap);
+}
+
+/* Create the mapping behind a bitmap (array == NULL) or grow an existing one.
+ *
+ * bitmap   - bitmap whose fd backs the mapping
+ * old_size - current mapping length, used only when remapping
+ * new_size - requested mapping length in bytes
+ * fromfile - non-zero when the file already has its final size, in which
+ *            case no byte is written to extend it
+ *
+ * Returns the bitmap on success. On any failure the bitmap is destroyed
+ * (which also closes its fd) and NULL is returned, so the caller must not
+ * touch the bitmap or the fd again.
+ */
+bitmap_t *bitmap_resize(bitmap_t *bitmap, size_t old_size, size_t new_size, int fromfile)
+{
+    int fd = bitmap->fd;
+    void *mapped;
+
+    /* Stretch the file to the appropriate size (make sure the kernel allocates disk space) */
+    if (lseek(fd, new_size, SEEK_SET) < 0) {
+        perror("Error calling lseek() to set file size");
+        goto fail;
+    }
+    /* Write something at the end of the file to ensure it has the new size */
+    if (!fromfile) {
+        if (write(fd, "", 1) < 0) {
+            perror("Error writing last byte of the file");
+            goto fail;
+        }
+    }
+    lseek(fd, 0, SEEK_SET);
+
+    /* New mmap if it doesn't exist, else resize. The result goes through a
+     * temporary so that a failed call does not overwrite the live pointer. */
+    if (bitmap->array == NULL) {
+        mapped = mmap(0, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+        if (mapped == MAP_FAILED) {
+            perror("Error init mmap");
+            goto fail;
+        }
+    } else {
+        mapped = mremap(bitmap->array, old_size, new_size, MREMAP_MAYMOVE);
+        if (mapped == MAP_FAILED) {
+            perror("Error resizing mmap");
+            goto fail;
+        }
+    }
+
+    bitmap->array = mapped;
+    bitmap->bytes = new_size;
+    return bitmap;
+
+fail:
+    /* bitmap_destroy() closes fd itself, so it must not be closed again here */
+    bitmap_destroy(bitmap);
+    return NULL;
+}
+
+/* Allocate a bitmap_t for fd and map `bytes` bytes of the file through
+ * bitmap_resize(). Returns NULL if the allocation or the mapping fails.
+ */
+bitmap_t *bitmap_create(int fd, size_t bytes, int fromfile)
+{
+    bitmap_t *fresh = (bitmap_t *)malloc(sizeof(bitmap_t));
+
+    if (fresh == NULL) {
+        return NULL;
+    }
+
+    fresh->array = NULL;   /* tells bitmap_resize() to mmap rather than mremap */
+    fresh->fd = fd;
+    fresh->bytes = bytes;
+
+    return bitmap_resize(fresh, 0, bytes, fromfile);
+}
+
+/* Add one to the 4-bit counter number `index`, counted from byte `offset`.
+ * Even indexes live in the high nibble of a byte, odd indexes in the low one.
+ * Returns 0 on success; returns -1 and leaves the byte untouched when the
+ * counter already holds 15.
+ */
+int bitmap_increment(bitmap_t *bitmap, unsigned int index, unsigned int offset)
+{
+    uint32_t slot = index / 2 + offset;
+    uint8_t packed = bitmap->array[slot];
+    int is_low = (index % 2 != 0);
+    uint8_t current = is_low ? (packed & 0x0f) : ((packed & 0xf0) >> 4);
+    uint8_t updated;
+
+    if (current == 0x0f) {
+        fprintf(stderr, "4 bit int Overflow\n");
+        return -1;
+    }
+
+    /* the nibble is below 15, so the addition cannot carry out of it */
+    updated = is_low ? (packed + 0x01) : (packed + 0x10);
+    bitmap->array[slot] = updated;
+    return 0;
+}
+
+/* Subtract one from the 4-bit counter number `index`, counted from byte
+ * `offset`. Even indexes use the high nibble, odd indexes the low nibble.
+ * Returns 0 on success; returns -1 and leaves the byte untouched when the
+ * counter is already zero.
+ */
+int bitmap_decrement(bitmap_t *bitmap, unsigned int index, unsigned int offset)
+{
+    uint32_t slot = index / 2 + offset;
+    uint32_t packed = (uint8_t)bitmap->array[slot];
+    int is_low = (index % 2 != 0);
+    uint32_t current = is_low ? (packed & 0x0f) : ((packed & 0xf0) >> 4);
+
+    if (current == 0x00) {
+        fprintf(stderr, "Decrementing zero\n");
+        return -1;
+    }
+
+    /* the nibble is non-zero, so the subtraction cannot borrow from its neighbour */
+    bitmap->array[slot] = is_low ? (packed - 0x01) : (packed - 0x10);
+    return 0;
+}
+
+/* Return non-zero when the 4-bit counter number `index` (counted from byte
+ * `offset`) is non-zero. The value returned is the masked byte, not the
+ * counter itself.
+ */
+int bitmap_check(bitmap_t *bitmap, unsigned int index, unsigned int offset)
+{
+    unsigned int slot = index / 2 + offset;
+    int mask = (index % 2 != 0) ? 0x0f : 0xf0;
+
+    return bitmap->array[slot] & mask;
+}
+
+/* Synchronously write the whole mapping back to its file.
+ * Returns 0 on success, -1 (after reporting via perror) on failure.
+ */
+int bitmap_flush(bitmap_t *bitmap)
+{
+    if (msync(bitmap->array, bitmap->bytes, MS_SYNC) >= 0) {
+        return 0;
+    }
+    perror("Error flushing bitmap to disk");
+    return -1;
+}
+
+/* Build the salt table for a counting bloom filter.
+ * hash_func() takes four 4-byte hash values out of every salted MD5 digest,
+ * so nfuncs / 4 salts (rounded up) are needed. Salt number k is the MD5
+ * digest of the int k, stored as SALT_SIZE bytes in bloom->salts.
+ */
+void create_salts(counting_bloom_t *bloom)
+{
+    int salt_count = bloom->nfuncs / 4;
+    int k = 0;
+
+    if (bloom->nfuncs % 4 != 0) {
+        salt_count++;
+    }
+    bloom->num_salts = salt_count;
+    bloom->salts = calloc(salt_count, SALT_SIZE);
+
+    while (k < salt_count) {
+        struct cvs_MD5Context md5;
+        unsigned char digest[16];
+
+        cvs_MD5Init(&md5);
+        cvs_MD5Update(&md5, (unsigned char *)&k, sizeof(int));
+        cvs_MD5Final(digest, &md5);
+        memcpy(bloom->salts + k * SALT_SIZE, digest, SALT_SIZE);
+        k++;
+    }
+}
+
+/* Compute the nfuncs hash values of `key`.
+ * For every salt an MD5 digest of (salt || key) is produced; each digest is
+ * split into four 4-byte words and every word, reduced modulo
+ * counts_per_func, becomes one hash value. Results are written to `hashes`,
+ * which must have room for nfuncs entries, and `hashes` is returned.
+ */
+unsigned int *hash_func(counting_bloom_t *bloom, const char *key, unsigned int *hashes)
+{
+    unsigned int i, hash_cnt = 0;
+    size_t j;
+    size_t key_len = strlen(key);   /* loop-invariant, so computed once */
+    unsigned char *salts = bloom->salts;
+
+    for (i = 0; i < bloom->num_salts; i++) {
+        struct cvs_MD5Context context;
+        unsigned char checksum[16];
+
+        cvs_MD5Init(&context);
+        cvs_MD5Update(&context, salts + i * SALT_SIZE, SALT_SIZE);
+        cvs_MD5Update(&context, (unsigned char *)key, key_len);
+        cvs_MD5Final(checksum, &context);
+
+        for (j = 0; j < sizeof(checksum); j += sizeof(uint32_t)) {
+            uint32_t hash;
+
+            if (hash_cnt >= bloom->nfuncs) {
+                break;
+            }
+            /* memcpy instead of casting checksum to uint32_t *: the byte
+             * array has no alignment guarantee and the cast breaks aliasing rules */
+            memcpy(&hash, checksum + j, sizeof(hash));
+            hashes[hash_cnt] = hash % bloom->counts_per_func;
+            hash_cnt++;
+        }
+    }
+    return hashes;
+}
+
+/* Free a counting bloom filter together with its header, salts and hash
+ * scratch buffer. The shared bitmap is not touched. NULL is accepted.
+ * Always returns 0.
+ */
+int counting_bloom_destroy(counting_bloom_t *bloom)
+{
+    if (bloom == NULL) {
+        return 0;
+    }
+
+    free(bloom->header);
+    free(bloom->salts);
+    free(bloom->hashes);
+    free(bloom);
+    return 0;
+}
+
+/* Derive the dimensions of a counting bloom filter and allocate its hash
+ * scratch buffer and salts.
+ *
+ * capacity   - number of elements the filter is sized for
+ * error_rate - target false-positive rate
+ * offset     - byte offset of this filter's header inside the shared bitmap;
+ *              bloom->offset is set to the first counter byte after it
+ * id, count  - accepted for the caller's convenience, not used here
+ *
+ * num_bytes is the header plus one byte per two counters, rounded UP so that
+ * an odd number of counters still gets a byte for its last nibble instead of
+ * spilling into whatever follows this filter in the bitmap.
+ * bloom->hashes may be NULL afterwards if calloc failed.
+ */
+counting_bloom_t *bloom_setup(counting_bloom_t *bloom, unsigned int capacity, double error_rate,
+                              unsigned int offset, uint32_t id, unsigned int count)
+{
+    bloom->salts = NULL;
+    bloom->bitmap = NULL;
+    bloom->capacity = capacity;
+    bloom->error_rate = error_rate;
+    bloom->offset = offset + HEADER_BYTES;
+    bloom->nfuncs = (int) ceil(log(1 / error_rate) / log(2));
+    bloom->counts_per_func = (int) ceil(capacity * fabs(log(error_rate)) / (bloom->nfuncs * pow(log(2), 2)));
+    bloom->size = bloom->nfuncs * bloom->counts_per_func;
+    bloom->num_bytes = (bloom->size + 1) / 2 + HEADER_BYTES;
+    bloom->hashes = calloc(bloom->nfuncs, sizeof(unsigned int));
+    create_salts(bloom);
+    return bloom;
+}
+
+/* Insert string `s`: bump one counter per hash function, then bump the
+ * element count stored in the filter's header. Always returns 0.
+ */
+int counting_bloom_add(counting_bloom_t *bloom, const char *s)
+{
+    unsigned int fn;
+    unsigned int *slots = bloom->hashes;
+
+    hash_func(bloom, s, slots);
+
+    /* hash function fn owns the counter range starting at fn * counts_per_func */
+    for (fn = 0; fn < bloom->nfuncs; fn++) {
+        unsigned int counter = slots[fn] + fn * bloom->counts_per_func;
+        bitmap_increment(bloom->bitmap, counter, bloom->offset);
+    }
+    (*bloom->header->count)++;
+
+    return 0;
+}
+
+/* Remove string `s`: lower one counter per hash function, then lower the
+ * element count stored in the filter's header. Always returns 0.
+ */
+int counting_bloom_remove(counting_bloom_t *bloom, const char *s)
+{
+    unsigned int fn;
+    unsigned int *slots = bloom->hashes;
+
+    hash_func(bloom, s, slots);
+
+    /* hash function fn owns the counter range starting at fn * counts_per_func */
+    for (fn = 0; fn < bloom->nfuncs; fn++) {
+        unsigned int counter = slots[fn] + fn * bloom->counts_per_func;
+        bitmap_decrement(bloom->bitmap, counter, bloom->offset);
+    }
+    (*bloom->header->count)--;
+
+    return 0;
+}
+
+/* Membership test for string `s`.
+ * Returns 1 when every counter selected by the hash functions is non-zero,
+ * 0 as soon as one of them is zero.
+ */
+int counting_bloom_check(counting_bloom_t *bloom, const char *s)
+{
+    unsigned int fn;
+    unsigned int *slots = bloom->hashes;
+
+    hash_func(bloom, s, slots);
+
+    for (fn = 0; fn < bloom->nfuncs; fn++) {
+        unsigned int counter = slots[fn] + fn * bloom->counts_per_func;
+        if (bitmap_check(bloom->bitmap, counter, bloom->offset) == 0) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Tear down a scaling bloom filter: every counting filter, the filter
+ * array, the header struct, the shared bitmap and the struct itself.
+ *
+ * NULL is accepted. bloom->bitmap may be NULL, because bloom_from_scale()
+ * stores the result of bitmap_resize()/bitmap_create() there and both
+ * return NULL on failure; in that case there is nothing left to unmap.
+ * Always returns 0.
+ */
+int scaling_bloom_destroy(scaling_bloom_t *bloom)
+{
+    int i;
+
+    if (bloom == NULL) {
+        return 0;
+    }
+    for (i = (int)bloom->num_blooms - 1; i >= 0; i--) {
+        counting_bloom_destroy(bloom->blooms[i]);
+    }
+    free(bloom->blooms);
+    free(bloom->header);
+    if (bloom->bitmap != NULL) {
+        bitmap_destroy(bloom->bitmap);
+    }
+    free(bloom);
+    return 0;
+}
+
+/* Append a new counting bloom filter to a scaling bloom filter.
+ *
+ * id, count - values stored in the new filter's on-disk header
+ * fromfile  - zero: grow (or create) the bitmap to make room for the filter;
+ *             non-zero: the bitmap already maps the whole file and the
+ *             filter is laid over the existing bytes
+ *
+ * Returns the new filter, or NULL on failure. On failure the scaling filter
+ * keeps its previous filters; nothing allocated for the new one is leaked.
+ * If growing the bitmap itself failed, bloom->bitmap is NULL afterwards.
+ */
+counting_bloom_t *bloom_from_scale(scaling_bloom_t *bloom, uint32_t id, unsigned int count, int fromfile)
+{
+    unsigned int i, offset;
+    double error_rate;
+    size_t old_size, new_size;
+    counting_bloom_t *cur_bloom;
+    counting_bloom_t **grown;
+
+    error_rate = bloom->error_rate * (pow(.9, bloom->num_blooms + 1));
+    old_size = bloom->num_bytes;
+
+    /* The array pointer is only known to be valid once a filter exists, so
+     * the first slot is malloc'ed. realloc goes through a temporary so the
+     * existing array is not lost when it fails. */
+    if (bloom->num_blooms) {
+        grown = realloc(bloom->blooms, (bloom->num_blooms + 1) * sizeof(counting_bloom_t *));
+        if (grown == NULL) {
+            fprintf(stderr, "Error, could not realloc a new bloom filter\n");
+            return NULL;
+        }
+    } else {
+        grown = malloc(sizeof(counting_bloom_t *));
+        if (grown == NULL) {
+            fprintf(stderr, "Error, Could not malloc a new bloom filter\n");
+            return NULL;
+        }
+    }
+    bloom->blooms = grown;
+
+    if ((cur_bloom = malloc(sizeof(counting_bloom_t))) == NULL) {
+        fprintf(stderr, "Error, could not malloc a new counting bloom filter\n");
+        return NULL;
+    }
+    if ((cur_bloom->header = malloc(sizeof(counting_bloom_header_t))) == NULL) {
+        fprintf(stderr, "Error, could not malloc size for pointers of headers\n");
+        free(cur_bloom);
+        return NULL;
+    }
+
+    bloom_setup(cur_bloom, bloom->capacity, error_rate, old_size, id, 0);
+    if (cur_bloom->hashes == NULL || cur_bloom->salts == NULL) {
+        fprintf(stderr, "Error, could not allocate hash state for a new bloom filter\n");
+        counting_bloom_destroy(cur_bloom);
+        return NULL;
+    }
+
+    new_size = bloom->num_bytes + cur_bloom->num_bytes;
+    if (! fromfile) {
+        if (bloom->num_blooms) {
+            bloom->bitmap = bitmap_resize(bloom->bitmap, old_size, new_size, fromfile);
+        } else {
+            bloom->bitmap = bitmap_create(bloom->fd, new_size, fromfile);
+        }
+    }
+    if (bloom->bitmap == NULL) {
+        fprintf(stderr, "Error, could not map storage for a new bloom filter\n");
+        counting_bloom_destroy(cur_bloom);
+        return NULL;
+    }
+    /* When loading, the filter must fit inside what the file really holds */
+    if (fromfile && new_size > bloom->bitmap->bytes) {
+        fprintf(stderr, "Error, bloom filter does not fit in the mapped file\n");
+        counting_bloom_destroy(cur_bloom);
+        return NULL;
+    }
+
+    bloom->blooms[bloom->num_blooms] = cur_bloom;
+    bloom->num_blooms++;
+
+    /* Set these values, as mmap may have moved */
+    bloom->header->preseq = (uint64_t *)(bloom->bitmap->array);
+    bloom->header->posseq = (uint64_t *)(bloom->bitmap->array + sizeof(uint64_t));
+
+    /* Re-point every filter's header into the (possibly moved) mapping */
+    for (i = 0; i < bloom->num_blooms; i++) {
+        offset = bloom->blooms[i]->offset - HEADER_BYTES;
+        bloom->blooms[i]->header->id = (uint32_t *)(bloom->bitmap->array + offset);
+        bloom->blooms[i]->header->count = (uint32_t *)(bloom->bitmap->array + offset + sizeof(uint32_t));
+    }
+
+    /* set the value for the current pointers */
+    *cur_bloom->header->count = count;
+    *cur_bloom->header->id = id;
+
+    bloom->num_bytes = new_size;
+    cur_bloom->bitmap = bloom->bitmap;
+
+    return cur_bloom;
+}
+
+/* Create a new, empty scaling bloom filter backed by `filename`.
+ * The file is created or truncated. `id` becomes the id of the first
+ * counting filter. Returns NULL on any failure, releasing what was
+ * allocated up to that point.
+ */
+scaling_bloom_t *scaling_bloom_create(unsigned int capacity, double error_rate, const char *filename, uint32_t id)
+{
+    scaling_bloom_t *bloom;
+    int fd;
+
+    /* calloc so that blooms and bitmap start out NULL rather than garbage */
+    if ((bloom = calloc(1, sizeof(scaling_bloom_t))) == NULL) {
+        return NULL;
+    }
+
+    if ((bloom->header = malloc(sizeof(scaling_bloom_header_t))) == NULL) {
+        fprintf(stderr, "Error Malloc for scaling bloom header failed\n");
+        free(bloom);
+        return NULL;
+    }
+
+    if ((fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) {
+        perror("Opening File Failed");
+        fprintf(stderr, " %s \n", filename);
+        free(bloom->header);
+        free(bloom);
+        return NULL;
+    }
+
+    bloom->fd = fd;
+    bloom->capacity = capacity;
+    bloom->error_rate = error_rate;
+    bloom->num_blooms = 0;
+    bloom->num_bytes = SCALE_HEADER_BYTES;
+
+    if (bloom_from_scale(bloom, id, 0, 0) == NULL) {
+        fprintf(stderr, "ERROR, Could not create counting bloom\n");
+        if (bloom->bitmap != NULL) {
+            scaling_bloom_destroy(bloom);
+        } else {
+            /* No mapping exists. fd is deliberately not closed here: a
+             * failed bitmap_create() destroys its bitmap, which closes fd. */
+            free(bloom->blooms);
+            free(bloom->header);
+            free(bloom);
+        }
+        return NULL;
+    }
+    return bloom;
+}
+
+/* Add string `s`, tagged with `id`, to the scaling bloom filter.
+ *
+ * The element goes into the newest counting filter whose id is not greater
+ * than `id`. This is the same rule scaling_bloom_remove() uses, so a later
+ * remove with the same id decrements the counters that were incremented
+ * here. If no filter qualifies, the oldest filter is used.
+ *
+ * When the chosen filter is the newest one, is at capacity, and `id` is
+ * greater than its id, a new filter with this id is appended and used.
+ *
+ * Returns 1 on success, 0 if there is no filter to add to or a new filter
+ * could not be created.
+ */
+int scaling_bloom_add(scaling_bloom_t *bloom, const char *s, uint32_t id)
+{
+    int top = (int)bloom->num_blooms - 1;
+    int i, id_diff = 0;
+    counting_bloom_t *cur_bloom = NULL;
+
+    if (top < 0) {
+        return 0;
+    }
+
+    for (i = top; i >= 0; i--) {
+        cur_bloom = bloom->blooms[i];
+        id_diff = id - (*cur_bloom->header->id);
+        if (id_diff >= 0) {
+            break;
+        }
+    }
+
+    /* Only scale from the newest filter, and never for an id equal to its
+     * own: that element belongs to the filter it already names. */
+    if (i == top && id_diff > 0 &&
+        (*(cur_bloom->header->count)) >= cur_bloom->capacity - 1) {
+        cur_bloom = bloom_from_scale(bloom, id, 0, 0);
+        if (cur_bloom == NULL) {
+            return 0;
+        }
+    }
+
+    /* The sequence numbers bracket the update so a reader of the file can
+     * detect a write that was interrupted half-way. */
+    (*bloom->header->preseq) ++;
+    counting_bloom_add(cur_bloom, s);
+    (*bloom->header->posseq) ++;
+    return 1;
+}
+
+/* Remove string `s`, tagged with `id`, from the newest counting filter
+ * whose id is not greater than `id`. Returns 1 if such a filter was found,
+ * 0 otherwise.
+ */
+int scaling_bloom_remove(scaling_bloom_t *bloom, const char *s, uint32_t id)
+{
+    int slot;
+
+    for (slot = bloom->num_blooms - 1; slot >= 0; slot--) {
+        counting_bloom_t *candidate = bloom->blooms[slot];
+        int id_diff = id - (*candidate->header->id);
+
+        if (id_diff < 0) {
+            continue;
+        }
+        /* bracket the update with the two sequence counters */
+        (*bloom->header->preseq)++;
+        counting_bloom_remove(candidate, s);
+        (*bloom->header->posseq)++;
+        return 1;
+    }
+    return 0;
+}
+
+/* Return 1 if any counting filter, newest first, reports `s` as present;
+ * 0 if none does.
+ */
+int scaling_bloom_check(scaling_bloom_t *bloom, const char *s)
+{
+    int slot = bloom->num_blooms - 1;
+
+    while (slot >= 0) {
+        if (counting_bloom_check(bloom->blooms[slot], s)) {
+            return 1;
+        }
+        slot--;
+    }
+    return 0;
+}
+
+/* Flush the shared bitmap to disk. The result of bitmap_flush() is not
+ * propagated; this function always returns 1.
+ */
+int scaling_bloom_flush(scaling_bloom_t *bloom)
+{
+    (void) bitmap_flush(bloom->bitmap);
+    return 1;
+}
+
+
+/* Load a scaling bloom filter from a file written by this library.
+ *
+ * capacity and error_rate must be the values the file was created with,
+ * because the size of every counting filter is recomputed from them.
+ *
+ * Returns NULL, with everything released, when the file cannot be opened or
+ * mapped, is too small, has mismatching sequence numbers (an interrupted
+ * write), or its length does not match the recomputed filter sizes.
+ */
+scaling_bloom_t *scaling_bloom_from_file(unsigned int capacity, double error_rate, const char *filename)
+{
+    int fd;
+    off_t size;
+    uint32_t id;
+    uint32_t stored_count;
+    unsigned int count, offset;
+
+    scaling_bloom_t *bloom;
+    counting_bloom_t *cur_bloom;
+
+    /* calloc so that header, blooms and bitmap start out NULL */
+    if ((bloom = calloc(1, sizeof(scaling_bloom_t))) == NULL) {
+        return NULL;
+    }
+
+    bloom->capacity = capacity;
+    bloom->error_rate = error_rate;
+    bloom->num_blooms = 0;
+    bloom->num_bytes = SCALE_HEADER_BYTES;
+
+    if ((fd = open(filename, O_RDWR, (mode_t)0600)) < 0) {
+        fprintf(stderr, "ERROR: Could not open file %s with open: \n", filename);
+        perror("");
+        free(bloom);
+        return NULL;
+    }
+    bloom->fd = fd;
+
+    if ((bloom->header = malloc(sizeof(scaling_bloom_header_t))) == NULL) {
+        fprintf(stderr, "Error, Malloc failed for scaling_bloom_header_t\n");
+        goto fail_unmapped;
+    }
+    if ((size = lseek(fd, 0, SEEK_END)) < 0) {
+        perror("Error calling lseek() to tell file size");
+        goto fail_unmapped;
+    }
+    if (size == 0) {
+        fprintf(stderr, "ERROR: File size zero\n");
+        goto fail_unmapped;
+    }
+    if ((size_t)size <= SCALE_HEADER_BYTES) {
+        fprintf(stderr, "ERROR: File too small to hold a scaling bloom header\n");
+        goto fail_unmapped;
+    }
+    if ((bloom->bitmap = bitmap_create(fd, size, 1)) == NULL) {
+        fprintf(stderr, "ERROR: Could not create bitmap with file\n");
+        /* fd is not closed here: a failed bitmap_resize() destroys the
+         * bitmap, and bitmap_destroy() closes the descriptor. */
+        free(bloom->header);
+        free(bloom);
+        return NULL;
+    }
+
+    /* From here on the struct is fully formed and scaling_bloom_destroy()
+     * can release it, including the mapping and fd. */
+    bloom->header->preseq = (uint64_t *)(bloom->bitmap->array);
+    bloom->header->posseq = (uint64_t *)(bloom->bitmap->array + sizeof(uint64_t));
+    if (*bloom->header->preseq != *bloom->header->posseq) {
+        fprintf(stderr, "ERROR: File corrupt, seq nums not equal %llu %llu\n",
+                (unsigned long long)*bloom->header->preseq,
+                (unsigned long long)*bloom->header->posseq);
+        scaling_bloom_destroy(bloom);
+        return NULL;
+    }
+
+    /* bitmap_resize() writes one byte past the requested size when it
+     * creates a file, hence the extra 1 */
+    offset = SCALE_HEADER_BYTES;
+    size -= offset + 1;
+    while (size > 0) {
+        if ((size_t)size < HEADER_BYTES) {
+            scaling_bloom_destroy(bloom);
+            fprintf(stderr, "ERROR: Actual filesize and expected filesize are not equal\n");
+            return NULL;
+        }
+        /* filter headers are not necessarily 4-byte aligned in the file */
+        memcpy(&id, bloom->bitmap->array + offset, sizeof(id));
+        memcpy(&stored_count, bloom->bitmap->array + offset + sizeof(uint32_t), sizeof(stored_count));
+        count = stored_count;
+
+        cur_bloom = bloom_from_scale(bloom, id, count, 1);
+        if (cur_bloom == NULL) {
+            scaling_bloom_destroy(bloom);
+            fprintf(stderr, "ERROR: Could not load counting bloom from file\n");
+            return NULL;
+        }
+        size -= (off_t)cur_bloom->num_bytes;
+        offset += cur_bloom->num_bytes;
+        if (size < 0) {
+            scaling_bloom_destroy(bloom);
+            fprintf(stderr, "ERROR: Actual filesize and expected filesize are not equal\n");
+            return NULL;
+        }
+    }
+    return bloom;
+
+fail_unmapped:
+    /* nothing is mapped yet, so release the pieces by hand */
+    free(bloom->header);
+    free(bloom);
+    close(fd);
+    return NULL;
+}
77 src/dablooms.h
@@ -0,0 +1,77 @@
+#ifndef __BLOOM_H__
+#define __BLOOM_H__
+#include<stdint.h>
+#include<stdlib.h>
+
+typedef struct { /* file-backed mmap()ed byte array; each byte packs two 4-bit counters */
+ size_t bytes; /* length of the mapping in bytes; set by bitmap_resize() */
+ int fd; /* descriptor of the backing file; closed by bitmap_destroy() */
+ char *array; /* start of the mapping; NULL until bitmap_resize() maps it */
+} bitmap_t;
+
+
+bitmap_t *bitmap_resize(bitmap_t *bitmap, size_t old_size, size_t new_size, int fromfile);
+bitmap_t *bitmap_create(int fd, size_t bytes, int fromfile);
+
+int bitmap_increment(bitmap_t *bitmap, unsigned int index, unsigned int offset);
+int bitmap_decrement(bitmap_t *bitmap, unsigned int index, unsigned int offset);
+int bitmap_check(bitmap_t *bitmap, unsigned int index, unsigned int offset);
+
+int bitmap_flush(bitmap_t *bitmap);
+void bitmap_destroy(bitmap_t *bitmap);
+
+int bitmap_get_bit(bitmap_t *bitmap, unsigned int index);
+int bitmap_set_bit(bitmap_t *bitmap, unsigned int index, unsigned int val);
+
+typedef struct { /* pointers into the mapped bitmap; re-pointed by bloom_from_scale() when the mapping may have moved */
+ uint32_t *count; /* element count, bumped by counting_bloom_add/remove; stored 4 bytes after id */
+ uint32_t *id; /* id of this filter; first field of the on-disk header */
+} counting_bloom_header_t;
+
+
+typedef struct { /* one fixed-size counting bloom filter living inside a shared bitmap */
+ counting_bloom_header_t *header; /* pointers to this filter's id/count inside the bitmap */
+ unsigned int capacity; /* element count the filter was sized for */
+ unsigned int offset; /* byte offset of the first counter, i.e. just past this filter's header */
+ unsigned int counts_per_func; /* number of counters owned by each hash function */
+ unsigned int num_salts; /* number of SALT_SIZE-byte salts in salts */
+ unsigned char *salts; /* salt table built by create_salts() */
+ unsigned int *hashes; /* scratch buffer of nfuncs hash values, reused on every operation */
+ size_t nfuncs; /* number of hash functions */
+ size_t size; /* total number of counters: nfuncs * counts_per_func */
+ size_t num_bytes; /* bytes this filter occupies in the bitmap, header included */
+ double error_rate; /* false-positive rate this filter was sized for */
+ bitmap_t *bitmap; /* shared bitmap; owned by the scaling filter, not freed by counting_bloom_destroy() */
+
+} counting_bloom_t;
+
+int counting_bloom_destroy(counting_bloom_t *bloom);
+int counting_bloom_add(counting_bloom_t *bloom, const char *s);
+int counting_bloom_remove(counting_bloom_t *bloom, const char *s);
+int counting_bloom_check(counting_bloom_t *bloom, const char *s);
+
+typedef struct { /* pointers to the two sequence counters at the very start of the mapped file */
+ uint64_t *preseq; /* incremented before each add/remove */
+ uint64_t *posseq; /* incremented after each add/remove; scaling_bloom_from_file() rejects files where the two differ */
+} scaling_bloom_header_t;
+
+typedef struct { /* a growing list of counting bloom filters sharing one file-backed bitmap */
+ scaling_bloom_header_t *header; /* pointers to the sequence counters in the bitmap */
+ unsigned int capacity; /* capacity given to every counting filter */
+ unsigned int num_blooms; /* number of entries in blooms */
+ size_t num_bytes; /* bytes of the bitmap in use: scaling header plus all filters */
+ size_t size; /* NOTE(review): not referenced by the dablooms.c shown with this header - confirm whether it is needed */
+ double error_rate; /* base error rate; bloom_from_scale() multiplies it by 0.9^(n+1) for the n-th filter */
+ int fd; /* backing file descriptor, set by scaling_bloom_create() */
+ counting_bloom_t **blooms; /* the counting filters, oldest first */
+ bitmap_t *bitmap; /* shared mapping; may be NULL after a failed resize */
+} scaling_bloom_t;
+
+scaling_bloom_t *scaling_bloom_create(unsigned int capacity, double error_rate, const char *filename, uint32_t id);
+scaling_bloom_t *scaling_bloom_from_file(unsigned int capacity, double error_rate, const char *filename);
+int scaling_bloom_destroy(scaling_bloom_t *bloom);
+int scaling_bloom_add(scaling_bloom_t *bloom, const char *s, uint32_t id);
+int scaling_bloom_remove(scaling_bloom_t *bloom, const char *s, uint32_t id);
+int scaling_bloom_check(scaling_bloom_t *bloom, const char *s);
+int scaling_bloom_flush(scaling_bloom_t *bloom);
+#endif
332 src/md5.c
@@ -0,0 +1,332 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ */
+
+/* This code was modified in 1997 by Jim Kingdon of Cyclic Software to
+ not require an integer type which is exactly 32 bits. This work
+ draws on the changes for the same purpose by Tatu Ylonen
+ <ylo@cs.hut.fi> as part of SSH, but since I didn't actually use
+ that code, there is no copyright issue. I hereby disclaim
+ copyright in any changes I have made; this code remains in the
+ public domain. */
+
+/* Note regarding cvs_* namespace: this avoids potential conflicts
+ with libraries such as some versions of Kerberos. No particular
+ need to worry about whether the system supplies an MD5 library, as
+ this file is only about 3k of object code. */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string.h> /* for memcpy() and memset() */
+
+/* Add prototype support. */
+#ifndef PROTO
+#if defined (USE_PROTOTYPES) ? USE_PROTOTYPES : defined (__STDC__)
+#define PROTO(ARGS) ARGS
+#else
+#define PROTO(ARGS) ()
+#endif
+#endif
+
+#include "md5.h"
+
+/* Little-endian byte-swapping routines. Note that these do not
+ depend on the size of datatypes such as cvs_uint32, nor do they require
+ us to detect the endianness of the machine we are running on. It
+ is possible they should be macros for speed, but I would be
+ surprised if they were a performance bottleneck for MD5. */
+
+/* Read four bytes at ADDR as a little-endian 32-bit value. */
+static cvs_uint32
+getu32 (addr)
+     const unsigned char *addr;
+{
+  unsigned long value = addr[3];
+
+  value = (value << 8) | addr[2];
+  value = (value << 8) | addr[1];
+  value = (value << 8) | addr[0];
+  return value;
+}
+
+/* Store the low 32 bits of DATA at ADDR, least significant byte first. */
+static void
+putu32 (data, addr)
+     cvs_uint32 data;
+     unsigned char *addr;
+{
+  int k;
+
+  for (k = 0; k < 4; k++)
+    addr[k] = (unsigned char)(data >> (8 * k));
+}
+
+/*
+ * Begin a new MD5 computation: load the four fixed initialization
+ * constants into the state words and reset the 64-bit bit counter.
+ */
+void
+cvs_MD5Init (ctx)
+     struct cvs_MD5Context *ctx;
+{
+  static const cvs_uint32 initial_state[4] = {
+    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+  };
+  int k;
+
+  for (k = 0; k < 4; k++)
+    ctx->buf[k] = initial_state[k];
+
+  ctx->bits[0] = 0;
+  ctx->bits[1] = 0;
+}
+
+/*
+ * Feed LEN more bytes from BUF into the digest in CTX. Full 64-byte
+ * blocks are transformed immediately; a trailing partial block is kept
+ * in ctx->in until more data or cvs_MD5Final arrives.
+ */
+void
+cvs_MD5Update (ctx, buf, len)
+     struct cvs_MD5Context *ctx;
+     unsigned char const *buf;
+     unsigned len;
+{
+  cvs_uint32 old_low = ctx->bits[0];
+  /* Bytes already waiting in ctx->in from earlier calls */
+  cvs_uint32 pending = (old_low >> 3) & 0x3f;
+
+  /* Advance the 64-bit bit counter, carrying from the low word */
+  ctx->bits[0] = (old_low + ((cvs_uint32)len << 3)) & 0xffffffff;
+  if (ctx->bits[0] < old_low)
+    ctx->bits[1]++;
+  ctx->bits[1] += len >> 29;
+
+  /* Top up a partially filled block first */
+  if (pending != 0) {
+    unsigned char *fill = ctx->in + pending;
+    cvs_uint32 room = 64 - pending;
+
+    if (len < room) {
+      memcpy(fill, buf, len);
+      return;
+    }
+    memcpy(fill, buf, room);
+    cvs_MD5Transform (ctx->buf, ctx->in);
+    buf += room;
+    len -= room;
+  }
+
+  /* Then consume whole 64-byte blocks */
+  for (; len >= 64; buf += 64, len -= 64) {
+    memcpy(ctx->in, buf, 64);
+    cvs_MD5Transform (ctx->buf, ctx->in);
+  }
+
+  /* Keep whatever is left for the next call */
+  memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, least significant word first),
+ * write the 16-byte digest to DIGEST and wipe the whole context.
+ */
+void
+cvs_MD5Final (digest, ctx)
+     unsigned char digest[16];
+     struct cvs_MD5Context *ctx;
+{
+  unsigned count;
+  unsigned char *p;
+
+  /* Compute number of bytes mod 64 */
+  count = (ctx->bits[0] >> 3) & 0x3F;
+
+  /* Set the first char of padding to 0x80. This is safe since there is
+     always at least one byte free */
+  p = ctx->in + count;
+  *p++ = 0x80;
+
+  /* Bytes of padding needed to make 64 bytes */
+  count = 64 - 1 - count;
+
+  /* Pad out to 56 mod 64 */
+  if (count < 8) {
+    /* Two lots of padding: Pad the first block to 64 bytes */
+    memset(p, 0, count);
+    cvs_MD5Transform (ctx->buf, ctx->in);
+
+    /* Now fill the next block with 56 bytes */
+    memset(ctx->in, 0, 56);
+  } else {
+    /* Pad block to 56 bytes */
+    memset(p, 0, count-8);
+  }
+
+  /* Append length in bits and transform */
+  putu32(ctx->bits[0], ctx->in + 56);
+  putu32(ctx->bits[1], ctx->in + 60);
+
+  cvs_MD5Transform (ctx->buf, ctx->in);
+  putu32(ctx->buf[0], digest);
+  putu32(ctx->buf[1], digest + 4);
+  putu32(ctx->buf[2], digest + 8);
+  putu32(ctx->buf[3], digest + 12);
+  /* sizeof(*ctx), not sizeof(ctx): the latter is only the size of the
+     pointer and would leave most of the state in memory */
+  memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z))) /* same result as the commented-out form above */
+#define F2(x, y, z) F1(z, x, y) /* F1 with its arguments rotated */
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm: add f(x,y,z) and the
+   data word to w, truncate to 32 bits (cvs_uint32 may be wider), rotate
+   left by s bits, then add x. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+ ( w += f(x, y, z) + data, w &= 0xffffffff, w = w<<s | w>>(32-s), w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ *
+ * buf   - the four state words, updated in place
+ * inraw - one 64-byte input block, decoded here with getu32
+ */
+void
+cvs_MD5Transform (buf, inraw)
+ cvs_uint32 buf[4];
+ const unsigned char inraw[64];
+{
+ register cvs_uint32 a, b, c, d;
+ cvs_uint32 in[16];
+ int i;
+
+ for (i = 0; i < 16; ++i) /* decode the block into 16 words */
+ in[i] = getu32 (inraw + 4 * i);
+
+ a = buf[0]; /* work on copies; the state is updated once at the end */
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[ 0]+0xd76aa478, 7); /* round 1: 16 steps with F1 */
+ MD5STEP(F1, d, a, b, c, in[ 1]+0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[ 2]+0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[ 3]+0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[ 4]+0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[ 5]+0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[ 6]+0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[ 7]+0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[ 8]+0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[ 9]+0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10]+0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11]+0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12]+0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13]+0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14]+0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15]+0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[ 1]+0xf61e2562, 5); /* round 2: 16 steps with F2 */
+ MD5STEP(F2, d, a, b, c, in[ 6]+0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11]+0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[ 0]+0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[ 5]+0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10]+0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15]+0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[ 4]+0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[ 9]+0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14]+0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[ 3]+0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[ 8]+0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13]+0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[ 2]+0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[ 7]+0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12]+0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[ 5]+0xfffa3942, 4); /* round 3: 16 steps with F3 */
+ MD5STEP(F3, d, a, b, c, in[ 8]+0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11]+0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14]+0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[ 1]+0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[ 4]+0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[ 7]+0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10]+0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13]+0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[ 0]+0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[ 3]+0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[ 6]+0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[ 9]+0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12]+0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15]+0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[ 2]+0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[ 0]+0xf4292244, 6); /* round 4: 16 steps with F4 */
+ MD5STEP(F4, d, a, b, c, in[ 7]+0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14]+0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[ 5]+0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12]+0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[ 3]+0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10]+0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[ 1]+0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[ 8]+0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15]+0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[ 6]+0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13]+0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[ 4]+0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11]+0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[ 2]+0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[ 9]+0xeb86d391, 21);
+
+ buf[0] += a; /* fold this block's result into the running state */
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+#endif
+
+#ifdef TEST
+/* Simple test program. Can use it to manually run the tests from
+ RFC1321 for example. */
+#include <stdio.h>
+
+/* Print the MD5 digest of every command-line argument, one per line, in
+   the "MD5 ("string") = hex" form. Returns 1 with a usage message when
+   no argument is given, 0 otherwise. */
+int
+main (int argc, char **argv)
+{
+  struct cvs_MD5Context context;
+  unsigned char checksum[16];
+  int i;
+  int j;
+
+  if (argc < 2)
+    {
+      fprintf (stderr, "usage: %s string-to-hash\n", argv[0]);
+      /* return instead of exit(): <stdlib.h> is not included in this
+         file, so exit would be an undeclared function */
+      return 1;
+    }
+  for (j = 1; j < argc; ++j)
+    {
+      printf ("MD5 (\"%s\") = ", argv[j]);
+      cvs_MD5Init (&context);
+      cvs_MD5Update (&context, (unsigned char const *) argv[j],
+                     strlen (argv[j]));
+      cvs_MD5Final (checksum, &context);
+      for (i = 0; i < 16; i++)
+        {
+          printf ("%02x", (unsigned int) checksum[i]);
+        }
+      printf ("\n");
+    }
+  return 0;
+}
+#endif /* TEST */
39 src/md5.h
@@ -0,0 +1,39 @@
+/* See md5.c for explanation and copyright information. */
+
+/*
+ * $FreeBSD: src/contrib/cvs/lib/md5.h,v 1.2 1999/12/11 15:10:02 peter Exp $
+ */
+
+/* Add prototype support. */
+#ifndef PROTO
+#if defined (USE_PROTOTYPES) ? USE_PROTOTYPES : defined (__STDC__)
+#define PROTO(ARGS) ARGS
+#else
+#define PROTO(ARGS) ()
+#endif
+#endif
+
+#ifndef MD5_H
+#define MD5_H
+
+/* Unlike previous versions of this code, uint32 need not be exactly
+ 32 bits, merely 32 bits or more. Choosing a data type which is 32
+ bits instead of 64 is not important; speed is considerably more
+ important. ANSI guarantees that "unsigned long" will be big enough,
+ and always using it seems to have few disadvantages. */
+typedef unsigned long cvs_uint32;
+
+struct cvs_MD5Context { /* running state of one MD5 computation */
+ cvs_uint32 buf[4]; /* the four state words; written out as the digest by cvs_MD5Final */
+ cvs_uint32 bits[2]; /* count of bits processed so far: [0] low 32 bits, [1] high bits */
+ unsigned char in[64]; /* input bytes buffered until a full 64-byte block is available */
+};
+
+void cvs_MD5Init PROTO ((struct cvs_MD5Context *context));
+void cvs_MD5Update PROTO ((struct cvs_MD5Context *context,
+ unsigned char const *buf, unsigned len));
+void cvs_MD5Final PROTO ((unsigned char digest[16],
+ struct cvs_MD5Context *context));
+void cvs_MD5Transform PROTO ((cvs_uint32 buf[4], const unsigned char in[64]));
+
+#endif /* !MD5_H */
170 src/test_dablooms.c
@@ -0,0 +1,170 @@
+#include<stdio.h>
+#include<string.h>
+#include<sys/types.h>
+#include<sys/stat.h>
+#include<fcntl.h>
+#include<stdlib.h>
+
+#include"dablooms.h"
+
+#define FILEPATH "/tmp/bloom.bin"
+#define CAPACITY 100000
+#define ERROR_RATE .05
+
+/* Exercise the raw bitmap: bump 2000 counters, verify they read as set,
+ * lower them again and verify they read as clear.
+ * Returns 0 when every check passed, EXIT_FAILURE otherwise.
+ */
+int test_bitmap()
+{
+    int fd;
+    int fail = 0;
+    int pass = 0;
+    int i = 0;
+    bitmap_t *map;
+
+    /* start from a clean slate; a missing file is not an error */
+    remove(FILEPATH);
+
+    /* open() reports failure with -1, and 0 is a valid descriptor */
+    if ((fd = open(FILEPATH, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600)) < 0) {
+        fprintf(stderr, "ERROR: Could not open file %s with open\n", FILEPATH);
+        return EXIT_FAILURE;
+    }
+
+    /* 1000 bytes hold 2000 four-bit counters */
+    if (!(map = bitmap_create(fd, 1000, 0))) {
+        fprintf(stderr, "ERROR: Could not create bitmap with file\n");
+        return EXIT_FAILURE;
+    }
+
+    for (i = 0; i < 2000; i++) {
+        bitmap_increment(map, i, 0);
+    }
+
+    for (i = 0; i < 2000; i++) {
+        if (bitmap_check(map, i, 0)) {
+            pass++;
+        } else {
+            fail++;
+        }
+    }
+
+    for (i = 0; i < 2000; i++) {
+        bitmap_decrement(map, i, 0);
+    }
+
+    for (i = 0; i < 2000; i++) {
+        if (bitmap_check(map, i, 0)) {
+            fail++;
+        } else {
+            pass++;
+        }
+    }
+
+    if (fail) {
+        fprintf(stderr, "failures %i\n", fail);
+    } else {
+        fprintf(stderr, ".");
+    }
+    bitmap_destroy(map);
+
+    return fail ? EXIT_FAILURE : 0;
+}
+
+/* Cut `word` at its first carriage return and then at its first newline,
+ * so that the line ending read by fgets() is dropped.
+ */
+void chomp_line(char *word)
+{
+    static const char line_endings[] = { '\r', '\n' };
+    size_t k;
+
+    for (k = 0; k < sizeof(line_endings); k++) {
+        char *hit = strchr(word, line_endings[k]);
+        if (hit != NULL) {
+            *hit = '\0';
+        }
+    }
+}
+
+/* End-to-end test of the scaling bloom filter.
+ * Adds every line of /usr/share/dict/words (using the line number as id),
+ * removes every fifth one, flushes, reloads the filter from its file and
+ * reports how many lookups gave the expected answer.
+ * Returns 0 when the test ran to completion, EXIT_FAILURE when a filter or
+ * the word list could not be opened.
+ */
+int test_scale()
+{
+    FILE *fp;
+    char word[128];
+    scaling_bloom_t *bloom;
+    int i = 0;
+    int not_exist_pass = 0, not_exist_fail = 0;
+    int exist_pass = 0, exist_fail = 0;
+
+    /* start from a clean slate; a missing file is not an error */
+    remove(FILEPATH);
+
+    if (!(bloom = scaling_bloom_create(CAPACITY, ERROR_RATE, FILEPATH, 0))) {
+        fprintf(stderr, "ERROR: Could not create bloom filter\n");
+        return EXIT_FAILURE;
+    }
+
+    if (!(fp = fopen("/usr/share/dict/words", "r"))) {
+        fprintf(stderr, "ERROR: Could not open words file\n");
+        scaling_bloom_destroy(bloom);
+        return EXIT_FAILURE;
+    }
+
+    for (i = 0; fgets(word, sizeof(word), fp); i++) {
+        chomp_line(word);
+        scaling_bloom_add(bloom, word, i);
+    }
+
+    fseek(fp, 0, SEEK_SET);
+    for (i = 0; fgets(word, sizeof(word), fp); i++) {
+        if (i % 5 == 0) {
+            chomp_line(word);
+            scaling_bloom_remove(bloom, word, i);
+        }
+    }
+
+    bitmap_flush(bloom->bitmap);
+    scaling_bloom_destroy(bloom);
+
+    /* reload from disk; the lookups below would dereference NULL otherwise */
+    if (!(bloom = scaling_bloom_from_file(CAPACITY, ERROR_RATE, FILEPATH))) {
+        fprintf(stderr, "ERROR: Could not load bloom filter from file\n");
+        fclose(fp);
+        return EXIT_FAILURE;
+    }
+
+    fseek(fp, 0, SEEK_SET);
+    for (i = 0; fgets(word, sizeof(word), fp); i++) {
+        chomp_line(word);
+        if (i % 5 == 0) {
+            /* removed above, so it should no longer be reported */
+            if (!(scaling_bloom_check(bloom, word))) {
+                not_exist_pass ++;
+            } else {
+                not_exist_fail ++;
+            }
+        } else {
+            if (scaling_bloom_check(bloom, word)) {
+                exist_pass ++;
+            } else {
+                fprintf(stderr, "%s\n", word);
+                exist_fail ++;
+            }
+        }
+    }
+
+    fprintf(stderr, "non exist pass: %i\n", not_exist_pass);
+    fprintf(stderr, "non exist fail: %i\n", not_exist_fail);
+    fprintf(stderr, "exist pass: %i\n", exist_pass);
+    fprintf(stderr, "exist fail: %i\n", exist_fail);
+
+    fclose(fp);
+    scaling_bloom_destroy(bloom);
+
+    return 0;
+}
+
+/* Test driver: runs the scaling bloom test (the bitmap test is currently
+ * disabled) and finishes the progress output with a newline.
+ */
+int main(int argc, char *argv[])
+{
+    /*test_bitmap(); */
+    test_scale();
+
+    fputs("\n", stderr);
+
+    return EXIT_SUCCESS;
+}

0 comments on commit 368ffcd

Please sign in to comment.
Something went wrong with that request. Please try again.