# Bloom Filters Application 2

### This project demonstrates the following application of Bloom filters:
* searching across multiple databases / caches


## Bloom Filter definition

In [1]:
import math 
import mmh3 
from bitarray import bitarray
import time
import os

In [2]:
class BloomFilter(object):
    """Probabilistic set-membership structure backed by a fixed-size bit array.

    ``check`` may report an item that was never added (a false positive), but
    it never returns ``False`` for an item that was added, because bits are
    only ever set and never cleared.
    """

    def __init__(self, items_count, fp_prob):
        """Allocate an empty filter.

        Args:
            items_count: Number of items the filter is expected to hold.
            fp_prob: Target false-positive probability, strictly between 0 and 1.

        Raises:
            ValueError: If ``items_count`` is not positive or ``fp_prob`` is
                outside the open interval (0, 1).
        """
        if items_count <= 0:
            raise ValueError("items_count must be a positive number")
        if not 0 < fp_prob < 1:
            raise ValueError("fp_prob must be strictly between 0 and 1")
        self.fp_prob = fp_prob
        self.size = self.get_size(items_count, fp_prob)
        self.hash_count = self.get_hash_count(self.size, items_count)
        self.bit_array = bitarray(self.size)
        self.bit_array.setall(0)

    def add(self, item):
        """Set the ``hash_count`` bit positions derived from ``item``."""
        for seed in range(self.hash_count):
            # Each seed gives a different hash of the same item; Python's
            # modulo keeps the index non-negative for a positive size.
            self.bit_array[mmh3.hash(item, seed) % self.size] = True

    def check(self, item):
        """Return False if ``item`` was definitely not added, True if it may have been."""
        for seed in range(self.hash_count):
            if not self.bit_array[mmh3.hash(item, seed) % self.size]:
                return False
        return True

    @classmethod
    def get_size(cls, n, p):
        """Return the bit-array size ``m = -(n * ln p) / (ln 2)^2`` as an int.

        The value is truncated and then clamped to at least 1: a size of 0
        would make the ``% self.size`` in ``add``/``check`` raise
        ZeroDivisionError.
        """
        m = -(n * math.log(p)) / (math.log(2) ** 2)
        return max(1, int(m))

    @classmethod
    def get_hash_count(cls, m, n):
        """Return the number of hash functions ``k = (m / n) * ln 2`` as an int.

        The value is truncated and then clamped to at least 1: with 0 hash
        functions ``add`` would set no bits and ``check`` would return True
        for every item.
        """
        k = (m / n) * math.log(2)
        return max(1, int(k))

## Application: searching across multiple databases / caches

In [3]:
# Directory whose files are indexed by bloom_create() and scanned by trad_check().
path = 'data/'
# Bloom filters appended by bloom_create(), one per file it reads.
multiple_bloom = list()

In [4]:
# Parameters passed to BloomFilter by bloom_create():
#   n - expected number of items per filter
#   p - target false-positive probability
n, p = 100000, 0.05

In [5]:
def bloom_create():
    """Build one Bloom filter per regular file in ``path`` and store them in ``multiple_bloom``.

    Each file is read as whitespace-separated words and every distinct word
    is added to that file's filter. Filters are stored in sorted-filename
    order, so the position of a filter in ``multiple_bloom`` is reproducible
    between runs.
    """
    # Reset in place so re-running this cell does not append duplicate filters.
    multiple_bloom.clear()
    # os.listdir returns entries in arbitrary order; sort for a stable index.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
        if not os.path.isfile(filepath):
            continue
        with open(filepath) as word_file:
            valid_words = set(word_file.read().split())
        # Size for at least n items, and for more when the file holds more
        # distinct words, so the false-positive target p is not exceeded.
        bloom = BloomFilter(max(n, len(valid_words)), p)
        for item in valid_words:
            bloom.add(item)
        multiple_bloom.append(bloom)

In [6]:
def bloom_check(value):
    """Print the 1-based position of every filter in ``multiple_bloom`` that may contain ``value``.

    A printed position means "possibly present" (Bloom filters can give false
    positives); a position that is not printed means "definitely absent".
    """
    # Iterate over whatever filters exist instead of assuming exactly four.
    for position, bloom in enumerate(multiple_bloom, start=1):
        if bloom.check(value):
            print(position)

In [7]:
def trad_check(value):
    """Print the path of every regular file in ``path`` that contains ``value`` as a word.

    Each file is read in full and split on whitespace on every call; this is
    the baseline the Bloom-filter lookup is compared against.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
        if not os.path.isfile(filepath):
            continue
        with open(filepath) as word_file:
            words = set(word_file.read().split())
        if value in words:
            print(filepath)

In [8]:
# Populate multiple_bloom with one Bloom filter per file found under `path`.
bloom_create()

In [9]:
# NOTE: the two timings do not measure the same work. trad_check() re-reads
# and splits every file on each call, while the Bloom filters were already
# built by the bloom_create() call above and only the lookup is timed here.

# perf_counter() is the clock intended for measuring short intervals;
# time.time() is a wall clock that can be coarse or be adjusted mid-run.
start_time = time.perf_counter()
trad_check('potato')
tms = time.perf_counter() - start_time
print("Time for Normal search --- %s seconds ---" % (tms))


nst = time.perf_counter()
bloom_check('potato')
bftm = time.perf_counter() - nst
print("Time for bloom filters lookup --- %s seconds ---" % (bftm))

data/database_300000.txt
Time for Normal search --- 0.11062812805175781 seconds ---
3
Time for bloom filters lookup --- 0.00014925003051757812 seconds ---
