In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from bitarray import bitarray

In [2]:
from sklearn import datasets
from sklearn.datasets import load_breast_cancer

In [3]:
class BloomFilter:
    """Bloom filter for real-valued vectors that hashes via random projections.

    Each of the ``hash_count`` hash functions projects a data point onto a
    random unit vector, squashes the projection through a sigmoid, and scales
    the result to an index into a bit array of length ``size``.

    Attributes:
        unit_vectors: array of shape (dim, hash_count); every column is one
            unit-length projection direction.
        size: number of bits in the filter.
        hash_count: number of hash functions (projection directions).
        bit_array: ``bitarray`` of length ``size``, initially all zeros.
    """

    def __init__(self, size, hash_count, dim, seed=None):
        """Create an empty filter.

        Args:
            size: number of bits in the filter.
            hash_count: number of hash functions to use.
            dim: dimensionality of the vectors that will be hashed.
            seed: optional seed for the projection directions. When ``None``
                the legacy global NumPy RNG is used, so an earlier
                ``np.random.seed(...)`` call still controls the draw.
        """
        rng = np.random if seed is None else np.random.default_rng(seed)
        # One random Gaussian vector per row; these are not yet unit length.
        # The shape comes from the constructor arguments, not notebook globals.
        vectors = rng.normal(0, 1, size=(hash_count, dim))
        row_norms = np.sqrt(np.sum(vectors * vectors, axis=1, keepdims=True))
        # Normalise each row, then transpose so every column is a unit vector.
        self.unit_vectors = np.transpose(vectors / row_norms)
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def give_hash_values(self, X):
        """Return the bit indices that ``X`` hashes to.

        Args:
            X: a single vector of shape (dim,) or a batch of shape (n, dim).

        Returns:
            Integer array of shape (hash_count,) or (n, hash_count) with every
            value in ``[0, size - 1]``; each row holds the hash values of the
            corresponding data point.
        """
        projections = np.dot(X, self.unit_vectors)
        # exp() overflows to inf for very negative projections; the sigmoid is
        # then exactly 0, which is the intended value, so silence the warning.
        with np.errstate(over="ignore"):
            norm_proj = 1 / (1 + np.exp(-projections))
        hash_values = (norm_proj * self.size).astype(int)
        # The sigmoid rounds to exactly 1.0 for large projections, which would
        # yield the out-of-range index `size`; clamp to the last valid bit.
        return np.minimum(hash_values, self.size - 1)

    def add(self, x):
        """Insert a single vector ``x`` into the filter."""
        self.bulk_add(x)

    def bulk_add(self, X):
        """Insert every row of ``X`` into the filter."""
        for index in np.ravel(self.give_hash_values(X)):
            self.bit_array[index] = 1

    def lookup(self, x):
        """Return True if the single vector ``x`` may have been added.

        ``False`` means ``x`` hashes to at least one unset bit; ``True`` means
        every bit it hashes to is set.
        """
        return all(self.bit_array[index] for index in self.give_hash_values(x))

    
# Inputs: a Bloom filter, normalized positive data, normalized negative data.
# Output: -1 if any positive point is a false negative, otherwise the false-positive rate.
def fpr(bf, x_pos, x_neg):
    """Measure the false-positive rate of ``bf`` on ``x_neg``.

    Args:
        bf: filter exposing ``give_hash_values(X)`` and an indexable
            ``bit_array``.
        x_pos: data points expected to be in the filter.
        x_neg: data points expected not to be in the filter.

    Returns:
        -1 if any point of ``x_pos`` hashes to an unset bit (a false
        negative). Otherwise the fraction of ``x_neg`` points whose bits are
        all set, or 0.0 when ``x_neg`` is empty (no false positives observed).
    """
    pos_hash_values = bf.give_hash_values(x_pos)
    neg_hash_values = bf.give_hash_values(x_neg)
    bits = bf.bit_array

    # A single unset bit for a positive point means the filter lost a member.
    for row in pos_hash_values:
        for index in row:
            if bits[index] == 0:
                return -1

    total = len(neg_hash_values)
    if total == 0:
        # Avoid dividing by zero when there is nothing to evaluate.
        return 0.0

    # A negative point is a false positive only when every one of its bits is set.
    fp = sum(1 for row in neg_hash_values if all(bits[index] for index in row))
    return fp / total

In [4]:
# Load scikit-learn's bundled breast-cancer dataset and split it into
# the feature matrix X and the label vector y.
bcancer = load_breast_cancer()
X, y = bcancer.data, bcancer.target

In [5]:
# Sanity-check the dimensions of the features and the labels (one per line).
print(X.shape, y.shape, sep="\n")

(569, 30)
(569,)


In [6]:
# Filter configuration: both values are passed to the BloomFilter constructor below.
input_dim = X.shape[1]  # number of features per data point
k = 10  # number of hash functions (random projection directions)

In [7]:
m = 3000  # number of bits in the Bloom filter's bit array

In [8]:
# The filter draws its projection directions from NumPy's global RNG; seed it
# first so the reported false-positive rate is reproducible on a fresh run.
np.random.seed(0)
bf = BloomFilter(m, k, input_dim)

In [9]:
# Standardize every feature column. NOTE: this rebinds X, so all later cells
# operate on the scaled data and the raw feature matrix is no longer available.
X = StandardScaler().fit_transform(X)

In [10]:
# Split the scaled data by label: label-1 rows are inserted into the filter,
# label-0 rows are held out to measure false positives.
x_pos, x_neg = X[y == 1], X[y == 0]

In [11]:
# Insert every positive data point into the filter.
bf.bulk_add(x_pos)

In [12]:
# Report the false-positive rate on the held-out negatives
# (fpr returns -1 instead if any inserted point is reported as missing).
print(fpr(bf,x_pos,x_neg))

0.05188679245283019
