In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from bitarray import bitarray

In [2]:
from sklearn import datasets
from sklearn.datasets import load_breast_cancer

In [3]:
class BloomFilter:
    def __init__(self, size, hash_count, dim):
        vectors = np.random.normal(0,1,size=(hash_count,dim)) #random vectors(each row) from normal distribution, not unit vectors
        self.unit_vectors = np.transpose(vectors/np.sqrt(np.transpose([np.sum(np.multiply(vectors,vectors),1)]))) #Matrix where each column is a unit vector, used as hash
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def give_hash_values(self,X):
        projections = np.dot(X,self.unit_vectors) #Projections of datapoints on unit vectors
        norm_proj = 1/(1+np.exp(-projections)) #Sigmoid on each value so that they are in the range (0,1)
        hash_values = (norm_proj*self.size).astype(int) #All values are integers in the range [0,size-1]
        return hash_values #Each row contains hash values of that datapoint

    def add(self,x):
        hash_values = self.give_hash_values(x)
        for i in hash_values:
            self.bit_array[i] = 1
        
    def bulk_add(self, X):
        hash_values = self.give_hash_values(X)
        for i in hash_values:
            for j in i:
                self.bit_array[j] = 1
            
    def lookup(self, x):
        hash_values = self.give_hash_values(x)
        for i in hash_values:
            if(self.bit_array[i]==0):
                return False
        return True

    
#(Input:bloom filter,normalized positive data,normalized negative data; Output:(-1) for false negative, otherwise fpr)
def fpr(bf,x_pos,x_neg):
    fp = 0
    tn = 0
    pos_hash_values = bf.give_hash_values(x_pos)
    neg_hash_values = bf.give_hash_values(x_neg)
    for i in pos_hash_values:
        for j in i:
            if(bf.bit_array[j]==0):
                return -1
    for i in neg_hash_values:
        flag = 0
        for j in i:
            if(bf.bit_array[j]==0):
                tn += 1 
                flag = 1           
                break
        if(flag==0):
            fp += 1
    return fp/(fp+tn)

# Breast Cancer Dataset

In [4]:
bcancer = datasets.load_breast_cancer()
X = bcancer.data  
y = bcancer.target

In [5]:
print(X.shape)
print(y.shape)

(569, 30)
(569,)


In [6]:
input_dim = X.shape[1]
k = 10

In [7]:
m = 3000

In [8]:
bf = BloomFilter(m,k,input_dim)

In [9]:
X = StandardScaler().fit_transform(X)

In [10]:
x_pos = X[y==1]
x_neg = X[y==0]

In [11]:
bf.bulk_add(x_pos)

In [12]:
print(fpr(bf,x_pos,x_neg))

0.05660377358490566


# Heart Disease Dataset

In [13]:
import pandas as pd

In [14]:
file = pd.read_csv("heart.csv")
x_train = np.array(file.values[:,:-1])
y=file.values[:,-1]


In [15]:
input_dim = x_train.shape[1]
k = 10

In [16]:
m = 1500

In [17]:
bf = BloomFilter(m,k,input_dim)

In [18]:
X = StandardScaler().fit_transform(x_train)

In [19]:
x_pos = X[y==1]
x_neg = X[y==0]

In [20]:
bf.bulk_add(x_pos)

In [21]:
print(fpr(bf,x_pos,x_neg))

0.057971014492753624


# Wine Dataset

In [22]:
from sklearn.datasets import load_wine

In [23]:
x , y = load_wine(return_X_y=True)

In [24]:
input_dim = x.shape[1]
k = 5

In [25]:
m=250

In [26]:
y[y==1] = 0
y[y==2] = 1

In [27]:
x_train = np.array(x)
y_train = np.array(y.reshape(-1,1))

In [28]:
bf = BloomFilter(m,k,input_dim)

In [29]:
X = StandardScaler().fit_transform(x_train)

In [30]:
x_pos = X[y==1]
x_neg = X[y==0]

In [31]:
bf.bulk_add(x_pos)

In [32]:
print(fpr(bf,x_pos,x_neg))

0.12307692307692308


# Iris Model

In [33]:
iris = datasets.load_iris()
X = iris.data  
y = iris.target

In [34]:
x_pos=iris.data[0:100]
x_neg=iris.data[100:150]
x_train = np.concatenate((x_pos, x_neg)).reshape(-1, 4)
y_train = np.concatenate((np.ones(100), np.zeros(50))).reshape(-1, 1).astype(int)

In [35]:
x_train = StandardScaler().fit_transform(x_train)

In [36]:
input_dim = x_train.shape[1]
k = 10

In [37]:
m = 300

In [38]:
bf = BloomFilter(m,k,input_dim)

In [39]:
X = StandardScaler().fit_transform(x_train)

In [40]:
x_pos = X[y==1]
x_neg = X[y==0]

In [41]:
bf.bulk_add(x_pos)

In [42]:
print(fpr(bf,x_pos,x_neg))

0.02


# 2D Gaussian Data

In [43]:
n=25000
n=4*(n//4)

In [44]:
mu11=5
mu12=12
mu1=[mu11,mu12]
sig11=50
sig12=70
cov1=[[sig11,0],[0,sig12]]
x1 = np.random.multivariate_normal(mu1,cov1,n//4)

mu21=60
mu22=100
mu2=[mu21,mu22]
sig21=30
sig22=15
cov2=[[sig21,0],[0,sig22]]
x2 = np.random.multivariate_normal(mu2,cov2,n//4)

mu31=-30
mu32=50
mu3=[mu31,mu32]
sig31=50
sig32=45
cov3=[[sig31,0],[0,sig32]]
x3 = np.random.multivariate_normal(mu3,cov3,n//4)

mu41=70
mu42=-50
mu4=[mu41,mu42]
sig41=45
sig42=75
cov4=[[sig41,0],[0,sig42]]
x4 = np.random.multivariate_normal(mu4,cov4,n//4)

x_train = np.concatenate((x1,x2,x3,x4))

y_train = np.concatenate((np.zeros(int(n//4)), np.zeros(n//4),np.ones(int(n//4)),np.ones(n//4))).reshape(-1, 1).astype(int)



In [45]:
y = y_train.reshape(-1)

In [46]:
input_dim = x_train.shape[1]
k = 2

In [47]:
m = 40000

In [48]:
X = StandardScaler().fit_transform(x_train)

In [49]:
bf = BloomFilter(m,k,input_dim)

In [50]:
x_pos = X[y==1]
x_neg = X[y==0]

In [51]:
bf.bulk_add(x_pos)

In [52]:
print(fpr(bf,x_pos,x_neg))

0.04584
