In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from bitarray import bitarray

In [2]:
from sklearn import datasets
from sklearn.datasets import load_breast_cancer

In [3]:
class BloomFilter:
    def __init__(self, size, hash_count, dim):
        vectors = np.random.normal(0,1,size=(hash_count,dim)) #random vectors(each row) from normal distribution, not unit vectors
        i=0
        #while(i<hash_count):
           # if(vectors[i][0]<0):
               # print("initial v", vectors[i])
             #   vectors[i]=np.random.normal(0,1,size=(1,dim))
                #print("changed v", vectors[i])
            #else: i=i+1
       # print(vectors)        
        self.unit_vectors = np.transpose(vectors)
        #self.unit_vectors =np.transpose(vectors/np.sqrt(np.transpose([np.sum(np.multiply(vectors,vectors),1)]))) 
        #Matrix where each column is a unit vector, used as hash
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def give_hash_values(self,X):
        projections = np.dot(X,self.unit_vectors) #Projections of datapoints on unit vectors
        norm_proj = 1/(1+np.exp(-projections)) #Sigmoid on each value so that they are in the range (0,1)
        hash_values = (norm_proj*self.size).astype(int) #All values are integers in the range [0,size-1]
        return hash_values #Each row contains hash values of that datapoint

    def add(self,x):
        hash_values = self.give_hash_values(x)
        for i in hash_values:
            self.bit_array[i] = 1
        
    def bulk_add(self, X):
        hash_values = self.give_hash_values(X)
        for i in hash_values:
            for j in i:
                self.bit_array[j] = 1
            
    def lookup(self, x):
        hash_values = self.give_hash_values(x)
        for i in hash_values:
            if(self.bit_array[i]==0):
                return False
        return True

    
#(Input:bloom filter,normalized positive data,normalized negative data; Output:(-1) for false negative, otherwise fpr)
def fpr(bf,x_pos,x_neg):
    fp = 0
    tn = 0
    pos_hash_values = bf.give_hash_values(x_pos)
    neg_hash_values = bf.give_hash_values(x_neg)
    for i in pos_hash_values:
        for j in i:
            if(bf.bit_array[j]==0):
                return -1
    for i in neg_hash_values:
        flag = 0
        for j in i:
            if(bf.bit_array[j]==0):
                tn += 1 
                flag = 1           
                break
        if(flag==0):
            fp += 1
    return fp/(fp+tn)

# Breast Cancer Dataset

In [4]:
bcancer = datasets.load_breast_cancer()
X = bcancer.data  
y = bcancer.target

In [5]:
print(X.shape)
print(y.shape)

(569, 30)
(569,)


In [6]:
input_dim = X.shape[1]
k = 4

In [7]:
m = 2000

In [8]:
bf = BloomFilter(m,k,input_dim)

In [9]:
X = StandardScaler().fit_transform(X)

In [10]:
x_pos = X[y==1]
x_neg = X[y==0]

In [11]:
print(x_pos.shape)

(357, 30)


In [12]:
bf.bulk_add(x_pos)

In [13]:
print(fpr(bf,x_pos,x_neg))  #unit positive vectors

0.09905660377358491


In [14]:
print(fpr(bf,x_pos,x_neg))  #non unit positive vectors

0.13679245283018868


In [13]:
print(fpr(bf,x_pos,x_neg))  #non unit vectors

0.3113207547169811


In [13]:
print(fpr(bf,x_pos,x_neg))  #unit vectors

0.08490566037735849


# Heart Disease Dataset

In [14]:
import pandas as pd

In [15]:
file = pd.read_csv("heart.csv")
x_train = np.array(file.values[:,:-1])
y=file.values[:,-1]


In [16]:
input_dim = x_train.shape[1]
k = 4

In [17]:
m = 1000

In [18]:
bf = BloomFilter(m,k,input_dim)

In [19]:
X = StandardScaler().fit_transform(x_train)

In [20]:
x_pos = X[y==1]
x_neg = X[y==0]

In [21]:
print(x_pos.shape)

(165, 13)


In [22]:
bf.bulk_add(x_pos)

In [23]:
print(fpr(bf,x_pos,x_neg)) #unit positive vectors

0.08695652173913043


In [24]:
print(fpr(bf,x_pos,x_neg))  #non unit positive vectors

0.15217391304347827


In [23]:
print(fpr(bf,x_pos,x_neg))  #non unit vectors

0.21739130434782608


In [23]:
print(fpr(bf,x_pos,x_neg))  #unit vectors

0.10869565217391304


# Wine Dataset

In [24]:
from sklearn.datasets import load_wine

In [25]:
x , y = load_wine(return_X_y=True)

In [26]:
input_dim = x.shape[1]
k = 4

In [27]:
m=300

In [28]:
y[y==1] = 0
y[y==2] = 1

In [29]:
x_train = np.array(x)
y_train = np.array(y.reshape(-1,1))

In [30]:
bf = BloomFilter(m,k,input_dim)

In [31]:
X = StandardScaler().fit_transform(x_train)

In [32]:
x_pos = X[y==1]
x_neg = X[y==0]

In [33]:
print(x_pos.shape)

(48, 13)


In [34]:
bf.bulk_add(x_pos)

In [35]:
print(fpr(bf,x_pos,x_neg))  #unit positive vectors

0.06923076923076923


In [36]:
print(fpr(bf,x_pos,x_neg))  #non unit positive vectors

0.06923076923076923


In [35]:
print(fpr(bf,x_pos,x_neg))  #non unit vectors

0.05384615384615385


In [35]:
print(fpr(bf,x_pos,x_neg))  #unit vectors

0.09230769230769231


# Iris Model

In [36]:
iris = datasets.load_iris()
X = iris.data  
y = iris.target

In [37]:
x_pos=iris.data[0:100]
x_neg=iris.data[100:150]
x_train = np.concatenate((x_pos, x_neg)).reshape(-1, 4)
y_train = np.concatenate((np.ones(100), np.zeros(50))).reshape(-1, 1).astype(int)

In [38]:
x_train = StandardScaler().fit_transform(x_train)

In [39]:
input_dim = x_train.shape[1]
k = 3

In [40]:
m = 200

In [41]:
bf = BloomFilter(m,k,input_dim)

In [42]:
X = StandardScaler().fit_transform(x_train)

In [43]:
x_pos = X[y==1]
x_neg = X[y==0]

In [44]:
print(x_pos.shape)

(50, 4)


In [45]:
bf.bulk_add(x_pos)

In [46]:
print(fpr(bf,x_pos,x_neg))  #unit positive vectors

0.0


In [47]:
print(fpr(bf,x_pos,x_neg))  #non unit positive vectors

0.02


In [46]:
print(fpr(bf,x_pos,x_neg))  #non unit vectors

0.0


In [46]:
print(fpr(bf,x_pos,x_neg))  #unit vectors

0.02


# 2D Gaussian Data

In [47]:
n=25000
n=4*(n//4)

In [48]:
mu11=5
mu12=12
mu1=[mu11,mu12]
sig11=50
sig12=70
cov1=[[sig11,0],[0,sig12]]
x1 = np.random.multivariate_normal(mu1,cov1,n//4)

mu21=60
mu22=100
mu2=[mu21,mu22]
sig21=30
sig22=15
cov2=[[sig21,0],[0,sig22]]
x2 = np.random.multivariate_normal(mu2,cov2,n//4)

mu31=-30
mu32=50
mu3=[mu31,mu32]
sig31=50
sig32=45
cov3=[[sig31,0],[0,sig32]]
x3 = np.random.multivariate_normal(mu3,cov3,n//4)

mu41=70
mu42=-50
mu4=[mu41,mu42]
sig41=45
sig42=75
cov4=[[sig41,0],[0,sig42]]
x4 = np.random.multivariate_normal(mu4,cov4,n//4)

x_train = np.concatenate((x1,x2,x3,x4))

y_train = np.concatenate((np.zeros(int(n//4)), np.zeros(n//4),np.ones(int(n//4)),np.ones(n//4))).reshape(-1, 1).astype(int)



In [49]:
y = y_train.reshape(-1)

In [50]:
input_dim = x_train.shape[1]
k = 6

In [51]:
m = 100000

In [52]:
X = StandardScaler().fit_transform(x_train)

In [53]:
bf = BloomFilter(m,k,input_dim)

In [54]:
x_pos = X[y==1]
x_neg = X[y==0]

In [55]:
print(x_pos.shape)

(12500, 2)


In [56]:
bf.bulk_add(x_pos)

In [57]:
print(fpr(bf,x_pos,x_neg))  #unit positive vectors

0.0284


In [58]:
print(fpr(bf,x_pos,x_neg))   #non unit positive vectors

0.00104


In [57]:
print(fpr(bf,x_pos,x_neg))  #non unit vectors

0.00192


In [57]:
print(fpr(bf,x_pos,x_neg))  #unit vectors

0.00136
