In [86]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from bitarray import bitarray

In [87]:
from sklearn import datasets
from sklearn.datasets import load_breast_cancer

In [88]:
class BloomFilter:
    def __init__(self, size, hash_count, dim):
        vectors = np.random.normal(0,1,size=(hash_count,dim)) #random vectors(each row) from normal distribution, not unit vectors
        i=0
        while(i<hash_count):
            if(vectors[i][0]<0):
               # print("initial v", vectors[i])
                vectors[i]=np.random.normal(0,1,size=(1,dim))
                #print("changed v", vectors[i])
            else: i=i+1
       # print(vectors)        
        self.unit_vectors = np.transpose(vectors/np.sqrt(np.transpose([np.sum(np.multiply(vectors,vectors),1)]))) #Matrix where each column is a unit vector, used as hash
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def give_hash_values(self,X):
        projections = np.dot(X,self.unit_vectors) #Projections of datapoints on unit vectors
        norm_proj = 1/(1+np.exp(-projections)) #Sigmoid on each value so that they are in the range (0,1)
        hash_values = (norm_proj*self.size).astype(int) #All values are integers in the range [0,size-1]
        return hash_values #Each row contains hash values of that datapoint

    def add(self,x):
        hash_values = self.give_hash_values(x)
        for i in hash_values:
            self.bit_array[i] = 1
        
    def bulk_add(self, X):
        hash_values = self.give_hash_values(X)
        for i in hash_values:
            for j in i:
                self.bit_array[j] = 1
            
    def lookup(self, x):
        hash_values = self.give_hash_values(x)
        for i in hash_values:
            if(self.bit_array[i]==0):
                return False
        return True

    
#(Input:bloom filter,normalized positive data,normalized negative data; Output:(-1) for false negative, otherwise fpr)
def fpr(bf,x_pos,x_neg):
    fp = 0
    tn = 0
    pos_hash_values = bf.give_hash_values(x_pos)
    neg_hash_values = bf.give_hash_values(x_neg)
    for i in pos_hash_values:
        for j in i:
            if(bf.bit_array[j]==0):
                return -1
    for i in neg_hash_values:
        flag = 0
        for j in i:
            if(bf.bit_array[j]==0):
                tn += 1 
                flag = 1           
                break
        if(flag==0):
            fp += 1
    return fp/(fp+tn)

# Breast Cancer Dataset

In [89]:
bcancer = datasets.load_breast_cancer()
X = bcancer.data  
y = bcancer.target

In [90]:
print(X.shape)
print(y.shape)

(569, 30)
(569,)


In [91]:
input_dim = X.shape[1]
k = 4

In [92]:
m = 2000

In [93]:
bf = BloomFilter(m,k,input_dim)

In [94]:
X = StandardScaler().fit_transform(X)

In [95]:
x_pos = X[y==1]
x_neg = X[y==0]

In [96]:
print(x_pos.shape)

(357, 30)


In [97]:
bf.bulk_add(x_pos)

In [98]:
print(fpr(bf,x_pos,x_neg))

0.09905660377358491


# Heart Disease Dataset

In [99]:
import pandas as pd

In [100]:
file = pd.read_csv("heart.csv")
x_train = np.array(file.values[:,:-1])
y=file.values[:,-1]


In [101]:
input_dim = x_train.shape[1]
k = 4

In [102]:
m = 1000

In [103]:
bf = BloomFilter(m,k,input_dim)

In [104]:
X = StandardScaler().fit_transform(x_train)

In [105]:
x_pos = X[y==1]
x_neg = X[y==0]

In [106]:
print(x_pos.shape)

(165, 13)


In [107]:
bf.bulk_add(x_pos)

In [108]:
print(fpr(bf,x_pos,x_neg))

0.06521739130434782


# Wine Dataset

In [109]:
from sklearn.datasets import load_wine

In [110]:
x , y = load_wine(return_X_y=True)

In [111]:
input_dim = x.shape[1]
k = 4

In [112]:
m=300

In [113]:
y[y==1] = 0
y[y==2] = 1

In [114]:
x_train = np.array(x)
y_train = np.array(y.reshape(-1,1))

In [115]:
bf = BloomFilter(m,k,input_dim)

In [116]:
X = StandardScaler().fit_transform(x_train)

In [117]:
x_pos = X[y==1]
x_neg = X[y==0]

In [118]:
print(x_pos.shape)

(48, 13)


In [119]:
bf.bulk_add(x_pos)

In [120]:
print(fpr(bf,x_pos,x_neg))

0.1


# Iris Model

In [121]:
iris = datasets.load_iris()
X = iris.data  
y = iris.target

In [122]:
x_pos=iris.data[0:100]
x_neg=iris.data[100:150]
x_train = np.concatenate((x_pos, x_neg)).reshape(-1, 4)
y_train = np.concatenate((np.ones(100), np.zeros(50))).reshape(-1, 1).astype(int)

In [123]:
x_train = StandardScaler().fit_transform(x_train)

In [124]:
input_dim = x_train.shape[1]
k = 3

In [125]:
m = 200

In [126]:
bf = BloomFilter(m,k,input_dim)

In [127]:
X = StandardScaler().fit_transform(x_train)

In [128]:
x_pos = X[y==1]
x_neg = X[y==0]

In [129]:
print(x_pos.shape)

(50, 4)


In [130]:
bf.bulk_add(x_pos)

In [131]:
print(fpr(bf,x_pos,x_neg))

0.04


# 2D Gaussian Data

In [132]:
n=25000
n=4*(n//4)

In [133]:
mu11=5
mu12=12
mu1=[mu11,mu12]
sig11=50
sig12=70
cov1=[[sig11,0],[0,sig12]]
x1 = np.random.multivariate_normal(mu1,cov1,n//4)

mu21=60
mu22=100
mu2=[mu21,mu22]
sig21=30
sig22=15
cov2=[[sig21,0],[0,sig22]]
x2 = np.random.multivariate_normal(mu2,cov2,n//4)

mu31=-30
mu32=50
mu3=[mu31,mu32]
sig31=50
sig32=45
cov3=[[sig31,0],[0,sig32]]
x3 = np.random.multivariate_normal(mu3,cov3,n//4)

mu41=70
mu42=-50
mu4=[mu41,mu42]
sig41=45
sig42=75
cov4=[[sig41,0],[0,sig42]]
x4 = np.random.multivariate_normal(mu4,cov4,n//4)

x_train = np.concatenate((x1,x2,x3,x4))

y_train = np.concatenate((np.zeros(int(n//4)), np.zeros(n//4),np.ones(int(n//4)),np.ones(n//4))).reshape(-1, 1).astype(int)



In [134]:
y = y_train.reshape(-1)

In [135]:
input_dim = x_train.shape[1]
k = 6

In [136]:
m = 100000

In [137]:
X = StandardScaler().fit_transform(x_train)

In [138]:
bf = BloomFilter(m,k,input_dim)

In [139]:
x_pos = X[y==1]
x_neg = X[y==0]

In [140]:
print(x_pos.shape)

(12500, 2)


In [141]:
bf.bulk_add(x_pos)

In [142]:
print(fpr(bf,x_pos,x_neg))

0.01288
