In [1]:
from bitarray import bitarray
import random
import numpy as np
import math
import timeit

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [4]:
import pandas as pd
import os

In [5]:
import sklearn.datasets as skds
from sklearn.datasets import fetch_openml

In [6]:
# Load heart.csv (expected in the notebook's working directory) into a DataFrame.
file = pd.read_csv("heart.csv")
# Not the last expression of the cell, so this preview is computed but not displayed.
file.head()
# Last expression of the cell: displays the table's underlying NumPy array.
file.values

array([[63.,  1.,  3., ...,  0.,  1.,  1.],
       [37.,  1.,  2., ...,  0.,  2.,  1.],
       [41.,  0.,  1., ...,  0.,  2.,  1.],
       ...,
       [68.,  1.,  0., ...,  2.,  3.,  0.],
       [57.,  1.,  0., ...,  1.,  3.,  0.],
       [57.,  0.,  1., ...,  1.,  2.,  0.]])

In [7]:
# Feature matrix: every column except the last, standardised column-wise.
X = StandardScaler().fit_transform(file.values[:, :-1])
# Label vector: the last column, as an integer column of shape (n_rows, 1).
y = file.values[:, -1].astype(int).reshape(-1, 1)

In [8]:
# Split the standardised rows by their label instead of by hard-coded row offsets,
# so the split no longer assumes the CSV is sorted with all positives first.
# Boolean-mask indexing returns copies, so the shuffle below leaves X untouched
# (a plain slice of X is a view, and shuffling it would reorder X in place).
x_neg = X[(y == 0).reshape(-1)]
x_pos = X[(y == 1).reshape(-1)]
# Masking the 2-D label column flattens it: these are 1-D arrays of ones / zeros.
y_pos = y[y == 1]
y_neg = y[y == 0]
# Randomise the order of the negatives before later cells cut them into halves.
np.random.shuffle(x_neg)
print(x_neg.shape)
print(x_pos.shape)

(138, 13)
(165, 13)


In [9]:
# Training set: the first 69 (shuffled) negatives followed by every positive.
indices = np.arange(69)  # identity selection over those 69 rows
x_train = np.concatenate((x_neg[:69][indices], x_pos)).reshape(-1, 13)
# Labels in the same order, as a column vector.
y_train = np.concatenate((y_neg[:69], y_pos)).reshape(-1, 1)

In [10]:
# Held-out negatives: rows 69..137 of the shuffled x_neg, i.e. the rows after the
# 69 that were placed in x_train.
x_test = x_neg[69:138].reshape(-1,13)
# Labels taken from y_neg at the same positions, as a column vector.
y_test = y_neg[69:138].reshape(-1,1)

# Standard Bloom Filter

In [11]:
from bitarray import bitarray
import mmh3
import random
import numpy as np
import math

In [12]:
class BloomFilter:
    """Bloom filter probing a `size`-bit array with `hash_count` seeded mmh3 hashes.

    Keys are hashed through their `str()` form; seeds run from 1 to `hash_count`.
    """

    def __init__(self, size, hash_count):
        bits = bitarray(size)
        bits.setall(0)  # start with every bit cleared
        self.bit_array = bits
        self.size = size
        self.hash_count = hash_count

    def _positions(self, num):
        """Yield, seed by seed, the bit index that `num` maps to."""
        for seed in range(1, self.hash_count + 1):
            yield mmh3.hash(str(num), seed) % self.size

    def add(self, num):
        """Set every bit that `num` hashes to."""
        for position in self._positions(num):
            self.bit_array[position] = 1

    def lookup(self, num):
        """Return False as soon as one probed bit is 0, True if all are set."""
        return all(self.bit_array[position] != 0 for position in self._positions(num))

# Projection model

In [13]:
class Projection_BloomFilter:
    """Bloom filter whose hash functions are random projections.

    Each of the `hash_count` hashes is a random unit vector in `dim` dimensions.
    A datapoint's hash value is the sigmoid of its projection on that vector,
    scaled and truncated to a bit index in [0, size-1].
    """

    def __init__(self, size, hash_count, dim):
        # One Gaussian direction per row; rows are not unit length yet.
        directions = np.random.normal(0, 1, size=(hash_count, dim))
        # Flip the first coordinate of every row where it is negative.
        negative_first = directions[:, 0] < 0
        directions[negative_first, 0] *= -1
        # Scale each row to unit length and keep the directions as columns.
        row_norms = np.sqrt(np.sum(directions * directions, axis=1, keepdims=True))
        self.unit_vectors = (directions / row_norms).T
        self.size = size
        self.hash_count = hash_count
        self.bit_array = bitarray(size)
        self.bit_array.setall(0)

    def give_hash_values(self, X):
        """Return integer bit indices; for 2-D X, one row per datapoint and one column per hash."""
        # Sigmoid maps each projection into (0, 1) before scaling to [0, size-1].
        squashed = 1 / (1 + np.exp(-np.dot(X, self.unit_vectors)))
        return (squashed * (self.size - 1)).astype(int)

    def add(self, x):
        """Insert a single datapoint (1-D vector)."""
        for index in self.give_hash_values(x):
            self.bit_array[index] = 1

    def bulk_add(self, X):
        """Insert every row of a 2-D array of datapoints."""
        for row in self.give_hash_values(X):
            for index in row:
                self.bit_array[index] = 1

    def lookup(self, x):
        """Return True only if every bit probed for the single datapoint `x` is set."""
        return all(self.bit_array[index] != 0 for index in self.give_hash_values(x))

    
#(Input:bloom filter,normalized positive data,normalized negative data; Output:(-1) for false negative, otherwise fpr)
def find_fpr(bf,x_pos,x_neg):
    """Measure the false-positive rate of `bf`, first checking it has no false negatives.

    Args:
        bf: filter exposing `give_hash_values(X)` and an indexable `bit_array`.
        x_pos: datapoints that are expected to be in the filter, one per row.
        x_neg: datapoints that are expected to be absent, one per row.

    Returns:
        -1 if any bit probed for a row of `x_pos` is 0 (a false negative).
        Otherwise the fraction of rows of `x_neg` whose probed bits are all set,
        or 0.0 when `x_neg` has no rows (previously a ZeroDivisionError).
    """
    bits = bf.bit_array
    for probes in bf.give_hash_values(x_pos):
        if any(bits[j] == 0 for j in probes):
            return -1
    neg_hash_values = bf.give_hash_values(x_neg)
    total = len(neg_hash_values)
    if total == 0:
        # No negatives to test against: report no false positives instead of dividing by zero.
        return 0.0
    # A negative row is a false positive when none of its probed bits is 0.
    fp = sum(1 for probes in neg_hash_values if all(bits[j] != 0 for j in probes))
    return fp/total

def find_fpr2(bf,x_neg):
    """Return the fraction of rows of `x_neg` that `bf` wrongly reports as present.

    Args:
        bf: filter exposing `give_hash_values(X)` and an indexable `bit_array`.
        x_neg: datapoints that are expected to be absent, one per row.

    Returns:
        false positives / number of rows, or 0.0 when `x_neg` has no rows
        (previously a ZeroDivisionError).
    """
    bits = bf.bit_array
    neg_hash_values = bf.give_hash_values(x_neg)
    total = len(neg_hash_values)
    if total == 0:
        # Nothing to test against: report no false positives instead of dividing by zero.
        return 0.0
    # A row counts as a false positive when every bit it probes is set.
    fp = sum(1 for probes in neg_hash_values if all(bits[j] != 0 for j in probes))
    return fp/total

#(Input:bloom filter size, total random numbers, ratio of number inserted; Output:efficient number of hash functions)
def eff_k(m,n,r):
    """Return the hash count (m / inserted) * ln 2, truncated to an int and never 0.

    `inserted` is int(n*r), the number of keys actually placed in the filter.
    """
    inserted = int(n*r)
    k = int(m/inserted*math.log(2))
    # A truncated value of 0 would mean "no hash functions"; use one instead.
    return k if k != 0 else 1

In [14]:
def get_data_points_Projection(x_train,x_test,y_train,init_size,diff,num,epochs,fpr_values,size_values,times,standard_times):
    """Sweep filter sizes and record the test FPR and build times at each size.

    For every size m = init_size + i*diff (i = 0..num-1) this builds a standard
    BloomFilter (timing only) and `epochs` random Projection_BloomFilters, keeps
    the projection filter with the lowest FPR on the training negatives, and
    measures that filter's FPR on `x_test`.

    Args:
        x_train: 2-D array of training datapoints (positives and negatives).
        x_test: datapoints used for the reported FPR (passed to find_fpr2).
        y_train: labels for x_train; 1 marks rows to insert, 0 marks negatives.
        init_size: smallest filter size in bits.
        diff: size increment between consecutive filters.
        num: number of sizes to evaluate.
        epochs: number of random projection filters tried per size.
        fpr_values, size_values, times, standard_times: lists that receive, per
            size, the test FPR, the size, the projection-filter search time and
            the standard-filter build time. They are appended to in place.

    Returns:
        None; results are delivered through the four output lists.
    """
    input_dim = x_train.shape[1]
    labels = np.asarray(y_train).reshape(-1)
    # Plain int: builtin sum() over an (N, 1) label column yields a 1-element
    # array, and converting such an array with int() is deprecated in NumPy.
    n = int(np.sum(labels))
    x_pos = x_train[labels == 1]
    print(x_pos.shape)
    x_neg = x_train[labels == 0]
    for i in range(0,num):
        m = init_size + i*diff
        k = eff_k(m,n,1)

        # Time the construction of a standard Bloom filter over the positives.
        start1 = timeit.default_timer()
        stanbf = BloomFilter(m,k)
        for x_pos_i in x_pos:
            stanbf.add(sum(x_pos_i))
        stop1 = timeit.default_timer()

        # Time the random search for the best projection filter at this size.
        start = timeit.default_timer()
        fpr = 1
        tempbf = Projection_BloomFilter(m,k,input_dim)
        for j in range(0,epochs):
            bf = Projection_BloomFilter(m,k,input_dim)
            bf.bulk_add(x_pos)
            temp = find_fpr(bf,x_pos,x_neg)
            # find_fpr signals a false negative with -1; that sentinel must not
            # be mistaken for the lowest FPR, so only accept real rates.
            if 0 <= temp <= fpr:
                fpr = temp
                tempbf = bf
        stop = timeit.default_timer()

        fpr_test=find_fpr2(tempbf,x_test)
        print("Size: %d Bits - FPR: %f - Time Taken to create Standard: %f - Time Taken to create Projecion: %f \n\n\n" % (m,fpr_test,stop1-start1,stop-start))
        fpr_values.append(fpr_test)
        size_values.append(m)
        times.append(stop-start)
        standard_times.append(stop1-start1)

In [None]:
num_of_iterations = 20 #Increase to make graph smoother

# Result lists filled in place by get_data_points_Projection; every iteration
# appends one entry per filter size to each of them.
fpr_values, size_values, times, standard_times = [], [], [], []

for i in range(num_of_iterations):
    print("~~~~~~~~ Iteration %d ~~~~~~~~ \n" %(i+1))
    # Sizes 10, 20, ... (300 of them), trying 100 random projection filters per size.
    get_data_points_Projection(
        x_train, x_test, y_train,
        init_size=10, diff=10, num=300, epochs=100,
        fpr_values=fpr_values, size_values=size_values,
        times=times, standard_times=standard_times,
    )

~~~~~~~~ Iteration 1 ~~~~~~~~ 

(165, 13)
Size: 10 Bits - FPR: 0.840580 - Time Taken to create Standard: 0.001326 - Time Taken to create Projecion: 0.040757 



Size: 20 Bits - FPR: 0.869565 - Time Taken to create Standard: 0.000759 - Time Taken to create Projecion: 0.040350 



Size: 30 Bits - FPR: 0.623188 - Time Taken to create Standard: 0.000747 - Time Taken to create Projecion: 0.040009 



Size: 40 Bits - FPR: 0.739130 - Time Taken to create Standard: 0.000817 - Time Taken to create Projecion: 0.040152 



Size: 50 Bits - FPR: 0.840580 - Time Taken to create Standard: 0.000778 - Time Taken to create Projecion: 0.041856 



Size: 60 Bits - FPR: 0.623188 - Time Taken to create Standard: 0.000765 - Time Taken to create Projecion: 0.041833 



Size: 70 Bits - FPR: 0.652174 - Time Taken to create Standard: 0.000743 - Time Taken to create Projecion: 0.038884 



Size: 80 Bits - FPR: 0.681159 - Time Taken to create Standard: 0.000734 - Time Taken to create Projecion: 0.037846 



Size: 

Size: 710 Bits - FPR: 0.144928 - Time Taken to create Standard: 0.000919 - Time Taken to create Projecion: 0.047463 



Size: 720 Bits - FPR: 0.115942 - Time Taken to create Standard: 0.001076 - Time Taken to create Projecion: 0.052504 



Size: 730 Bits - FPR: 0.144928 - Time Taken to create Standard: 0.001064 - Time Taken to create Projecion: 0.050286 



Size: 740 Bits - FPR: 0.159420 - Time Taken to create Standard: 0.001082 - Time Taken to create Projecion: 0.049403 



Size: 750 Bits - FPR: 0.086957 - Time Taken to create Standard: 0.001081 - Time Taken to create Projecion: 0.048204 



Size: 760 Bits - FPR: 0.173913 - Time Taken to create Standard: 0.001138 - Time Taken to create Projecion: 0.056026 



Size: 770 Bits - FPR: 0.159420 - Time Taken to create Standard: 0.001140 - Time Taken to create Projecion: 0.047609 



Size: 780 Bits - FPR: 0.130435 - Time Taken to create Standard: 0.001085 - Time Taken to create Projecion: 0.052899 



Size: 790 Bits - FPR: 0.086957 - Time Ta


Size: 1400 Bits - FPR: 0.028986 - Time Taken to create Standard: 0.001470 - Time Taken to create Projecion: 0.067232 



Size: 1410 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001390 - Time Taken to create Projecion: 0.074979 



Size: 1420 Bits - FPR: 0.043478 - Time Taken to create Standard: 0.001635 - Time Taken to create Projecion: 0.064205 



Size: 1430 Bits - FPR: 0.028986 - Time Taken to create Standard: 0.001582 - Time Taken to create Projecion: 0.066458 



Size: 1440 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001528 - Time Taken to create Projecion: 0.071310 



Size: 1450 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001623 - Time Taken to create Projecion: 0.066119 



Size: 1460 Bits - FPR: 0.028986 - Time Taken to create Standard: 0.002060 - Time Taken to create Projecion: 0.066891 



Size: 1470 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001532 - Time Taken to create Projecion: 0.071427 



Size: 1480 Bits - FPR: 0.014493

Size: 2100 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001953 - Time Taken to create Projecion: 0.084278 



Size: 2110 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001847 - Time Taken to create Projecion: 0.074596 



Size: 2120 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001823 - Time Taken to create Projecion: 0.081407 



Size: 2130 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001832 - Time Taken to create Projecion: 0.080857 



Size: 2140 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001998 - Time Taken to create Projecion: 0.082158 



Size: 2150 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.002652 - Time Taken to create Projecion: 0.087229 



Size: 2160 Bits - FPR: 0.043478 - Time Taken to create Standard: 0.002280 - Time Taken to create Projecion: 0.083129 



Size: 2170 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001978 - Time Taken to create Projecion: 0.080169 



Size: 2180 Bits - FPR: 0.000000 

Size: 2780 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002194 - Time Taken to create Projecion: 0.105497 



Size: 2790 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002244 - Time Taken to create Projecion: 0.091027 



Size: 2800 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002575 - Time Taken to create Projecion: 0.081482 



Size: 2810 Bits - FPR: 0.028986 - Time Taken to create Standard: 0.002820 - Time Taken to create Projecion: 0.108315 



Size: 2820 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002252 - Time Taken to create Projecion: 0.094803 



Size: 2830 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002197 - Time Taken to create Projecion: 0.087034 



Size: 2840 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002247 - Time Taken to create Projecion: 0.087592 



Size: 2850 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002262 - Time Taken to create Projecion: 0.085749 



Size: 2860 Bits - FPR: 0.000000 

Size: 490 Bits - FPR: 0.289855 - Time Taken to create Standard: 0.000931 - Time Taken to create Projecion: 0.045419 



Size: 500 Bits - FPR: 0.304348 - Time Taken to create Standard: 0.000906 - Time Taken to create Projecion: 0.045098 



Size: 510 Bits - FPR: 0.231884 - Time Taken to create Standard: 0.000931 - Time Taken to create Projecion: 0.042875 



Size: 520 Bits - FPR: 0.086957 - Time Taken to create Standard: 0.000979 - Time Taken to create Projecion: 0.042758 



Size: 530 Bits - FPR: 0.275362 - Time Taken to create Standard: 0.000913 - Time Taken to create Projecion: 0.043360 



Size: 540 Bits - FPR: 0.101449 - Time Taken to create Standard: 0.000916 - Time Taken to create Projecion: 0.046206 



Size: 550 Bits - FPR: 0.260870 - Time Taken to create Standard: 0.000969 - Time Taken to create Projecion: 0.046404 



Size: 560 Bits - FPR: 0.362319 - Time Taken to create Standard: 0.000925 - Time Taken to create Projecion: 0.046146 



Size: 570 Bits - FPR: 0.072464 - Time Ta

Size: 1210 Bits - FPR: 0.101449 - Time Taken to create Standard: 0.001343 - Time Taken to create Projecion: 0.063105 



Size: 1220 Bits - FPR: 0.028986 - Time Taken to create Standard: 0.001381 - Time Taken to create Projecion: 0.058382 



Size: 1230 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001377 - Time Taken to create Projecion: 0.058078 



Size: 1240 Bits - FPR: 0.043478 - Time Taken to create Standard: 0.001385 - Time Taken to create Projecion: 0.065155 



Size: 1250 Bits - FPR: 0.057971 - Time Taken to create Standard: 0.001870 - Time Taken to create Projecion: 0.062024 



Size: 1260 Bits - FPR: 0.043478 - Time Taken to create Standard: 0.001373 - Time Taken to create Projecion: 0.058003 



Size: 1270 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001384 - Time Taken to create Projecion: 0.060058 



Size: 1280 Bits - FPR: 0.072464 - Time Taken to create Standard: 0.001349 - Time Taken to create Projecion: 0.059784 



Size: 1290 Bits - FPR: 0.144928 

Size: 1910 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001832 - Time Taken to create Projecion: 0.076985 



Size: 1920 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001931 - Time Taken to create Projecion: 0.093925 



Size: 1930 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002069 - Time Taken to create Projecion: 0.077300 



Size: 1940 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001890 - Time Taken to create Projecion: 0.072921 



Size: 1950 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001831 - Time Taken to create Projecion: 0.081690 



Size: 1960 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001833 - Time Taken to create Projecion: 0.074120 



Size: 1970 Bits - FPR: 0.014493 - Time Taken to create Standard: 0.001817 - Time Taken to create Projecion: 0.076197 



Size: 1980 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.001816 - Time Taken to create Projecion: 0.077189 



Size: 1990 Bits - FPR: 0.000000 

Size: 2600 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002056 - Time Taken to create Projecion: 0.085295 



Size: 2610 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002122 - Time Taken to create Projecion: 0.083587 



Size: 2620 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002270 - Time Taken to create Projecion: 0.107015 



Size: 2630 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002244 - Time Taken to create Projecion: 0.086997 



Size: 2640 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002245 - Time Taken to create Projecion: 0.091978 



Size: 2650 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002207 - Time Taken to create Projecion: 0.086441 



Size: 2660 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002766 - Time Taken to create Projecion: 0.112607 



Size: 2670 Bits - FPR: 0.000000 - Time Taken to create Standard: 0.002233 - Time Taken to create Projecion: 0.084673 



Size: 2680 Bits - FPR: 0.000000 

Size: 350 Bits - FPR: 0.362319 - Time Taken to create Standard: 0.000738 - Time Taken to create Projecion: 0.037729 



Size: 360 Bits - FPR: 0.362319 - Time Taken to create Standard: 0.000755 - Time Taken to create Projecion: 0.039038 



Size: 370 Bits - FPR: 0.217391 - Time Taken to create Standard: 0.000769 - Time Taken to create Projecion: 0.040910 



Size: 380 Bits - FPR: 0.260870 - Time Taken to create Standard: 0.000746 - Time Taken to create Projecion: 0.037056 



Size: 390 Bits - FPR: 0.289855 - Time Taken to create Standard: 0.000724 - Time Taken to create Projecion: 0.037208 



Size: 400 Bits - FPR: 0.173913 - Time Taken to create Standard: 0.000789 - Time Taken to create Projecion: 0.037764 



Size: 410 Bits - FPR: 0.318841 - Time Taken to create Standard: 0.000742 - Time Taken to create Projecion: 0.037508 



Size: 420 Bits - FPR: 0.231884 - Time Taken to create Standard: 0.000768 - Time Taken to create Projecion: 0.041731 



Size: 430 Bits - FPR: 0.246377 - Time Ta

In [None]:
total_fprs = len(fpr_values)
# fpr_values holds one run of sizes after another, so the entries that belong to
# the same filter size sit `stride` positions apart.
stride = total_fprs//num_of_iterations
average_fprs = []
for i in range(stride):
    temp = 0
    for value in fpr_values[i:stride*num_of_iterations:stride]:
        temp += value
    temp /= num_of_iterations
    average_fprs.append(temp)
print(len(average_fprs))

In [None]:
total_times = len(times)
# Entries for the same filter size sit `stride` positions apart. The stride comes
# from len(times) itself, so this cell does not depend on `total_fprs` having
# been defined by the FPR-averaging cell.
stride = total_times//num_of_iterations
average_times = []
for i in range(stride):
    temp = 0
    for value in times[i:stride*num_of_iterations:stride]:
        temp += value
    temp /= num_of_iterations
    average_times.append(temp)
print(len(average_times))

In [None]:
total_standard_times = len(standard_times)
# Entries for the same filter size sit `stride` positions apart. The stride comes
# from len(standard_times) itself, so this cell does not depend on `total_fprs`
# having been defined by the FPR-averaging cell.
stride = total_standard_times//num_of_iterations
average_standard_times = []
for i in range(stride):
    temp = 0
    for value in standard_times[i:stride*num_of_iterations:stride]:
        temp += value
    temp /= num_of_iterations
    average_standard_times.append(temp)
print(len(average_standard_times))

# Normal Bloom Filter Values

In [None]:
def ideal_fpr(m,n):
    """Return (1 - (1 - 1/m)**(n*k))**k with k = eff_k(m, n, 1).

    m is the filter size in bits and n the number of inserted keys.
    """
    k = eff_k(m,n,1)
    # Term for one bit staying 0 after n*k independent probes.
    bit_still_zero = (1-(1/m))**(n*k)
    return (1-bit_still_zero)**k

In [None]:
# Number of inserted (label 1) keys as a plain int. Builtin sum() over the (N, 1)
# label column would give a 1-element array, and turning such arrays into
# scalars with int()/float() is deprecated in NumPy.
n = int(np.sum(y_train))
# Theoretical FPR of a standard Bloom filter at every recorded size.
y_ideal = [float(ideal_fpr(size, n)) for size in size_values]
print(len(y_ideal))
print(n)

# FPR Comparison between Projection Model and Normal Bloom Filter

In [None]:
# Global seaborn styling: 11.7 x 8.27 inch figure (A4 landscape) on a white grid.
sns.set(rc={'figure.figsize':(11.7,8.27)},style="whitegrid")
# Measured test FPR of the projection filter, one point per (iteration, size).
# NOTE(review): size_values repeats every size once per iteration; presumably
# lineplot aggregates the repeats into a mean line with a band - confirm in the seaborn docs.
sns.lineplot(x=size_values, y=fpr_values, err_style="band",label = "Projection",linewidth = 1.5,color="g")
# Theoretical FPR of a standard Bloom filter at the same sizes.
sns.lineplot(x=size_values, y=y_ideal, err_style="band",label = "Ideal",linewidth = 1.5,color="r")
plt.title('FPRs')
plt.xlabel("Size", fontsize = 15)
plt.ylabel("FPR", fontsize = 15)
#plt.figure(figsize = (5000,5000))
plt.show()

In [None]:
# Repeat the per-size averages once per iteration so the list lines up with
# fpr_values. Cutting back to one period first keeps the cell safe to re-run:
# without it every extra run would multiply the length again.
period = len(fpr_values)//num_of_iterations
average_fprs = num_of_iterations*average_fprs[:period]
print(len(average_fprs))

In [None]:
# Repeat the per-size averages once per iteration so the list lines up with
# times. Cutting back to one period first keeps the cell safe to re-run:
# without it every extra run would multiply the length again.
period = len(times)//num_of_iterations
average_times = num_of_iterations*average_times[:period]
print(len(average_times))

In [None]:
# Print the whole average_fprs list for inspection.
print(average_fprs)

In [None]:
# zip() stops at its shortest argument. The averaged lists may hold only one
# entry per filter size (average_standard_times is never repeated per iteration),
# which would silently cut the table down to the first iteration. np.resize
# repeats each averaged list cyclically up to the full row count; lists that
# already have that length come through unchanged.
n_rows = len(size_values)
saved_data = list(zip(
    size_values,
    y_ideal,
    standard_times,
    np.resize(average_standard_times, n_rows),
    fpr_values,
    np.resize(average_fprs, n_rows),
    times,
    np.resize(average_times, n_rows),
))
df = pd.DataFrame(saved_data,columns = ['Size of filter(bits)','Ideal FPR','Standard Filter Total Creation Time(sec)', 'Average Standard Filter Total Creation Time(sec)', 'Projection FPR for iteration', 'Average Projection FPR', 'Projection Filter Total Creation Time(sec)', 'Average Projection Filter Total Creation Time(sec)'])
df.head()

In [None]:
# Keep writing to the original folder when it exists; on any other machine fall
# back to the current working directory instead of failing on a missing path.
output_dir = '/Users/aditijain/Desktop/NewFolderWithItems/SURA'
if not os.path.isdir(output_dir):
    output_dir = os.getcwd()
df.to_csv(os.path.join(output_dir, 'Heart_rerun_epochs_100.csv'))