In [30]:
import numpy
import mmh3

class BloomFilter(object):
    """Probabilistic set-membership structure backed by a numpy boolean array.

    ``check`` never returns False for an item that was passed to ``add``; it
    may return True for an item that was never added (a false positive).

    Attributes:
        size: number of slots in the filter.
        nhash: number of hash functions (mmh3 seeds ``0 .. nhash - 1``).
        bloomf: boolean numpy array of length ``size``; True marks a set slot.
    """

    def __init__(self, nitems, false_positive_prob):
        """Size the filter for an expected item count and target error rate.

        Args:
            nitems: expected number of distinct items to be added (> 0).
            false_positive_prob: desired false-positive probability,
                strictly between 0 and 1.

        Raises:
            ValueError: if either argument is outside its valid range.
        """
        if nitems <= 0:
            raise ValueError("nitems must be a positive number")
        if not 0 < false_positive_prob < 1:
            raise ValueError("false_positive_prob must be strictly between 0 and 1")

        ln2 = numpy.log(2)

        # m = -n * ln(p) / (ln 2)^2, truncated as before so existing sizes do
        # not change; clamped to 1 so the modulo in add/check is always valid.
        self.size = max(1, int(-(nitems * numpy.log(false_positive_prob)) / (ln2 ** 2)))

        # k = (m / n) * ln 2; clamped to 1 because with zero hash functions
        # check() would report every item as present.
        self.nhash = max(1, int((self.size / nitems) * ln2))

        # One boolean per slot (1 byte) instead of the default float64 (8 bytes).
        self.bloomf = numpy.zeros(self.size, dtype=bool)

    def add(self, item):
        """Mark ``item`` as present by setting one slot per hash function.

        ``item`` is passed straight to ``mmh3.hash`` (presumably a str or
        bytes value -- confirm against the mmh3 version in use).
        """
        for seed in range(self.nhash):
            # Python's % with a positive divisor is non-negative, so the
            # signed hash value always maps to a valid index.
            slot = mmh3.hash(item, seed) % self.size
            self.bloomf[slot] = True

    def check(self, item):
        """Return False if ``item`` was definitely never added, else True.

        A True result means "possibly present": every slot the item hashes to
        is set, which can also happen through collisions with other items.
        """
        for seed in range(self.nhash):
            slot = mmh3.hash(item, seed) % self.size
            if not self.bloomf[slot]:
                return False

        return True

In [31]:
# Sizing parameters shared by every filter below.
n = 1003519  # expected item count per filter -- TODO: Change this setting
p = 0.05  # false positive probability

# One independent filter per searchable field, all built with the same (n, p).
titles_bf, authors_bf, subjects_bf = (BloomFilter(n, p) for _ in range(3))

# Show how many hash functions the chosen (n, p) pair yields.
print(titles_bf.nhash)

4


In [32]:
import pandas as pd

# Number of CSV rows parsed per chunk; bounds peak memory while loading.
nlines_read = 1000

# Stream the file once in fixed-size chunks. `chunksize` makes read_csv return
# an iterator that stops by itself at end of file, so no exception handling is
# needed and the file is not re-read from the start for every chunk.
book_chunks = pd.read_csv(
    "dataset/new.csv",
    skiprows=1,  # skip the first line (presumably the header); names are given below
    names=["Title", "Author", "Subjects"],
    chunksize=nlines_read,
)

for chunk in book_chunks:
    for title, author, subjects in zip(chunk["Title"], chunk["Author"], chunk["Subjects"]):
        # Missing cells arrive as NaN (a float), so the isinstance checks
        # filter them out together with empty strings.
        if title and isinstance(title, str):
            # Keep only the part before " /".
            titles_bf.add(title.split(" /")[0])

        if author and isinstance(author, str):
            name_parts = author.split(", ")
            if name_parts[-1].replace("-", "").isnumeric():
                # Drop a trailing all-digits/dashes part (e.g. a year range).
                # Re-join with the same ", " separator that undated authors
                # keep, so both forms of one author map to the same key.
                author_name = ", ".join(name_parts[:-1])
            else:
                author_name = author
            if author_name:  # a field holding only digits/dashes leaves nothing to index
                authors_bf.add(author_name)

        if subjects and isinstance(subjects, str):
            for subject in subjects.split(", "):
                subjects_bf.add(subject)

In [None]:
def print_menu():
    """Print the library search menu (banner, four options, closing rule)."""
    banner_rule = "-" * 30
    menu_lines = (
        f"{banner_rule} WELCOME TO OUR LIBRARY {banner_rule}",
        "1. Search by Title",
        "2. Search by Author",
        "3. Search by Subject",
        "0. Exit",
        "-" * 67,
    )
    for line in menu_lines:
        print(line)

def main():
    """Run the interactive search loop until the user chooses 0.

    Each round prints the menu, reads a numeric choice, and for choices 1-3
    asks for a search term and looks it up in the matching Bloom filter
    (titles, authors or subjects). Bloom filters can report false positives,
    so a "found" message means the term is possibly present.
    """
    not_found = "We're sorry, but we didn't find a result."

    # choice -> (prompt, filter to query, message shown on a hit)
    searches = {
        1: ("Enter the title: ", titles_bf,
            "The searched book is in our library!"),
        2: ("Enter the author: ", authors_bf,
            "We have books of the searched author in our library!"),
        3: ("Enter the subject: ", subjects_bf,
            "We have books about the searched subject in our library!"),
    }

    while True:
        print_menu()
        try:
            choice = int(input("Enter your choice [1-3] or 0 to quit: "))
        except ValueError:
            # Non-numeric input is reported as a wrong option below. Only
            # ValueError is caught so Ctrl-C / end-of-input still stop the loop.
            choice = None

        if choice == 0:
            print("See you soon!")
            break

        if choice not in searches:
            # Anything other than 0-3 (including non-numeric input).
            print("Wrong option selected.")
            continue

        prompt, bloom_filter, found_message = searches[choice]
        query = input(prompt)
        if bloom_filter.check(query):
            print(found_message)
        else:
            print(not_found)

# Launch the interactive menu; main() keeps prompting until the user enters 0.
main()