In [2]:
import numpy
import mmh3
import pandas as pd

In [3]:
class BloomFilter(object):
    """Fixed-size Bloom filter backed by a NumPy boolean array and MurmurHash3.

    A Bloom filter answers "was this item added?" with no false negatives but
    with a tunable probability of false positives.

    Parameters
    ----------
    nitems : int
        Expected number of distinct items that will be added. Must be > 0.
    false_positive_prob : float
        Target false-positive probability, strictly between 0 and 1.

    Attributes
    ----------
    size : int
        Number of slots in the filter (always at least 1).
    nhash : int
        Number of hash functions (MurmurHash3 seeds 0 .. nhash-1) per item.
    bloomf : numpy.ndarray
        Boolean array of length ``size``; True marks a set slot.

    Raises
    ------
    ValueError
        If ``nitems`` is not positive or ``false_positive_prob`` is outside
        the open interval (0, 1).
    """

    def __init__(self, nitems, false_positive_prob):
        if nitems <= 0:
            raise ValueError("nitems must be a positive number")
        if not 0 < false_positive_prob < 1:
            raise ValueError("false_positive_prob must be strictly between 0 and 1")

        ln2 = numpy.log(2)

        # m = -n * ln(p) / (ln 2)^2, clamped to one slot so that the modulo in
        # _positions can never divide by zero for tiny n / large p.
        self.size = max(1, int(-(nitems * numpy.log(false_positive_prob)) / (ln2 ** 2)))
        # k = (m / n) * ln 2, rounded up by truncating and adding one.
        self.nhash = int((self.size / nitems) * ln2) + 1

        # One boolean per slot; the default float64 dtype would spend 8 bytes
        # on every slot for the same information.
        self.bloomf = numpy.zeros(self.size, dtype=bool)

    def _positions(self, item):
        """Yield the ``nhash`` slot indices of ``item``, one per hash seed."""
        for seed in range(self.nhash):
            # Python's % with a positive modulus is never negative, so the
            # result is a valid index whatever sign the hash value has.
            yield mmh3.hash(item, seed) % self.size

    def add(self, item):
        """Record ``item`` in the filter by setting each of its slots."""
        for position in self._positions(item):
            self.bloomf[position] = True

    def check(self, item):
        """Return False if ``item`` was definitely never added, else True.

        A True result can be a false positive: every slot of ``item`` may have
        been set by other items.
        """
        return all(self.bloomf[position] for position in self._positions(item))

    def __contains__(self, item):
        """Support ``item in bloom_filter`` as a synonym for :meth:`check`."""
        return self.check(item)

In [5]:
# Read the catalogue CSV in pieces of up to 1,000,000 rows (chunksize) instead
# of in one go, then concatenate the pieces into a single DataFrame, `our_set`.
# `chunk` is what read_csv returns when chunksize is given; pd.concat consumes it.
chunk = pd.read_csv("dataset/dset.csv", chunksize= 1000000)
our_set = pd.concat(chunk)

In [6]:
# Preview the first rows to sanity-check the loaded columns.
our_set.head()

Unnamed: 0.1,Unnamed: 0,Title,Author,Subjects
0,0,A tale of two friends,"O'Ryan, Ellie","Musicians Fiction, Bullfighters Fiction, Best ..."
1,1,"Naruto. Vol. 1, Uzumaki Naruto","Kishimoto, Masashi","Ninja Japan Comic books strips etc, Comic book..."
2,2,"Peace, love & Wi-Fi : a ZITS treasury","Scott, Jerry",Duncan Jeremy Fictitious character Comic books...
3,3,The Paris pilgrims : a novel,"Carlile, Clancy","Hemingway Ernest 1899 1961 Fiction, Biographic..."
4,4,"Erotic by nature : a celebration of life, of l...",,"Erotic literature American, American literatur..."


In [7]:
# Each filter is dimensioned for one entry per catalogue row (n) with a
# target false-positive probability p of 5 %.
# NOTE(review): subjects_bf is sized by row count too, although the cell that
# fills it inserts one entry per ", "-separated subject -- confirm that n is a
# sensible capacity for the number of distinct subjects.
n, p = len(our_set), 0.05

titles_bf = BloomFilter(n, p)
print(titles_bf.nhash)  # number of hash functions chosen for these n and p

authors_bf, subjects_bf = BloomFilter(n, p), BloomFilter(n, p)

5


In [8]:
# NOTE(review): nothing in this cell ever writes to subjects.txt. The
# append-mode open is kept (and closed immediately) only so the file is still
# created when missing, exactly as before. Remove it if no other cell needs
# that file to exist.
with open('subjects.txt', 'a'):
    pass


def _clean_strings(column):
    """Yield each distinct, non-empty string value of a DataFrame column once."""
    for value in column.dropna().unique():
        # Skip anything that is not real text (e.g. leftover non-string cells).
        if value and isinstance(value, str):
            yield value


# Iterate column by column instead of DataFrame.iterrows(): iterrows builds a
# Series for every row, which is very slow on a frame of this size. Adding to
# a Bloom filter is idempotent and order-independent, so inserting each
# distinct value once leaves the filters in the same state as the row loop.
for title in _clean_strings(our_set["Title"]):
    titles_bf.add(title)

for author in _clean_strings(our_set["Author"]):
    authors_bf.add(author)

# A "Subjects" cell holds several subjects separated by ", "; every individual
# subject becomes its own filter entry.
unique_subjects = set()
for subjects in _clean_strings(our_set["Subjects"]):
    unique_subjects.update(subjects.split(", "))

for subject in unique_subjects:
    subjects_bf.add(subject)

In [9]:
def print_menu():
    """Print the library search menu: banner, the four options, closing rule.

    The closing rule is derived from the banner length so that both lines
    always have the same width.
    """
    dashes = 30 * "-"
    banner = dashes + " WELCOME TO OUR LIBRARY " + dashes

    print(banner)
    print("1. Search by Title")
    print("2. Search by Author")
    print("3. Search by Subject")
    print("0. Exit")
    # Previously a fixed 67 dashes, which left the rule shorter than the banner.
    print(len(banner) * "-")


# Show the menu once, before the interactive prompt is started.
print_menu()

def main():
    """Run the interactive search prompt until the user chooses to quit.

    Each round asks for a menu choice:

    * 1, 2 or 3 -- ask for a title, author or subject and look it up in
      ``titles_bf``, ``authors_bf`` or ``subjects_bf`` respectively;
    * 0 -- say goodbye and return;
    * anything else (including non-numeric text) -- report a wrong option
      and ask again.

    The lookups go through Bloom filters, so a "found" answer can be a false
    positive, while a "not found" answer is definitive.

    End of input (EOFError) is treated like choosing 0. KeyboardInterrupt is
    deliberately not caught, so the prompt can always be interrupted.
    """
    # choice -> (prompt for the query, filter to consult, message on a hit)
    searches = {
        1: ("Enter the title: ", titles_bf,
            "The searched book is in our library!"),
        2: ("Enter the author: ", authors_bf,
            "We have books of the searched author in our library!"),
        3: ("Enter the subject: ", subjects_bf,
            "We have books about the searched subject in our library!"),
    }

    try:
        while True:
            try:
                choice = int(input("Enter your choice [1-3] or 0 to quit: "))
            except ValueError:
                # Not a number: fall through to the "wrong option" branch.
                choice = None

            if choice == 0:
                # Leave the loop; previously the goodbye was printed but the
                # loop kept running forever.
                break

            if choice not in searches:
                # Anything other than 0-3 is rejected.
                print("Wrong option selected.")
                continue

            prompt, bloom, found_message = searches[choice]
            if bloom.check(input(prompt)):
                print(found_message)
            else:
                print("We're sorry, but we didn't find a result.")
    except EOFError:
        # The input stream was closed; there is nothing more to read, so quit.
        pass

    print("See you soon!")

# Start the interactive prompt; this cell blocks while main() waits on input().
main()

------------------------------ WELCOME TO OUR LIBRARY ------------------------------
1. Search by Title
2. Search by Author
3. Search by Subject
0. Exit
-------------------------------------------------------------------
We have books about the searched subject in our library!
We have books about the searched subject in our library!
See you soon!
