In [7]:
import pdfquery
import glob
import PyPDF2
from os import listdir
from os.path import isfile, join
import re

In [8]:
class Appearance:
    
    def __init__(self, docId, frequency):
        self.docId = docId
        self.frequency = frequency
        
    def __repr__(self):
        
        return str(self.__dict__)

In [9]:
class Database:
    
    def __init__(self):
        self.db = dict()

    def __repr__(self):
        return str(self.__dict__)
    
    def get(self, id):
        return self.db.get(id, None)
    
    def add(self, document):
        return self.db.update({document['id']: document})

    def remove(self, document):
        return self.db.pop(document['id'], None)

In [10]:
class InvertedIndex:
    def __init__(self, db):
        self.index = dict()
        self.db = db

    def __repr__(self):
        return str(self.index)
        
    def index_document(self, document):
        clean_text = re.sub(r'[^\w\s]','', document['text'])
        clean_text = clean_text.lower()
        
        filtered_crps = re.sub(r'\d+', '', clean_text)
        punc_filtered = filtered_crps.strip(" ")
        
        import nltk
        nltk.download('stopwords')
        nltk.download('punkt')
        nltk.download('wordnet')


        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        from nltk.tokenize import word_tokenize
        tokens = word_tokenize(punc_filtered)
        resultant_token = [i for i in tokens if not i in stop_words]
        
        appearances_dict = dict()
        
        for term in resultant_token:
            term_frequency = appearances_dict[term].frequency if term in appearances_dict else 0
            appearances_dict[term] = Appearance(document['id'], term_frequency + 1)
           
        update_dict = { key: [appearance]
                       if key not in self.index
                       else self.index[key] + [appearance]
                       for (key, appearance) in appearances_dict.items() }
        self.index.update(update_dict)
        
        self.db.add(document)
        return document
    
    def lookup_query(self, query):
        return { term: self.index[term] for term in query.split(' ') if term in self.index }


In [13]:

def highlight_term(id, term, text):
    replaced_text = text.replace(term, "\033[1;32;40m {term} \033[0;0m".format(term=term))
    return "document: {id}".format(id=id)

def main():
    import os
    db = Database()
    index = InvertedIndex(db)
  
    pdf_dir = "./PDF"
    pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
    
    for file in pdf_files:
        pg_cntnt = ''
        page_cnt = 0
        read_pdf = PyPDF2.PdfFileReader(file)
        number_of_pages = read_pdf.getNumPages()
        for i in range(number_of_pages):
            page = read_pdf.getPage(page_cnt)
            page_cnt += 1
            page_content = page.extractText()
            pg_cntnt += str(page_content)
        docs = {'id':os.path.abspath(file),'text':pg_cntnt}
        index.index_document(docs)
    search_term = input("Enter term(s) to search: ")
    result = index.lookup_query(search_term.lower())
    
    for term in result.keys():
        for appearance in result[term]:
            document = db.get(appearance.docId)
            print(highlight_term(appearance.docId, term, document['text']))
        print("-----------------------------")    
    
main()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\

Enter term(s) to search: fees
document: C:\Works\PDF_IE_Extract\PDF\doc14.pdf
document: C:\Works\PDF_IE_Extract\PDF\doc16.pdf
document: C:\Works\PDF_IE_Extract\PDF\doc17.pdf
document: C:\Works\PDF_IE_Extract\PDF\doc18.pdf
-----------------------------
