# **Setup**

* https://pypi.org/project/PyMuPDF/
* https://towardsdatascience.com/the-beginning-of-information-extraction-highlight-key-words-and-obtain-frequencies-a03da0a1ba71

In [2]:
import fitz #PymuPDF
from collections import Counter

# **Define Function**

In [13]:
def highlight_terms_and_count(input_pdf_path, output_pdf_path, terms_to_highlight, output_text_file):
    """
    Highlight every occurrence of the queried terms in a PDF and record their frequencies.

    Each page of the input PDF is searched for each term. Every hit gets a
    semi-transparent yellow highlight annotation, and the highlighted document
    is saved to a new file. The per-term totals are printed and written to a
    text file, one ``term: count`` line per term.

    A term's count is the number of hit rectangles returned by
    ``page.search_for`` summed over all pages. Terms with no hits are still
    reported, with a count of 0.

    Arguments:
    input_pdf_path (str): Path to the PDF file to read.
    output_pdf_path (str): Path the highlighted PDF is written to.
    terms_to_highlight (list): List of terms (str) to highlight. A single
        string is treated as one term.
    output_text_file (str): Path the term frequencies are written to.

    Returns:
    None. The results are the two files written to ``output_pdf_path`` and
    ``output_text_file``.
    """
    # Iterating a bare string would search for each character separately,
    # so wrap a lone term in a list.
    if isinstance(terms_to_highlight, str):
        terms_to_highlight = [terms_to_highlight]

    term_counter = Counter()

    # The context manager closes the document even if searching or saving raises.
    with fitz.open(input_pdf_path) as pdf_document:
        for page in pdf_document:
            for term in terms_to_highlight:
                term_instances = page.search_for(term)
                # Accumulate the hits found on this page.
                term_counter[term] += len(term_instances)

                for term_rect in term_instances:
                    highlight = page.add_highlight_annot(term_rect)
                    # Yellow highlight at 50% opacity.
                    highlight.set_colors(stroke=(1, 1, 0))
                    highlight.set_opacity(0.5)
                    # Colour and opacity changes only reach the annotation's
                    # rendered appearance once update() is called.
                    highlight.update()

        # Save while the document is still open.
        pdf_document.save(output_pdf_path)

    # Explicit encoding so non-ASCII terms are written the same way on every platform.
    with open(output_text_file, 'w', encoding="utf-8") as text_file:
        for term, frequency in term_counter.items():
            print(term, frequency)
            text_file.write(f"{term}: {frequency}\n")

# **Extract Information**

In [9]:
# PDF to read -- replace with your input PDF file.
input_pdf_path = "Input.pdf"
# Destination of the highlighted PDF -- replace with your output PDF file.
output_pdf_path = "Output.pdf"
# Terms to highlight and count -- add the terms you want here.
terms_to_highlight = ["neural", "networks"]
# Text file that stores the term frequencies.
output_text_file = "term_frequencies.txt"

In [14]:
# Run the highlighter with the settings defined above.
highlight_terms_and_count(
    input_pdf_path=input_pdf_path,
    output_pdf_path=output_pdf_path,
    terms_to_highlight=terms_to_highlight,
    output_text_file=output_text_file,
)

neural 21
networks 23
