<a href="https://colab.research.google.com/github/btoneil2021/100-projects/blob/main/PDFTextExtractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1


In [10]:
import fitz
import re

In [12]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, or ``None`` if the
        file could not be opened or read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document on every exit path, so the
        # underlying file handle is not leaked when a page fails to load.
        with fitz.open(pdf_path) as document:
            # join() builds the result once instead of growing a string with
            # `+=` for every page.
            return "".join(
                document.load_page(page_num).get_text()
                for page_num in range(len(document))
            )
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error reading PDF file: {e}")
        return None

In [14]:
def analyze_text(text):
    """Summarise a text by its word, character and line counts.

    Args:
        text: The string to measure. ``None`` or an empty string yields an
            all-zero report.

    Returns:
        A dict with the keys ``"word_count"``, ``"character_count"`` and
        ``"line_count"``.
    """
    if text:
        # Words are maximal runs of \w characters in the lower-cased text.
        tokens = re.findall(r'\b\w+\b', text.lower())
        n_words = len(tokens)
        n_chars = len(text)
        n_lines = len(text.splitlines())
    else:
        # Falsy input (None or "") short-circuits to zeros.
        n_words = n_chars = n_lines = 0

    return {
        "word_count": n_words,
        "character_count": n_chars,
        "line_count": n_lines,
    }

In [16]:
def find_keywords(text, keywords):
    """Count case-insensitive occurrences of each keyword in a text.

    A keyword is matched as a whole word on every edge that is itself a word
    character: "think" does not match inside "thinking". An edge that is
    punctuation or whitespace (as in "C++", "#tag" or "I ") gets no boundary
    requirement, because ``\\b`` next to a non-word character only matches
    when the neighbouring text character is a word character, which would
    make such keywords unfindable in ordinary text.

    Args:
        text: The text to search. ``None`` or an empty string yields ``{}``.
        keywords: Iterable of keyword strings.

    Returns:
        A dict mapping each keyword to its number of non-overlapping matches.
        An empty keyword is reported with a count of 0.
    """
    keyword_counts = {}
    if not text:
        return keyword_counts

    for keyword in keywords:
        if not keyword:
            # An empty pattern would match at every position; treat as absent.
            keyword_counts[keyword] = 0
            continue

        # Anchor to a word boundary only where the keyword edge is a word char.
        prefix = r'\b' if re.fullmatch(r'\w', keyword[0]) else ''
        suffix = r'\b' if re.fullmatch(r'\w', keyword[-1]) else ''
        pattern = prefix + re.escape(keyword) + suffix

        keyword_counts[keyword] = len(re.findall(pattern, text, re.IGNORECASE))
    return keyword_counts

In [17]:
def main():
    """Print a text report for 'cover_letter.pdf'.

    Extracts the PDF's text, prints its word/character/line counts, then
    prints how often each of a fixed list of keywords occurs. If extraction
    returns ``None`` or an empty string, no report is printed.
    """
    pdf_file_path = 'cover_letter.pdf'
    extracted_text = extract_text_from_pdf(pdf_file_path)

    # No text, no report: covers both a failed read (None) and an empty PDF.
    if not extracted_text:
        return

    stats = analyze_text(extracted_text)

    print("--- Text Analysis ---")
    print(f"Word Count: {stats['word_count']}")
    print(f"Character Count: {stats['character_count']}")
    print(f"Line Count: {stats['line_count']}")
    print("-" * 21)

    keywords_to_find = ["I ", "think", "position"]
    hits = find_keywords(extracted_text, keywords_to_find)

    print("\n--- Keyword Search Results ---")
    for keyword in hits:
        count = hits[keyword]
        print(f"'{keyword}': {count} occurrences")
    print("-" * 28)

# Entry-point guard: run the report only when this code executes as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()

--- Text Analysis ---
Word Count: 353
Character Count: 2170
Line Count: 36
---------------------

--- Keyword Search Results ---
'I ': 12 occurrences
'think': 0 occurrences
'position': 1 occurrences
----------------------------
