In [1]:
import fitz
import re
import pymongo
import gridfs

In [11]:
def extract_section(document, section_heading):
    """
    Extract the text that follows an "Item <x>. <heading>" line in a document.

    The text of every page is concatenated and scanned line by line. Capturing
    starts after the first line matching ``Item <digits/letters>. <heading>``
    (case-insensitive) and stops at the next line whose cased characters are
    all upper-case, which is treated as the start of a new heading. Lines that
    match the heading pattern themselves are never included in the result.

    :param document: Object supporting ``len()`` and integer indexing whose
        items provide ``get_text()`` (e.g. the document returned by
        ``fitz.open``).
    :param section_heading: Heading text without the "Item x." prefix. Any run
        of whitespace inside the heading matches any run of whitespace in the
        document, so "Risk Factors" also finds "Risk  Factors".
    :return: The captured lines (stripped and joined with newlines),
        "Section not found." when no line was captured, or
        "Error: <message>" if any exception was raised while extracting.
    """
    try:
        # Join once instead of growing a string page by page.
        full_text = "".join(
            document[page_num].get_text() for page_num in range(len(document))
        )

        # Build the matcher once, outside the line loop. Words are escaped
        # individually and joined with \s+ so that extra spaces inserted by
        # text justification in the PDF do not prevent a match.
        heading_words = r"\s+".join(
            re.escape(word) for word in section_heading.split()
        )
        heading_pattern = re.compile(
            rf"Item\s*[\dA-Z]+\.\s*{heading_words}", re.IGNORECASE
        )

        section_text = []
        capture = False

        for line in full_text.split("\n"):
            if heading_pattern.search(line):
                capture = True  # Start capturing from the next line
            elif capture and line.strip().isupper():
                break  # An all-caps line marks the next heading
            elif capture:
                section_text.append(line.strip())

        return "\n".join(section_text) if section_text else "Section not found."

    except Exception as e:
        # Deliberate best-effort: callers store the returned string as the
        # section value, so failures are reported in-band rather than raised.
        return f"Error: {str(e)}"

In [12]:
def test(mongo_uri, db_name, filename, max_sections=5):
    """
    Fetch a PDF from GridFS and extract its first few "Item" sections.

    The PDF bytes are read from the GridFS bucket of ``db_name``, opened with
    fitz, and the first 10 pages are scanned for table-of-contents entries of
    the form "Item <n>. <heading> <page>". Each heading found (up to
    ``max_sections``) is then passed to ``extract_section``.

    :param mongo_uri: MongoDB connection URI.
    :param db_name: Name of the database holding the GridFS bucket.
    :param filename: GridFS ``filename`` of the PDF to process.
    :param max_sections: Maximum number of headings to extract (default 5);
        ``None`` processes every heading found.
    :return: Dict mapping each heading to the string returned by
        ``extract_section`` for it.
    :raises FileNotFoundError: If no GridFS file has the given filename.
    """
    client = pymongo.MongoClient(mongo_uri)
    try:
        fs = gridfs.GridFS(client[db_name])

        pdf_file = fs.find_one({"filename": filename})
        if pdf_file is None:
            raise FileNotFoundError(f"File with filename '{filename}' not found in the database.")

        # All bytes are loaded here, so the connection is not needed afterwards.
        pdf_data = pdf_file.read()
    finally:
        client.close()

    document = fitz.open(stream=pdf_data, filetype="pdf")
    try:
        # The TOC is expected within the first 10 pages.
        text = "".join(
            document[page_num].get_text()
            for page_num in range(min(10, len(document)))
        )

        # "Item <n>. <heading>" followed by whitespace and a page number.
        toc_pattern = re.compile(r"(Item\s\d+[A-Z]*\.\s+.+?)\s+\d{1,4}", re.MULTILINE)
        toc = [match.strip() for match in toc_pattern.findall(text)]

        # Strip the "Item <n>." prefix whatever whitespace follows it, so the
        # heading is bare both for "Item 1.\nBusiness" and "Item 1. Business".
        headers = [re.sub(r'^Item\s+\d+[A-Z]*\.\s*', '', item) for item in toc]

        return {
            header: extract_section(document, header)
            for header in headers[:max_sections]
        }
    finally:
        # Close the document even if header or section extraction fails.
        document.close()

In [17]:
def open_document(mongo_uri, db_name, filename):
    """
    Load a PDF stored in GridFS and open it with fitz.

    The MongoDB connection is used only to read the file's bytes and is closed
    before returning; the returned document is backed by those in-memory
    bytes. The caller is responsible for closing the returned document.

    :param mongo_uri: MongoDB connection URI.
    :param db_name: Name of the database holding the GridFS bucket.
    :param filename: GridFS ``filename`` of the PDF to open.
    :return: The document returned by ``fitz.open`` for the PDF bytes.
    :raises FileNotFoundError: If no GridFS file has the given filename.
    """
    client = pymongo.MongoClient(mongo_uri)
    try:
        fs = gridfs.GridFS(client[db_name])

        pdf_file = fs.find_one({"filename": filename})
        if pdf_file is None:
            raise FileNotFoundError(f"File with filename '{filename}' not found in the database.")

        # Read everything now so the client can be released immediately.
        pdf_data = pdf_file.read()
    finally:
        client.close()

    return fitz.open(stream=pdf_data, filetype="pdf")

In [18]:
def extract_headers(document):
    """
    Collect section headings from the table of contents of a document.

    The text of the first 10 pages (or fewer, if the document is shorter) is
    searched for entries of the form "Item <n>. <heading> <page number>". The
    "Item <n>." prefix is removed from each entry regardless of the whitespace
    that follows it, so the result is the bare heading both when the heading
    is on the line after "Item <n>." and when it is on the same line.

    :param document: Object supporting ``len()`` and integer indexing whose
        items provide ``get_text()`` (e.g. the document returned by
        ``fitz.open``).
    :return: List of heading strings in the order they were found; empty if
        no entry matched.
    """
    # The TOC is expected within the first 10 pages.
    text = "".join(
        document[page_num].get_text()
        for page_num in range(min(10, len(document)))
    )

    # "Item <n>. <heading>" followed by whitespace and a 1-4 digit page number.
    toc_pattern = re.compile(r"(Item\s\d+[A-Z]*\.\s+.+?)\s+\d{1,4}", re.MULTILINE)
    toc = [match.strip() for match in toc_pattern.findall(text)]

    # Mirror the whitespace tolerance of toc_pattern when stripping the prefix;
    # a leftover "Item <n>. " would otherwise end up inside the heading.
    return [re.sub(r'^Item\s+\d+[A-Z]*\.\s*', '', item) for item in toc]

In [None]:
def extract_content(mongo_uri, db_name, filename):
    """
    Extract every table-of-contents section of a PDF stored in GridFS.

    Opens the document via ``open_document``, finds its headings with
    ``extract_headers`` and runs ``extract_section`` for each of them.

    :param mongo_uri: MongoDB connection URI.
    :param db_name: Name of the database holding the GridFS bucket.
    :param filename: GridFS ``filename`` of the PDF to process.
    :return: Dict with a 'file_name' entry holding ``filename`` plus one entry
        per heading, mapping the heading to the string returned by
        ``extract_section``.
    """
    document = open_document(mongo_uri, db_name, filename)
    try:
        info_dict = {
            'file_name': filename
        }

        for header in extract_headers(document):
            info_dict[header] = extract_section(document, header)

        return info_dict
    finally:
        # Release the document even if header extraction raises.
        document.close()

In [3]:
# Settings passed to the extraction functions in the cells below.
filename = "aapl_10-K_report.pdf"        # GridFS filename of the report to parse
db_name = "financial_reports"            # database holding the GridFS bucket
mongo_uri = "mongodb://localhost:27017"  # MongoDB connection URI

In [13]:
# Run the exploratory extraction on the stored report; `res` maps each heading
# found in the table of contents to the text extracted for it.
res = test(mongo_uri, db_name, filename)

In [14]:
# Display the whole heading -> section-text mapping.
res

{'Business': 'Company Background\nThe  Company  designs,  manufactures  and  markets  smartphones,  personal  computers,  tablets,  wearables  and\naccessories, and sells a variety of related services. The Company’s fiscal year is the 52- or 53-week period that ends\non the last Saturday of September.\nProducts\niPhone\niPhone  is the Company’s line of smartphones based on its iOS operating system. The iPhone line includes iPhone 16\nPro, iPhone 16, iPhone 15, iPhone 14 and iPhone SE .\nMac\nMac  is the Company’s line of personal computers based on its macOS  operating system. The Mac line includes\nlaptops MacBook Air  and MacBook Pro , as well as desktops iMac , Mac mini , Mac Studio  and Mac Pro .\niPad\niPad  is the Company’s line of multipurpose tablets based on its iPadOS  operating system. The iPad line includes\niPad Pro , iPad Air , iPad and iPad mini .\nWearables, Home and Accessories\nWearables  includes  smartwatches,  wireless  headphones  and  spatial  computers.  The  Co

In [16]:
# Inspect the text captured for the 'Business' heading.
res['Business']

'Company Background\nThe  Company  designs,  manufactures  and  markets  smartphones,  personal  computers,  tablets,  wearables  and\naccessories, and sells a variety of related services. The Company’s fiscal year is the 52- or 53-week period that ends\non the last Saturday of September.\nProducts\niPhone\niPhone  is the Company’s line of smartphones based on its iOS operating system. The iPhone line includes iPhone 16\nPro, iPhone 16, iPhone 15, iPhone 14 and iPhone SE .\nMac\nMac  is the Company’s line of personal computers based on its macOS  operating system. The Mac line includes\nlaptops MacBook Air  and MacBook Pro , as well as desktops iMac , Mac mini , Mac Studio  and Mac Pro .\niPad\niPad  is the Company’s line of multipurpose tablets based on its iPadOS  operating system. The iPad line includes\niPad Pro , iPad Air , iPad and iPad mini .\nWearables, Home and Accessories\nWearables  includes  smartwatches,  wireless  headphones  and  spatial  computers.  The  Company’s  line