In [8]:
import fitz  # PyMuPDF
import re

In [2]:
pdf_path = "/Users/aditkotwal/Projects/FinancialAnalysisApp/Reports/aapl_10-K_report.pdf"

In [9]:
def extract_toc_custom(pdf_path):
    """
    Custom extraction of the Table of Contents from the PDF by analyzing text.
    """
    document = fitz.open(pdf_path)
    text = ""

    # Extract text from the first few pages (TOC is usually at the beginning)
    for page_num in range(min(10, len(document))):  # Check first 10 pages
        text += document[page_num].get_text()

    document.close()

    # Identify potential TOC entries using regex
    toc_pattern = re.compile(r"(Item\s\d+[A-Z]*\.\s+.+?)\s+\d{1,4}", re.MULTILINE)
    matches = toc_pattern.findall(text)

    # Clean up and store results
    toc = [match.strip() for match in matches]
    return toc

# Example usage
toc_headers = extract_toc_custom(pdf_path)

# Print the extracted TOC headers
if toc_headers:
    print("Extracted Table of Contents:")
    for header in toc_headers:
        print(header)
else:
    print("No Table of Contents could be identified.")

Extracted Table of Contents:
Item 1.
Business
Item 1A.
Risk Factors
Item 1B.
Unresolved Staff Comments
Item 1C.
Cybersecurity
Item 2.
Properties
Item 3.
Legal Proceedings
Item 4.
Mine Safety Disclosures
Item 6.
[Reserved]
Item 7.
Management’s Discussion and Analysis of Financial Condition and Results of Operations
Item 7A.
Quantitative and Qualitative Disclosures About Market Risk
Item 8.
Financial Statements and Supplementary Data
Item 9.
Changes in and Disagreements with Accountants on Accounting and Financial Disclosure
Item 9A.
Controls and Procedures
Item 9B.
Other Information
Item 9C.
Disclosure Regarding Foreign Jurisdictions that Prevent Inspections
Item 10.
Directors, Executive Officers and Corporate Governance
Item 11.
Executive Compensation
Item 13.
Certain Relationships and Related Transactions, and Director Independence
Item 14.
Principal Accountant Fees and Services
Item 15.
Exhibit and Financial Statement Schedules
Item 16.
Form


In [3]:
def extract_toc_custom(pdf_path):
    """
    Custom extraction of the Table of Contents from the PDF by analyzing text.
    """
    document = fitz.open(pdf_path)
    text = ""

    # Extract text from the first few pages (TOC is usually at the beginning)
    for page_num in range(min(10, len(document))):  # Check first 10 pages
        text += document[page_num].get_text()

    document.close()

    # Adjusted regex to capture "Item x" along with the header and maintain formatting
    toc_pattern = re.compile(r"(Item\s[\dA-Z]+(?:\.\s+)[^\d]+)", re.MULTILINE)
    matches = toc_pattern.findall(text)

    # Clean up and store results
    toc = [match.replace('  ', '    ') for match in matches]  # Replace multiple spaces with 4 spaces for better formatting
    return toc

# Example usage
toc_headers = extract_toc_custom(pdf_path)

# Print the extracted TOC headers with the desired formatting
if toc_headers:
    print("Extracted Table of Contents:")
    for header in toc_headers:
        print(header)
else:
    print("No Table of Contents could be identified.")

Extracted Table of Contents:
Item 1.
Business

Item 1A.
Risk Factors

Item 1B.
Unresolved Staff Comments

Item 1C.
Cybersecurity

Item 2.
Properties

Item 3.
Legal Proceedings

Item 4.
Mine Safety Disclosures

Item 5.
Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of
Equity Securities

Item 6.
[Reserved]

Item 7.
Management’s Discussion and Analysis of Financial Condition and Results of Operations

Item 7A.
Quantitative and Qualitative Disclosures About Market Risk

Item 8.
Financial Statements and Supplementary Data

Item 9.
Changes in and Disagreements with Accountants on Accounting and Financial Disclosure

Item 9A.
Controls and Procedures

Item 9B.
Other Information

Item 9C.
Disclosure Regarding Foreign Jurisdictions that Prevent Inspections

Item 10.
Directors, Executive Officers and Corporate Governance

Item 11.
Executive Compensation

Item 12.
Security Ownership of Certain Beneficial Owners and Management and Related Stockholder
Matters

In [4]:
def extract_table_of_contents(pdf_path):
    """
    Extract the Table of Contents (TOC) from the PDF.
    """
    document = fitz.open(pdf_path)
    toc = document.get_toc()
    document.close()

    # Extract TOC entries and format them as regex patterns
    headers = []
    for level, title, page in toc:
        headers.append(re.escape(title.strip()))  # Escape special regex characters

    return headers

def extract_headers_and_content_using_toc(pdf_path, headers):
    """
    Extract headers and their contents based on the TOC.
    """
    document = fitz.open(pdf_path)
    text = ""

    # Extract all text from the PDF
    for page in document:
        text += page.get_text()

    document.close()

    # Compile header patterns dynamically from TOC
    header_regex = re.compile(
        r"(?P<header>(" + "|".join(headers) + r"))\s*(?P<content>.*?)(?=(" + "|".join(headers) + r")|\Z)",
        re.DOTALL | re.IGNORECASE
    )
    
    # Find all matches
    matches = header_regex.finditer(text)
    
    # Extract headers and contents
    results = {}
    for match in matches:
        header = match.group("header").strip()
        content = match.group("content").strip()
        results[header] = content

    return results

# Example usage
pdf_path = "/Users/aditkotwal/Projects/FinancialAnalysisApp/Reports/aapl_10-K_report.pdf"

# Step 1: Extract Table of Contents
headers_from_toc = extract_table_of_contents(pdf_path)

# Step 2: Extract Headers and Content Using TOC
if headers_from_toc:
    extracted_data = extract_headers_and_content_using_toc(pdf_path, headers_from_toc)

    # Print results
    for header, content in extracted_data.items():
        print(f"Header: {header}")
        print(f"Content: {content[:500]}...")  # Print first 500 characters for brevity
        print("-" * 80)
else:
    print("No Table of Contents found in the document.")

No Table of Contents found in the document.


In [1]:
import pymongo
import gridfs
import fitz  # PyMuPDF


In [12]:
def extract_info_from_mongo_file(mongo_uri, db_name, filename):
    # Connect to MongoDB
    client = pymongo.MongoClient(mongo_uri)
    db = client[db_name]
    
    # Access GridFS collection
    fs = gridfs.GridFS(db)
    
    # Retrieve the file from GridFS by filename
    pdf_file = fs.find_one({"filename": filename})
    
    if pdf_file is None:
        raise FileNotFoundError(f"File with filename '{filename}' not found in the database.")
    
    # Read the binary data of the PDF
    pdf_data = pdf_file.read()
    
    # Open the PDF document using fitz
    document = fitz.open(stream=pdf_data, filetype="pdf")
    
    
    # Extract headers
    text = ""

    # Extract text from the first few pages (TOC is usually at the beginning)
    for page_num in range(min(10, len(document))):  # Check first 10 pages
        text += document[page_num].get_text()

    document.close()

    # Identify potential TOC entries using regex
    toc_pattern = re.compile(r"(Item\s\d+[A-Z]*\.\s+.+?)\s+\d{1,4}", re.MULTILINE)
    matches = toc_pattern.findall(text)

    # Clean up and store results
    toc = [match.strip() for match in matches]
    return toc

# Example usage
mongo_uri = "mongodb://localhost:27017"
db_name = "financial_reports"
filename = "aapl_10-K_report.pdf"

In [13]:
# Example usage
toc_headers = extract_info_from_mongo_file(mongo_uri, db_name, filename)

# Print the extracted TOC headers
if toc_headers:
    print("Extracted Table of Contents:")
    for header in toc_headers:
        print(header)
else:
    print("No Table of Contents could be identified.")

Extracted Table of Contents:
Item 1.
Business
Item 1A.
Risk Factors
Item 1B.
Unresolved Staff Comments
Item 1C.
Cybersecurity
Item 2.
Properties
Item 3.
Legal Proceedings
Item 4.
Mine Safety Disclosures
Item 6.
[Reserved]
Item 7.
Management’s Discussion and Analysis of Financial Condition and Results of Operations
Item 7A.
Quantitative and Qualitative Disclosures About Market Risk
Item 8.
Financial Statements and Supplementary Data
Item 9.
Changes in and Disagreements with Accountants on Accounting and Financial Disclosure
Item 9A.
Controls and Procedures
Item 9B.
Other Information
Item 9C.
Disclosure Regarding Foreign Jurisdictions that Prevent Inspections
Item 10.
Directors, Executive Officers and Corporate Governance
Item 11.
Executive Compensation
Item 13.
Certain Relationships and Related Transactions, and Director Independence
Item 14.
Principal Accountant Fees and Services
Item 15.
Exhibit and Financial Statement Schedules
Item 16.
Form
