In [99]:
# import fitz  # PyMuPDF

# def extract_management_report_toc(pdf_path):
#     # Open the PDF
#     doc = fitz.open(pdf_path)
    
#     # Load the first page
#     page = doc[0]
#     text_content = page.get_text("text")
    
#     # Split text into lines and remove empty whitespace
#     lines = [line.strip() for line in text_content.split('\n') if line.strip()]
    
#     headers = []
#     page_numbers = []
    
#     for line in lines:
#         # Check if the line is purely a number (the page number)
#         if line.isdigit():
#             page_numbers.append(int(line))
#         else:
#             # Otherwise, it's a section header
#             headers.append(line)
            
#     # Zip them together into a dictionary
#     # This pairs the N-th header with the N-th page number found
#     report_dict = dict(zip(headers, page_numbers))
    
#     return report_dict

# # --- Usage ---
pdf_file = r"C:\Users\arnew\Desktop\HE\ultimateclaudelovers\reports\vw_ESG_report.pdf"
# pdf_file = pdf_path = r"C:\Users\arnew\Desktop\HE\ultimateclaudelovers\reports\BMW-Group-Management-Report-2024-en.pdf"
# try:
#     toc_dict = extract_management_report_toc(pdf_file)
    
#     print("Extracted Header:Page Number Dictionary:")
#     import pprint
#     pprint.pprint(toc_dict)
# except Exception as e:
#     print(f"Error: {e}")

In [100]:
import fitz  # PyMuPDF

def extract_universal_toc(pdf_path):
    doc = fitz.open(pdf_path)
    page = doc[0]
    
    # Get all words with their coordinates (x0, y0, x1, y1, "text", ...)
    words = page.get_text("words")
    
    # 1. Group words into horizontal rows based on their Y-coordinate
    rows = {}
    for w in words:
        x0, y0, x1, y1, text = w[:5]
        # Round Y to group words on the same visual line
        y_key = round(y0, 1) 
        if y_key not in rows:
            rows[y_key] = []
        rows[y_key].append({'x0': x0, 'text': text})
    
    extracted_data = {}
    seen_pages = set()

    # 2. Process each row
    for y in sorted(rows.keys()):
        # Sort words in this row from left to right
        line = sorted(rows[y], key=lambda x: x['x0'])
        if not line:
            continue
            
        # Extract the text and find the page number
        full_line_text = [item['text'] for item in line]
        
        # Identify which element is the page number
        # It's usually the first (left-aligned) or last (right-aligned) item
        page_num = None
        header_text = ""
        
        if full_line_text[0].isdigit():
            # Format: [Page Number] [Heading Text]
            page_num = int(full_line_text[0])
            header_text = " ".join(full_line_text[1:])
        elif full_line_text[-1].isdigit():
            # Format: [Heading Text] [Page Number]
            page_num = int(full_line_text[-1])
            header_text = " ".join(full_line_text[:-1])
            
        # 3. Validation & Duplicate Removal
        # Only add if we found a number and a header, and haven't seen the page yet
        if page_num is not None and header_text.strip():
            # Clean common TOC junk like dots (......)
            clean_header = header_text.replace('...', '').strip()
            
            if page_num not in seen_pages:
                extracted_data[clean_header] = page_num
                seen_pages.add(page_num)

    # 4. Final Sort by Page Number
    return dict(sorted(extracted_data.items(), key=lambda item: item[1]))

# --- Usage ---
toc_dict = extract_universal_toc(pdf_file)
toc_dict

{'Outlook for': 2025}

In [101]:
import fitz
import re

def extract_toc_with_levels(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    first_page = doc[0]

    # Try built-in TOC first
    toc = doc.get_toc()
    if toc:
        doc.close()
        return [{"heading": title, "page": page, "level": level}
                for level, title, page in toc]

    entries = []
    for link in first_page.get_links():
        target_page = link.get("page")
        if target_page is None:
            continue

        rect = fitz.Rect(link.get("from") or link.get("rect"))
        
        # Get spans within the link rect to check font properties
        words = first_page.get_text("dict", clip=rect)
        
        is_bold = False
        font_size = 0
        text_parts = []
        
        for block in words.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text_parts.append(span["text"])
                    font_size = max(font_size, span["size"])
                    # Bold is indicated by "Bold" in font name or flags bit 4
                    if "Bold" in span.get("font", "") or (span.get("flags", 0) & 2**4):
                        is_bold = True

        text = " ".join(text_parts).strip()
        text = re.sub(r'^\d+\s*', '', text)
        text = re.sub(r'\s*\d+$', '', text)
        text = text.strip()

        if text:
            entries.append({
                "heading": text,
                "page": int(target_page) + 1,
                "level": 0 if is_bold else 1,  # bold = top level, regular = sub
                "bold": is_bold,
                "font_size": font_size,
            })

    doc.close()
    return sorted(entries, key=lambda x: x["page"])
font_dict = extract_toc_with_levels(pdf_file)

In [102]:
def filter_top_level_headings(toc_entries: list[dict], tolerance: float = 1.0) -> list[dict]:
    max_font_size = max(e["font_size"] for e in toc_entries)
    return [e for e in toc_entries if e["font_size"] >= max_font_size - tolerance]
filtered_dict = filter_top_level_headings(font_dict)
filtered_dict

[{'heading': 'Goals and Strategies',
  'page': 3,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Internal Management System and Key Performance Indicators',
  'page': 8,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Structure and Business Activities',
  'page': 11,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Disclosures Required Under Takeover Law',
  'page': 15,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Business Development',
  'page': 18,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Shares and Bonds',
  'page': 35,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Results of Operations, Financial Position and Net Assets',
  'page': 43,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Volkswagen AG (condensed, in accordance with the German Commercial Code)',
  'page': 59,
  'level': 1,
  'bold': False,
  'font_size': 8.5},
 {'heading': 'Sustainable

In [103]:
def extract_section_text(pdf_path: str, heading: str, toc_entries: list[dict], filtered_toc: list[dict]) -> str:
    """
    Extract all text from the section of a given heading until the next TOP-LEVEL section starts.
    
    Args:
        pdf_path: Path to the PDF file
        heading: The heading to extract text for (partial match supported)
        toc_entries: Full list of TOC entries from extract_toc_with_levels()
        filtered_toc: Top-level headings only from filter_top_level_headings()
    
    Returns:
        Extracted text as a string
    """
    # Find heading in full toc (partial match)
    matches = [e for e in toc_entries if heading.lower() in e["heading"].lower()]
    if not matches:
        raise ValueError(f"Heading '{heading}' not found in TOC")
    target = matches[0]
    start_page = target["page"]

    # Find end boundary from filtered (top-level) headings only
    sorted_filtered = sorted(filtered_toc, key=lambda x: x["page"])
    end_page = None
    for entry in sorted_filtered:
        if entry["page"] > start_page:
            end_page = entry["page"]
            break

    doc = fitz.open(pdf_path)
    end_page = end_page or doc.page_count

    print(start_page, end_page)
    # raise ValueError
    text_parts = []
    for page_num in range(start_page - 1, end_page - 1):
        text_parts.append(doc[page_num].get_text())

    doc.close()
    return "\n".join(text_parts).strip()


# Usage
toc_entries = extract_toc_with_levels(pdf_file)
filtered_toc = filter_top_level_headings(toc_entries)
text = extract_section_text(pdf_file, "Sustain", toc_entries, filtered_toc)
print(text)

64 99
141 
Sustainable Value Enhancement
Group Management Report 
environment. Together, we follow a vision for the Volkswagen Group and drive sustainable value creation. Trans-
formation is a process, and we are constantly in motion: we regularly review our ambitious targets and contin-
uously adapt them. In systematically implementing our new regenerate+ Group sustainability strategy, we are 
continuing along this path. The strategy features clear measures in four dimensions: 
> Nature, with the focus areas of climate change mitigation, resources and ecosystem 
> Our people, with the focus areas of culture, workforce, occupational safety and preventive health care 
> Society, with the focus areas of supply chain, customers & stakeholders and social engagement 
> Business, with the focus areas of sustainability-related business areas and financing 
Wherever this has already been decided, each focus area is linked to clear goals and milestones, KPIs and 
appropriate packages of measure

In [104]:
import re

def clean_text(text: str) -> str:
    # Replace multiple spaces with single space
    text = re.sub(r' {2,}', ' ', text)
    # Replace 3+ newlines with double newline (preserve paragraph breaks)
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Remove lines that are just whitespace
    text = re.sub(r'\n\s+\n', '\n\n', text)
    # Remove spaces at the start/end of each line
    text = "\n".join(line.strip() for line in text.splitlines())
    return text.strip()
print(clean_text(text))

141
Sustainable Value Enhancement
Group Management Report
environment. Together, we follow a vision for the Volkswagen Group and drive sustainable value creation. Trans-
formation is a process, and we are constantly in motion: we regularly review our ambitious targets and contin-
uously adapt them. In systematically implementing our new regenerate+ Group sustainability strategy, we are
continuing along this path. The strategy features clear measures in four dimensions:
> Nature, with the focus areas of climate change mitigation, resources and ecosystem
> Our people, with the focus areas of culture, workforce, occupational safety and preventive health care
> Society, with the focus areas of supply chain, customers & stakeholders and social engagement
> Business, with the focus areas of sustainability-related business areas and financing
Wherever this has already been decided, each focus area is linked to clear goals and milestones, KPIs and
appropriate packages of measures. ESG-related 