In [None]:
%pip install -qU pymupdf 

In [None]:
import fitz  # PyMuPDF - vanilla pdf parser
import re
from collections import defaultdict

In [None]:
# Matches ISO-style form codes, e.g. "CG 20 37 12 19" or "CG 00 01".
# The pattern is assembled from adjacent raw-string pieces so each part can
# be commented; the pieces concatenate into one pattern string.
ISO_CODE_PATTERN = re.compile(
    r"\b[A-Z]{2}"                 # two uppercase letters at a word boundary
    r"\s?\d{2}\s?\d{2}"           # two 2-digit groups, each optionally preceded by one whitespace
    r"(?:\s?\d{2}\s?\d{2})?"      # optional further pair of 2-digit groups
    r"\b"                         # must end on a word boundary
)

def extract_iso_codes_from_headers_footers(
    pdf_path: str,
    header_fraction: float = 0.12,
    footer_fraction: float = 0.12,
) -> dict[str, list[dict[str, str | int]]]:
    """
    Extract ISO-style codes from the header and footer bands of each page.

    Each text span returned by ``page.get_text("dict")`` is examined on its
    own. A span is treated as header text when its bbox ``y0`` is less than
    or equal to ``rect.y0 + header_fraction * height``, and as footer text
    when its bbox ``y1`` is greater than or equal to
    ``rect.y1 - footer_fraction * height``. Spans in either band are scanned
    with ``ISO_CODE_PATTERN``.

    Because matching is done per span, a code whose characters are split
    across several spans is not detected.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        header_fraction: Fraction of the page height used for the header band.
        footer_fraction: Fraction of the page height used for the footer band.

    Returns:
        A plain dict:
            {code: [{"page": page_number, "snippet": text_span}, ...]}
        ``code`` has its whitespace runs collapsed to single spaces,
        ``page_number`` is 1-based, and ``text_span`` is the stripped text of
        the span in which the code was found. One entry is appended per
        match, so a code can appear several times for the same page.
    """
    codes = defaultdict(list)

    # The context manager closes the document even if processing a page raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            rect = page.rect
            height = rect.height

            # Define header and footer bands
            header_cutoff = rect.y0 + header_fraction * height
            footer_cutoff = rect.y1 - footer_fraction * height

            text_dict = page.get_text("dict")

            for block in text_dict.get("blocks", []):
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        # Only the vertical extent of the span is needed.
                        _, y0, _, y1 = span["bbox"]
                        text = span.get("text", "").strip()
                        if not text:
                            continue

                        # Only look at text in header or footer bands
                        in_header = y0 <= header_cutoff
                        in_footer = y1 >= footer_cutoff
                        if not (in_header or in_footer):
                            continue

                        for match in ISO_CODE_PATTERN.finditer(text):
                            # Normalize "CG  20 37" / "CG\t20 37" to "CG 20 37".
                            code = " ".join(match.group(0).split())
                            codes[code].append(
                                {
                                    "page": page_num,
                                    "snippet": text,
                                }
                            )

    # Return a plain dict so that looking up an unknown code raises KeyError
    # instead of silently inserting an empty list into the result.
    return dict(codes)

In [None]:
import time

# Example usage: run the extractor on a sample PDF and time the call.
pdf_path = "examples/Michigan-Hospitality-Liability-Forms.pdf"

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the system clock and can jump if it is
# adjusted while the extraction runs.
start_time = time.perf_counter()
result = extract_iso_codes_from_headers_footers(pdf_path)
end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Extraction took {elapsed_time:.2f} seconds")

# Last expression of the cell, so the notebook displays the extracted mapping.
result

In [None]:
# Number of distinct codes found (the result mapping is keyed by code).
len(result)