In [None]:
import re
from collections import defaultdict
from typing import Dict, List, Any, Tuple

import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

load_dotenv()

In [None]:
# -------------------------------------------------------------------
# Shared config and helpers
# -------------------------------------------------------------------

# Regex for ISO codes - requires a two-letter prefix followed by one of several
# multi-segment digit layouts.
#
# The edition layout is tried FIRST and has no trailing word boundary:
#   * it ends in ")" (a non-word character), so a trailing \b could only succeed
#     when a word character followed the parenthesis immediately;
#   * the plain "2-4 segments" layout matches the same leading digits, so if it
#     were tried first the "(Ed. mm/yy)" suffix would never be captured.
# The digit-terminated layouts keep their trailing \b so a code is not cut out
# of the middle of a longer digit run.
ISO_CODE_PATTERN = re.compile(
    r"\b"
    r"[A-Z]{2}[\s\-]?"                                                # Two-letter prefix
    r"(?:"
        r"\d{2}[\s\-]?\d{2}[\s\-]?\([Ee]d\.?[\s\-]?\d{2}[/\-]\d{2}\)"  # Edition: 21 70 (Ed. 01/15)
        r"|"
        r"\d{2}(?:[\s\-]?\d{2}){1,3}\b"                              # Standard: 2-4 segments of 2 digits (CG 00 01 04 13)
        r"|"
        r"\d{4}[\s\-]?\d{4}\b"                                       # GL format: 2 segments of 4 digits (GL 0169 0001)
        r"|"
        r"[A-Z]\s?\d{3}(?:[\s\-]?\d{2}){2}\b"                        # Letter variant: P 023 05 23
    r")"
)

def get_header_footer_spans(
    page: fitz.Page,
    header_fraction: float,
    footer_fraction: float,
) -> List[Tuple[str, Tuple[float, float, float, float]]]:
    """Collect the non-empty text spans lying in a page's header or footer band.

    The header band runs from the top of ``page.rect`` down by
    ``header_fraction`` of the page height; the footer band runs from the
    bottom up by ``footer_fraction`` of the page height. A span is kept when
    its top edge is at or above the header limit, or its bottom edge is at or
    below the footer limit.

    Returns:
        A list of ``(stripped_text, (x0, y0, x1, y1))`` tuples in the order the
        spans appear in ``page.get_text("dict")``.
    """
    page_rect = page.rect
    page_height = page_rect.height

    header_limit = page_rect.y0 + header_fraction * page_height
    footer_limit = page_rect.y1 - footer_fraction * page_height

    collected = []
    layout = page.get_text("dict")

    # Blocks without a "lines" entry simply contribute nothing.
    for block in layout.get("blocks", []):
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                stripped = span.get("text", "").strip()
                if stripped:
                    left, top, right, bottom = span["bbox"]
                    if top <= header_limit or bottom >= footer_limit:
                        collected.append((stripped, (left, top, right, bottom)))

    return collected


def build_page_to_codes_map(
    codes_dict: Dict[str, List[Dict[str, Any]]]
) -> Dict[int, List[str]]:
    """Invert a code -> occurrences mapping into page -> list of codes.

    Args:
        codes_dict: Maps each code to a list of occurrence dicts; each
            occurrence must have a ``"page"`` entry.

    Returns:
        A ``defaultdict(list)`` keyed by integer page number. A code appears
        once per occurrence, so a code found twice on one page is listed twice.

    Raises:
        KeyError: If an occurrence has no ``"page"`` entry.
        ValueError / TypeError: If a page value cannot be converted to ``int``.
    """
    page_to_codes: Dict[int, List[str]] = defaultdict(list)
    for code, occurrences in codes_dict.items():
        for occ in occurrences:
            # Coerce to int so the keys agree with pages_to_fix_from_initial_output,
            # which also normalises page numbers with int().
            page_to_codes[int(occ["page"])].append(code)
    return page_to_codes

def pages_to_fix_from_initial_output(pdf_path: str, initial_codes: dict) -> list[int]:
    """List the pages whose regex result is missing or ambiguous.

    A page is "missing" when it exists in the PDF but no code was recorded for
    it, and "ambiguous" when two or more distinct codes were recorded for it.

    Args:
        pdf_path: Path of the PDF, opened only to count its pages.
        initial_codes: Maps each code to a list of occurrence dicts carrying a
            ``"page"`` entry (1-based, converted with ``int``).

    Returns:
        The sorted, de-duplicated page numbers that need a second look.
    """
    # page number -> distinct codes recorded for that page
    codes_seen_on = defaultdict(set)
    for code, occurrences in initial_codes.items():
        for occurrence in occurrences:
            codes_seen_on[int(occurrence["page"])].add(code)

    with fitz.open(pdf_path) as pdf:
        page_count = len(pdf)

    needs_review = set()

    # Pages of the document for which nothing was recorded.
    for page_number in range(1, page_count + 1):
        if page_number not in codes_seen_on:
            needs_review.add(page_number)

    # Pages for which more than one distinct code was recorded.
    for page_number, distinct_codes in codes_seen_on.items():
        if len(distinct_codes) > 1:
            needs_review.add(page_number)

    return sorted(needs_review)

In [None]:
# -------------------------------------------------------------------
# Stage 1: regex-only extraction
# -------------------------------------------------------------------

def extract_iso_codes_from_headers_footers(
    pdf_path: str,
    header_fraction: float = 0.12,
    footer_fraction: float = 0.12,
) -> Dict[str, List[Dict[str, Any]]]:
    """Find ISO form codes in the header/footer bands of every page of a PDF.

    Each header/footer span returned by ``get_header_footer_spans`` is scanned
    with ``ISO_CODE_PATTERN``. Matches are normalised by collapsing whitespace
    runs to single spaces; other separators such as hyphens are left as found.

    Args:
        pdf_path: Path of the PDF to scan.
        header_fraction: Fraction of the page height treated as header.
        footer_fraction: Fraction of the page height treated as footer.

    Returns:
        A plain ``dict`` mapping each normalised code to a list of
        ``{"page": <1-based page number>, "snippet": <span text>}`` dicts, one
        per match. A plain dict is returned (not the internal defaultdict) so
        that looking up an absent code raises ``KeyError`` instead of silently
        inserting an empty entry.
    """
    codes: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            spans = get_header_footer_spans(
                page,
                header_fraction=header_fraction,
                footer_fraction=footer_fraction,
            )

            for text, _bbox in spans:
                for match in ISO_CODE_PATTERN.finditer(text):
                    code = " ".join(match.group(0).split())
                    codes[code].append(
                        {
                            "page": page_num,
                            "snippet": text,
                        }
                    )

    return dict(codes)

In [None]:
# -------------------------------------------------------------------
# Stage 2: LLM cleanup for missing or ambiguous pages only
# This makes the assumption that each page corresponds to at most one policy/endorsement/exclusion
# -------------------------------------------------------------------

# Instruction prefix for the per-page LLM lookup; the page text is appended to it.
# Adjacent literals are concatenated as-is, so every fragment that continues a
# sentence must carry its own trailing space.
base_prompt = (
    "You are given the text of a single page from a commercial insurance policy.\n"
    "Find one ISO (Insurance Services Office) form code if present. A valid code looks like: CG 20 37 12 19. Sometimes, "
    "there are variants on this pattern (extra letters, different date formats, etc)\n"
    "Return exactly one code or the string NONE.\n"
    "If there are multiple codes mentioned, use page context to determine the correct one. "
    "The correct code is often near the top or bottom of the page."
)

def _normalize_llm_code(raw: str) -> str:
    """Reduce a raw model reply to a single-spaced code string, or "NONE".

    Whitespace runs collapse to single spaces and surrounding quotes/backticks
    are dropped. An empty reply, or "NONE" in any letter case (optionally with
    a trailing period), maps to the sentinel "NONE".
    """
    cleaned = " ".join(raw.split()).strip("`'\"").strip()
    if not cleaned or cleaned.upper().rstrip(".") == "NONE":
        return "NONE"
    return cleaned


async def refine_iso_codes_with_llm(
    pdf_path: str,
    initial_codes: Dict[str, List[Dict[str, Any]]],
    model_name: str = "gpt-5-mini",
) -> Dict[str, List[Dict[str, Any]]]:
    """Resolve pages with a missing or ambiguous regex result by asking an LLM.

    Pages for which the regex stage recorded exactly one distinct code keep that
    code and its original snippet. Every other page (as selected by
    ``pages_to_fix_from_initial_output``) has its full text sent to the model,
    one page at a time, and the reply is taken as that page's code.

    Args:
        pdf_path: Path of the PDF that ``initial_codes`` was extracted from.
        initial_codes: Output of the regex stage: code -> list of
            ``{"page": ..., "snippet": ...}`` occurrences.
        model_name: Chat model name passed to ``ChatOpenAI``.

    Returns:
        A plain dict in the same code -> occurrences shape as ``initial_codes``,
        with at most one code per page. Pages decided by the LLM carry an empty
        snippet; pages for which the model answered NONE (or gave an empty
        reply) are omitted.
    """
    page_to_codes = build_page_to_codes_map(initial_codes)

    pages_to_fix = pages_to_fix_from_initial_output(pdf_path, initial_codes)
    pages_needing_llm = set(pages_to_fix)

    print(f"Using LLM to check pages: {pages_to_fix}")

    # First regex snippet recorded for each (page, code) pair, built once so the
    # final conversion does not rescan every occurrence for every page.
    first_snippet: Dict[Tuple[Any, str], str] = {}
    for orig_code, occurrences in initial_codes.items():
        for occ in occurrences:
            first_snippet.setdefault((occ["page"], orig_code), occ.get("snippet", ""))

    final_page_codes: Dict[int, str] = {}

    with fitz.open(pdf_path) as doc:
        all_pages = set(range(1, len(doc) + 1))

        # Pages already clean: exactly one distinct code was found by regex.
        for page_num in sorted(all_pages - pages_needing_llm):
            final_page_codes[page_num] = page_to_codes[page_num][0]

        # Only build the client when there is work for it, so a document fully
        # resolved by regex does not need model credentials.
        if pages_to_fix:
            llm = ChatOpenAI(
                model=model_name,
                temperature=0.0,  # ask for deterministic output
            )

            for page_num in pages_to_fix:
                page_text = doc[page_num - 1].get_text("text") or ""

                prompt = base_prompt + "\nPage text:\n" + page_text

                response = await llm.ainvoke(prompt)
                raw = getattr(response, "content", str(response))
                print(f"Page num: {page_num}, LLM raw output: {raw}")

                final_page_codes[page_num] = _normalize_llm_code(raw)

    # Convert from page->code format to code->pages format (same as initial_codes)
    result: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

    for page_num, code in sorted(final_page_codes.items()):
        if code == "NONE":
            continue

        # Only pages that were not sent to the LLM keep their regex snippet.
        if page_num in pages_needing_llm:
            snippet = ""
        else:
            snippet = first_snippet.get((page_num, code), "")

        result[code].append({
            "page": page_num,
            "snippet": snippet
        })

    return dict(result)


# -------------------------------------------------------------------
# Example usage
# -------------------------------------------------------------------
# pdf_path = "your_policy.pdf"
# initial = extract_iso_codes_from_headers_footers(pdf_path)
# refined = await refine_iso_codes_with_llm(pdf_path, initial)  # async: must be awaited
# refined

In [None]:
import time

# Example usage: run the regex-only stage on a sample PDF and time it.
pdf_path = "examples/general-liability-forms.pdf"

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
initial = extract_iso_codes_from_headers_footers(pdf_path)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"Regex extraction took {elapsed_time:.2f} seconds, gave {len(initial)} codes")
initial

In [None]:

start_time = time.time()
result = await refine_iso_codes_with_llm(pdf_path, initial) 
end_time = time.time()
elapsed_time = end_time - start_time

print(f"LLM refinement took {elapsed_time:.2f} seconds, gave {len(result)} codes")

result

In [None]:
# if results look good and have all the correct codes, we can store them as ground truth
# leave it commented out so that we don't accidentally store results without checking them first

#import json
#with open("examples/Michigan-Hospitality-Liability-Forms-gt.json", "w") as f:
#    json.dump(result, f, indent=2)

In [None]:
# This code allows you to read in the json ground truth in case you want to inspect it or compare
import json

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding.
with open("examples/general-liability-forms-gt.json", "r", encoding="utf-8") as f:
    ground_truth = json.load(f)

len(ground_truth)