In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import base64
import pymupdf
from openai import OpenAI

from pricing import calculate_cost_response
from models import Page, PageResponse

In [5]:
# Client is built with no arguments; presumably credentials come from the
# environment — confirm for your setup.
openai_client = OpenAI()
# NOTE(review): absolute, machine-specific path — this cell fails on any other
# machine. Every later cell reads pages from this `doc` object.
doc = pymupdf.open(r"C:\Users\alexe\Downloads\books\math-engineers.pdf")

In [6]:
def to_base64_url(page, zoom=1.5):
    """Render a page to PNG and return it as a base64 ``data:`` URL.

    Args:
        page: Object exposing ``get_pixmap(matrix=...)`` (a page taken from
            the opened PyMuPDF document).
        zoom: Scale factor applied to both axes when rasterising. The default
            of 1.5 matches the previously hard-coded value.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.
    """
    # Same factor on both axes keeps the page's aspect ratio.
    matrix = pymupdf.Matrix(zoom, zoom)

    pixmap = page.get_pixmap(matrix=matrix)
    png_bytes = pixmap.tobytes(output='png')

    b64 = base64.b64encode(png_bytes).decode("utf-8")
    return f"data:image/png;base64,{b64}"

In [30]:
# System prompt passed as the "system" message by extract_page_information.
# The text is sent to the model verbatim (after .strip()), so any edit here
# changes extraction behaviour.
instructions = """
You are extracting a single textbook page into structured page blocks.

Hard requirements:
- Extract ALL text and ALL formulas verbatim (no paraphrasing).
- Use LaTeX for ALL math.
- Use `$...$` for inline math.
- Use EquationBlock for display (block) equations.

Do not skip any content. If something cannot be recognized, include a placeholder like
`[UNRECOGNIZED TEXT]` or `[ILLEGIBLE SYMBOL]` at the correct position in reading order.

Empty pages:
- If the page contains no readable text and no meaningful graphics/tables/figures,
  output `blocks: []`.
- Do not hallucinate page numbers, headings, or content on empty pages.

Page number rules:
- Only set Page.page_number if a page number is explicitly present on the page.
- If no page number is visible, set Page.page_number = null.
- Do NOT infer page number from file index, PDF page index, or surrounding pages.

Extraction rules:
1) Preserve human reading order. The blocks list must match the order a human reads the page.
2) Do NOT include OCR or layout details (no coordinates, fonts, line breaks, hyphenation, scan artifacts).
3) Prefer fewer, larger TextBlocks, but never drop content to achieve this.
   - Avoid consecutive TextBlocks: if two adjacent TextBlocks would be consecutive in reading order
     and have no intervening non-text content, merge them into one TextBlock.
4) Headings must be SectionHeadingBlock only; do not include body text in headings.
5) Math:
   - Use LaTeX for all math.
   - Treat an equation as a block equation (EquationBlock) if it is on its own line OR
     if there is little surrounding prose (e.g., only a short label or intro phrase).
6) Figures:
   - FigureBlock.description must explain what the figure conveys conceptually
     (relationships, variables, trends), not how it is positioned or styled.
7) Tables:
   - TableBlock must capture semantic columns/rows.
   - Include units in column names if shown.
8) Store the running page header (if any) in Page.header (not in blocks).

Block field completeness:
- Do not output null for required descriptive fields.
  If a description is required by the schema but cannot be determined, write
  a placeholder like `[DESCRIPTION NOT AVAILABLE]` instead of null.

Uncertainty:
- Make a best-faith extraction.
- Do not invent missing words, numbers, or equations; use placeholders instead.
""".strip()

def extract_page_information(page, detail="low", model_name="gpt-4o-mini") -> PageResponse:
    """Send one rendered page to the model and return the parsed result with its cost.

    Args:
        page: Page object accepted by ``to_base64_url``.
        detail: Value forwarded as the image part's ``"detail"`` field.
        model_name: Model identifier forwarded to ``responses.parse``.

    Returns:
        A ``PageResponse`` holding the response's ``output_parsed`` value and
        the cost computed by ``calculate_cost_response``.
    """
    # The user turn carries only the page image; all guidance lives in the
    # system prompt.
    user_content = [
        {
            "type": "input_image",
            "image_url": to_base64_url(page),
            "detail": detail,
        }
    ]

    response = openai_client.responses.parse(
        model=model_name,
        input=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_content},
        ],
        text_format=Page,
    )

    usage_cost = calculate_cost_response(response)
    return PageResponse(page=response.output_parsed, cost=usage_cost)

In [32]:
from pathlib import Path

# Directory that receives one page_NNN.json file per processed page index.
output = Path('output/')

In [33]:
from tqdm.auto import tqdm

In [40]:
def process_document(page_number) -> bool:
    """Extract one page of ``doc`` and cache the result as JSON under ``output``.

    The result is stored as ``page_NNN.json`` (zero-padded page index). A page
    whose file already exists is skipped, so an interrupted run can be resumed.

    Args:
        page_number: Index into ``doc`` of the page to process.

    Returns:
        True if the page file already existed or was written successfully,
        False if any exception occurred (the error is printed, not raised).
    """
    try:
        page_file = output / f'page_{page_number:03d}.json'

        if page_file.exists():
            print(f'{page_file} already processed')
            return True

        page = doc[page_number]
        page_response = extract_page_information(page)
        page_json = page_response.model_dump_json(indent=2)

        # Create the output directory on first use so a fresh checkout works.
        page_file.parent.mkdir(parents=True, exist_ok=True)

        # Write to a sibling temp file and rename it into place: an interrupted
        # write must not leave a truncated page_NNN.json that the exists()
        # check above would later treat as already processed.
        tmp_file = page_file.with_name(page_file.name + '.tmp')
        tmp_file.write_text(page_json, encoding='utf-8')
        tmp_file.replace(page_file)

        return True
    except Exception as e:
        # Best-effort by design: report which page failed and let the batch continue.
        print(f'page {page_number} failed: {e!r}')
        return False

In [42]:
# Run a single page (index 110) through the pipeline; the call returns True
# when the page was written or its JSON file already exists.
process_document(110)

True

In [44]:
from concurrent.futures import ThreadPoolExecutor

def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` with a progress bar.

    Args:
        pool: Executor exposing ``submit`` (e.g. a ``ThreadPoolExecutor``).
        seq: Iterable of inputs. It is materialised once, so generators and
            other unsized iterables are accepted.
        f: Callable taking one element of ``seq``.

    Returns:
        List of results in the same order as ``seq``. An exception raised by
        ``f`` propagates from the corresponding ``future.result()`` call.
    """
    # len() is needed for the bar's total; copying also lets one-shot
    # iterators be used.
    items = list(seq)

    with tqdm(total=len(items)) as progress:
        futures = []

        for el in items:
            future = pool.submit(f, el)
            # Tick the bar as each task completes, regardless of input order.
            future.add_done_callback(lambda _done: progress.update())
            futures.append(future)

        # Collect in submission order so results line up with the inputs.
        return [future.result() for future in futures]

In [48]:
from concurrent.futures import ThreadPoolExecutor

In [50]:
# NOTE(review): every worker thread reads pages from the shared `doc`; PyMuPDF
# documents that it does not support multithreaded use — confirm this is safe
# or render pages serially before fanning out the API calls.
with ThreadPoolExecutor(max_workers=4) as pool:
    pages_seq = list(range(len(doc)))
    page_results = map_progress(pool, pages_seq, process_document)

# process_document signals failure by returning False; collect those pages
# instead of discarding the results so failures are not lost in the log.
failed_pages = [n for n, ok in zip(pages_seq, page_results) if not ok]
if failed_pages:
    print(f'{len(failed_pages)} page(s) failed: {failed_pages}')

 56%|████████████████████████████████▌                         | 248/442 [00:00<00:00, 2373.95it/s]

output\page_000.json already processedoutput\page_001.json already processed

output\page_002.json already processed
output\page_003.json already processed
output\page_004.json already processed
output\page_007.json already processed
output\page_006.json already processed
output\page_005.json already processed
output\page_008.json already processed
output\page_009.json already processed
output\page_010.json already processed
output\page_011.json already processed
output\page_012.json already processed
output\page_014.json already processed
output\page_015.json already processed
output\page_013.json already processed
output\page_016.json already processed
output\page_017.json already processed
output\page_018.json already processed
output\page_019.json already processed
output\page_020.json already processed
output\page_021.json already processed
output\page_022.json already processed
output\page_023.json already processed
output\page_024.json already processed
output\page_027.json alre

100%|████████████████████████████████████████████████████████████| 442/442 [00:14<00:00, 30.48it/s]
