In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up live instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import base64
import os

import pymupdf
from openai import OpenAI

from models import Page, PageResponse
from pricing import calculate_cost_response

In [5]:
# Location of the textbook PDF.  Set the BOOK_PDF_PATH environment variable to
# point the notebook at a different file; when it is unset the original
# machine-specific path is used, so existing behaviour is unchanged.
BOOK_PDF_PATH = os.environ.get(
    "BOOK_PDF_PATH",
    r"C:\Users\alexe\Downloads\books\math-engineers.pdf",
)

# No credentials are passed explicitly here.
# NOTE(review): the client is presumably configured from the environment
# (e.g. OPENAI_API_KEY) -- confirm against the OpenAI SDK setup in use.
openai_client = OpenAI()

# Open the PDF; `doc` is iterated page by page further down the notebook.
doc = pymupdf.open(BOOK_PDF_PATH)

In [6]:
def to_base64_url(page, zoom: float = 1.5) -> str:
    """Render a PDF page to PNG and return it as a base64 ``data:`` URL.

    Args:
        page: Object exposing ``get_pixmap(matrix=...)``; in this notebook a
            page taken from the document opened with ``pymupdf.open``.
        zoom: Scale factor passed for both arguments of ``pymupdf.Matrix``
            when rendering the page.  Defaults to 1.5, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.

    Raises:
        ValueError: If ``zoom`` is not strictly positive.
    """
    if zoom <= 0:
        raise ValueError(f"zoom must be positive, got {zoom!r}")

    # Same factor on both axes keeps the page's aspect ratio.
    matrix = pymupdf.Matrix(zoom, zoom)
    pixmap = page.get_pixmap(matrix=matrix)
    png_bytes = pixmap.tobytes(output='png')

    # base64 output is pure ASCII, so decoding it as text is lossless.
    encoded = base64.b64encode(png_bytes).decode("ascii")
    return f"data:image/png;base64,{encoded}"

In [7]:
# System prompt sent with every page image (see extract_page_information,
# where it is the "system" message and `Page` is the requested output
# format).  It is stripped of leading/trailing whitespace before use.
instructions = """
You are extracting a textbook page into structured page blocks.

text and formulas should be extracted verbatim.

use latex for all math.
Use "$" for inline equation and EquationBlock type for block equations.

some inline equations should be treated as block equations
if there's little text around them.

important: don't skip any text. if something is not possible to 
recognize, include a placeholder

Extraction rules:
1) Preserve reading order. The blocks list must match the order a human reads the page.
2) Do NOT include OCR or layout details (no coordinates, fonts, line breaks, or scan artifacts).
3) Prefer fewer, larger TextBlocks over many tiny ones. Group adjacent paragraphs when they belong together.
4) Use LaTeX for all math in EquationBlock.latex.
5) Section headings must be SectionHeadingBlock only; do not include body text in them.
6) FigureBlock.description should explain what the figure conveys conceptually (graphs, curves, relationships),
   not how it looks on the page.
7) TableBlock should capture semantic columns and rows. Include units in column names if shown.
8) Store the running page header (if any) in Page.header.
9) If uncertain, make a best-faith concise extraction; do not invent content.
""".strip()

def extract_page_information(page, detail="low", model_name="gpt-4o-mini") -> PageResponse:
    """Send one rendered page to the model and return the parsed result.

    The page is converted to a base64 PNG data URL with ``to_base64_url`` and
    submitted, together with the module-level ``instructions`` prompt, through
    ``openai_client.responses.parse`` with ``Page`` as the output format.

    Args:
        page: Page object accepted by ``to_base64_url``.
        detail: Value placed in the image part's ``"detail"`` field.
        model_name: Model identifier forwarded as ``model``.

    Returns:
        A ``PageResponse`` whose ``page`` is ``response.output_parsed`` and
        whose ``cost`` is the value of ``calculate_cost_response(response)``.
    """
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "input_image",
                "image_url": to_base64_url(page),
                "detail": detail,
            }
        ],
    }
    system_message = {"role": "system", "content": instructions}

    response = openai_client.responses.parse(
        model=model_name,
        input=[system_message, user_message],
        text_format=Page,
    )

    # Cost is computed before the result object is assembled, as before.
    usage_cost = calculate_cost_response(response)
    return PageResponse(page=response.output_parsed, cost=usage_cost)

In [8]:
# Run the extraction on a single page (index 100) with the default
# detail/model settings; the result is kept in `page_response`.
page_response = extract_page_information(doc[100])

In [14]:
from pathlib import Path

# Directory that receives one JSON file per extracted page.
output = Path('output/')

# Create it up front: writing a file into a directory that does not exist
# raises FileNotFoundError, which would abort the extraction loop on its very
# first page.  exist_ok=True makes re-running this cell harmless.
output.mkdir(parents=True, exist_ok=True)

In [25]:
from tqdm.auto import tqdm

In [None]:
# Extract every page of the document into its own JSON file.  The loop is
# resumable: a page whose output file already exists is skipped, so an
# interrupted run can be restarted without redoing finished pages.
for page_number, page in enumerate(tqdm(doc)):
    page_file = output / f'page_{page_number:03d}.json'

    if page_file.exists():
        continue

    page_response = extract_page_information(page)
    page_json = page_response.model_dump_json(indent=2)

    # Write to a sibling temp file and rename it into place.  The skip check
    # above trusts mere existence, so a write that is interrupted half-way
    # (kernel interrupt, full disk) must never leave a truncated page_file
    # behind -- that page would otherwise be skipped on every later run.
    partial_file = page_file.with_name(page_file.name + '.tmp')
    partial_file.write_text(page_json, encoding='utf-8')
    partial_file.replace(page_file)

  0%|          | 0/442 [00:00<?, ?it/s]