# Document Preprocessing

In [None]:
import fitz  # pip install PyMuPDF

# Accumulator for extracted blocks: one dict per text or image block,
# filled in by the extraction loop.
content = []

# Source PDF to preprocess. The handle is kept open at module level so
# later cells can read pages from it.
doc = fitz.open("Papers/hoac009.pdf")

In [None]:
import os

# Image blocks are written into this folder. Create it up front so the
# open(..., "wb") call below does not fail with FileNotFoundError when the
# directory is missing (e.g. on a fresh checkout).
os.makedirs("images", exist_ok=True)

# Walk every page and record each block as a dict with the keys
# "type", "content", "bbox" and "page_num" (0-based page index).
for page_num, page in enumerate(doc):
    for block in page.get_text("dict")["blocks"]:
        if block["type"] == 0:  # text
            # Flatten all spans of all lines into one space-separated string.
            span_texts = (
                span["text"]
                for line in block["lines"]
                for span in line["spans"]
            )
            content.append({
                "type": "text",
                "content": " ".join(span_texts).strip(),
                "bbox": block["bbox"],
                "page_num": page_num,
            })
        elif block["type"] == 1:  # image
            # Persist the raw image bytes to disk and store only the path.
            image_ext = block["ext"]
            image_filename = f"images/image_{page_num}_{block['number']}.{image_ext}"
            with open(image_filename, "wb") as img_file:
                img_file.write(block["image"])
            content.append({
                "type": "image",
                "content": image_filename,
                "bbox": block["bbox"],
                "page_num": page_num,
            })

In [None]:
def _reading_order(entry):
    """Return the sort key (page number, y0 of the bbox's top-left corner)."""
    page_index = entry["page_num"]
    top_edge = entry["bbox"][1]
    return (page_index, top_edge)


# Order the collected blocks by page, then by vertical position on the page.
content.sort(key=_reading_order)

# Dump every entry; entries whose type is neither "text" nor "image" are skipped.
for entry in content:
    kind = entry["type"]
    if kind == "text":
        print(f"Text: {entry['content']}")
    elif kind == "image":
        print(f"Image: {entry['content']}")