# In [27]:
def sort_blocks(blocks):
    """Return a new list of *blocks* in top-to-bottom, left-to-right order.

    Ordering uses each block's ``"bbox"``: the second coordinate (y) is the
    primary key and the first coordinate (x) breaks ties. Coordinates are
    compared exactly, so this is a plain row-major ordering; the input
    iterable is left untouched.
    """
    def reading_position(block):
        bbox = block["bbox"]
        return bbox[1], bbox[0]

    ordered = list(blocks)
    ordered.sort(key=reading_position)
    return ordered

def is_table_block(block, space_ratio=0.2):
    """Guess whether a text block is a table from its share of spaces.

    The text of every line is rebuilt by joining its spans with a single
    space; the block is reported as a table when the average number of
    spaces per line exceeds ``space_ratio`` times the average line length.

    Args:
        block: Text block dict with an optional ``"lines"`` list; every line
            must provide ``"spans"``, each span a ``"text"`` string.
        space_ratio: Fraction of a line that must be spaces before the block
            counts as a table. The default of 0.2 (20%) keeps the original
            behaviour.

    Returns:
        ``True`` if the block looks like a table. Blocks with fewer than two
        lines are never treated as tables.
    """
    lines = block.get("lines", [])
    if len(lines) < 2:
        return False

    # Build each line's text once; both averages are derived from it.
    line_texts = [
        " ".join(span["text"] for span in line["spans"]) for line in lines
    ]
    avg_chars = sum(len(text) for text in line_texts) / len(line_texts)
    avg_spaces = sum(text.count(" ") for text in line_texts) / len(line_texts)
    return avg_spaces > avg_chars * space_ratio

def map_font_sizes_to_headers(doc, min_header_size=12):
    """Map every font size used in *doc* to a Markdown header prefix.

    All span sizes of the text blocks (``type == 0``) on every page are
    collected, rounded to one decimal and ranked from largest to smallest.
    The largest size gets ``"#"``, the next ``"##"`` and so on.

    Args:
        doc: Iterable of pages; each page must support
            ``page.get_text("dict")`` returning a dict with a ``"blocks"``
            list.
        min_header_size: Sizes below this value are mapped to ``""`` (body
            text, no header). The default of 12 keeps the original
            behaviour.

    Returns:
        Dict mapping rounded font size to a prefix of one to six ``#``
        characters, or to ``""`` for sizes below ``min_header_size``.
    """
    # Markdown only defines heading levels 1-6; a longer run of '#' would be
    # rendered as literal text, so deeper ranks are clamped to level 6.
    max_header_level = 6

    sizes = set()
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    sizes.add(round(span["size"], 1))

    header_map = {}
    # Rank counts every distinct size, including those below the threshold,
    # exactly as before; only the upper bound on the level is new.
    for rank, size in enumerate(sorted(sizes, reverse=True), start=1):
        if size < min_header_size:
            header_map[size] = ""  # no header
        else:
            header_map[size] = "#" * min(rank, max_header_level)
    return header_map


# In [28]:
import fitz  # PyMuPDF
import re,io,os
from PIL import Image

# PDF to convert; a relative path, so it is resolved against the current
# working directory.
pdf_path = "test_pdf1.pdf"
# Open the document once at module level; the header-size mapping and the
# page loop in the following cells both iterate over `doc`.
doc = fitz.open(pdf_path)



# In [32]:
# --- Conversion settings -------------------------------------------------
file_name = "hr.iu"                  # prefix of every extracted image file
md_output = "temp.md"                # Markdown file that is (over)written
image_folder = "extracted_images"    # directory receiving the PNG files
header_map = map_font_sizes_to_headers(doc)

# Resolution used when rasterising image regions of a page.
DPI = 150

# NOTE(review): only page 7 is converted, which looks like a debugging
# leftover -- set this to None to convert every page.
target_pages = {7}

# Runs of 2+ whitespace characters are treated as table column gaps.
_COLUMN_GAP_RE = re.compile(r"\s{2,}")
# Collapses runs of four or more pipes back to one "|||" separator.
_PIPE_RUN_RE = re.compile(r"\|\|\|\|+")


def _table_block_to_text(block):
    """Render a table-like block: one row per line, cells separated by '|||'."""
    rows = []
    for line in block.get("lines", []):
        row = " ".join(span["text"] for span in line["spans"])
        row = _COLUMN_GAP_RE.sub("|||", row)
        row = _PIPE_RUN_RE.sub("|||", row).strip()
        if row:
            rows.append(row)
    return "\n".join(rows)


def _text_block_to_markdown(block, header_map):
    """Render a regular text block as Markdown, one paragraph per PDF line.

    Spans whose font name contains "bold" are wrapped in ``**``; bold spans
    whose size has a header prefix in *header_map* also get that prefix.
    """
    paragraphs = []
    for line in block.get("lines", []):
        parts = []
        for span in line["spans"]:
            text = span["text"].strip()
            if not text:
                continue
            is_bold = "bold" in span.get("font", "").lower()
            if is_bold:
                text = f"**{text}**"
            header_prefix = header_map.get(round(span.get("size", 0), 1), "")
            # The length is measured after the ** markers were added.
            if header_prefix and len(text) > 2 and is_bold:
                text = f"{header_prefix} {text}"
            parts.append(text)
        line_text = " ".join(parts).strip()
        # Test emptiness BEFORE adding any newline; otherwise the check is
        # always true and empty lines leak into the output as blank lines.
        if line_text:
            paragraphs.append(line_text)
    return "\n\n".join(paragraphs) + "\n"


def _save_image_region(page, rects, image_filename, dpi):
    """Rasterise the bounding box of *rects* on *page* into an RGB PNG file."""
    merged_rect = fitz.Rect(
        min(rect.x0 for rect in rects),
        min(rect.y0 for rect in rects),
        max(rect.x1 for rect in rects),
        max(rect.y1 for rect in rects),
    )
    pix = page.get_pixmap(clip=merged_rect, dpi=dpi)
    if pix.alpha:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    pil_img = Image.open(io.BytesIO(pix.tobytes("png")))
    if pil_img.mode != 'RGB':
        pil_img = pil_img.convert('RGB')
    pil_img.save(image_filename, 'PNG')


# Image.save() does not create missing directories.
os.makedirs(image_folder, exist_ok=True)

with open(md_output, "w", encoding="utf-8") as f_md:

    for page_num, page in enumerate(doc, start=1):
        if target_pages is not None and page_num not in target_pages:
            continue
        blocks = sort_blocks(page.get_text("dict")["blocks"])

        # Despite the name this holds the "number" field of image blocks
        # that were already queued (presumably the block number, not a PDF
        # xref -- verify against the PyMuPDF dict layout).
        used_xrefs = set()
        # Image blocks seen since the last emitted text: (number, rect).
        current_image_block = []

        for block_id, b in enumerate(blocks):
            # --- TEXT BLOCK ---
            if b["type"] == 0:
                if is_table_block(b):
                    line_out = _table_block_to_text(b)
                else:
                    line_out = _text_block_to_markdown(b, header_map)

                # Blocks with three or fewer visible characters are dropped.
                has_text = len(line_out.strip()) > 3

                line_out_img = ""
                if current_image_block and has_text:
                    # Pending images are merged into one picture and placed
                    # directly above the text that follows them.
                    image_name = f"{file_name}_page{page_num}_block{block_id}.png"
                    _save_image_region(
                        page,
                        [rect for _, rect in current_image_block],
                        os.path.join(image_folder, image_name),
                        DPI,
                    )
                    line_out_img = f"![imageurl:{image_name}]\n\n"
                    current_image_block = []

                f_md.write("\n")

                if has_text:
                    f_md.write(line_out_img)
                    f_md.write(line_out)

            # --- IMAGE BLOCK ---
            elif b["type"] == 1:
                if "image" in b:
                    xref = b.get('number', ())
                    if xref in used_xrefs:
                        continue
                    img_rects = fitz.Rect(*b.get('bbox', ()))
                    current_image_block.append((xref, img_rects))
                    used_xrefs.add(xref)

            else:
                print("other type", b["type"], b)

        # Flush images that were not followed by any text. This must not
        # depend on the last text block: `line_out` is undefined on pages
        # without text, and a short trailing text would drop the images.
        if current_image_block:
            image_name = f"{file_name}_page{page_num}_block_{block_id}.png"
            _save_image_region(
                page,
                [rect for _, rect in current_image_block],
                os.path.join(image_folder, image_name),
                DPI,
            )
            f_md.write(f"![imageurl:{image_name}]\n\n")

        f_md.write("\n---\n\n")  # page separator
