## Render a PDF page as an image

In [None]:
import fitz  # PyMuPDF

# Render a single PDF page to a PNG file at a chosen resolution.

# Page to render, as a 0-based index (58 is the 59th page of the PDF).
extract_num = 58
# Target resolution; it drives both the render scale and the output file name.
dpi = 300

# Open the PDF; the context manager closes it even if rendering fails.
with fitz.open('data/G12_MACHU14_TLD_1217.pdf') as pdf_document:
    page = pdf_document[extract_num]

    # PDF user space is 72 units per inch, so dpi / 72 is the scale factor
    # needed to rasterize at the requested resolution.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(
        matrix=mat,
        alpha=False,  # No transparency for smaller file size
        colorspace=fitz.csRGB,  # Ensure RGB color space
    )

    # Encode while the document is still open; the result is plain bytes.
    img_data = pix.tobytes("png")

# Save as PNG; the name reflects the resolution actually used.
with open(f'page_{extract_num}_{dpi}dpi.png', 'wb') as f:
    f.write(img_data)


### Parse text

In [None]:
import fitz  # PyMuPDF

# Print the text of one PDF page two ways: per span with font details,
# and per block with bounding-box coordinates.

# Page to inspect, as a 0-based index (58 is the 59th page of the PDF).
extract_num = 58

# Open the PDF; the context manager closes it even if extraction fails.
with fitz.open('data/G12_MACHU14_TLD_1217.pdf') as pdf_document:
    page = pdf_document[extract_num]

    # Span-level extraction: text together with font, size and position.
    text_dict = page.get_text("dict")
    print("\nText with formatting:")
    for block in text_dict["blocks"]:
        if "lines" not in block:
            # Only blocks carrying "lines" hold text; skip the rest.
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                print(f"Text: {span['text']}")
                print(f"Font: {span['font']}, Size: {span['size']}")
                print(f"Position: {span['bbox']}")

    # Block-level extraction with coordinates (useful for layout preservation).
    text_blocks = page.get_text("blocks")
    for block in text_blocks:
        print(f"Text block: {block[4]}")  # block[4] contains the text
        print(f"Coordinates: {block[:4]}")  # x0, y0, x1, y1

### Vectors, lines

In [None]:
import fitz  # PyMuPDF
import json

# Dump the vector drawings of one PDF page to stdout and export the page as SVG.

# Page to inspect, as a 0-based index.
extract_num = 58

# Open the PDF; the context manager closes it even if extraction fails.
with fitz.open('data/G12_MACHU14_TLD_1217.pdf') as pdf_document:
    page = pdf_document[extract_num]

    # Drawing commands (vector paths) found on the page.
    drawings = page.get_drawings()
    print(f"Found {len(drawings)} drawing objects")

    for i, drawing in enumerate(drawings):
        items = drawing.get('items', [])
        print(f"\nDrawing {i}:")
        print(f"  Fill: {drawing.get('fill')}")
        print(f"  Stroke: {drawing.get('stroke')}")
        print(f"  Width: {drawing.get('width')}")
        print(f"  Items: {len(items)}")

        # Each item starts with a command code followed by its points.
        for item in items:
            print(f"    Command: {item[0]}, Points: {item[1:]}")

    # Whole page as SVG text.
    svg_text = page.get_svg_image()

# Write with an explicit encoding so non-ASCII text in the SVG is not
# subject to the platform's default codec.
with open(f'page_{extract_num}_vectors.svg', 'w', encoding='utf-8') as f:
    f.write(svg_text)

In [14]:
import fitz
import os

# Prepare the output folder and load the drawings of the page under study.
output_dir = "Output/drawing"
os.makedirs(output_dir, exist_ok=True)

# Page to inspect, as a 0-based index; later code in this cell reuses it.
extract_num = 58

# Only plain drawing data is kept, so the document can be closed right away
# instead of staying open until the name is rebound further down.
with fitz.open('data/G12_MACHU14_TLD_1217.pdf') as pdf_document:
    page = pdf_document[extract_num]
    drawings = page.get_drawings()

# Optional debugging aid: uncomment to inspect the first few drawings in detail.
# for i, drawing in enumerate(drawings[:5]):
#     print(f"\n=== Drawing {i} ===")
#     print(f"Raw drawing object: {drawing}")
#     print(f"Type: {type(drawing)}")
#     print(f"Keys available: {list(drawing.keys()) if hasattr(drawing, 'keys') else 'No keys method'}")
#
#     # Check each key-value pair
#     for key in drawing:
#         print(f"  {key}: {drawing[key]} (type: {type(drawing[key])})")
#
#     items = drawing.get('items', [])
#     print(f"Items count: {len(items)}")
#
#     if items:
#         print("First few items:")
#         for j, item in enumerate(items[:3]):
#             print(f"  Item {j}: {item} (type: {type(item)})")

import fitz
import os

# Render every drawing of the page to its own PNG: the drawing's line
# segments are redrawn in black on a blank, page-sized white canvas.
# Relies on extract_num, which is assigned earlier in this cell.
output_dir = "Output"
os.makedirs(output_dir, exist_ok=True)

# The render scale is the same for every drawing, so build the matrix once.
zoom = 2.0
mat = fitz.Matrix(zoom, zoom)

# Context managers close the source and each scratch document even if a
# drawing fails to render or save.
with fitz.open('data/G12_MACHU14_TLD_1217.pdf') as pdf_document:
    page = pdf_document[extract_num]
    drawings = page.get_drawings()
    page_width = page.rect.width
    page_height = page.rect.height

    print(f"Processing {len(drawings)} drawings...")

    for i, drawing in enumerate(drawings):
        # Scratch single-page document matching the source page size.
        with fitz.open() as temp_doc:
            temp_page = temp_doc.new_page(width=page_width, height=page_height)

            # White background
            temp_page.draw_rect(temp_page.rect, fill=(1, 1, 1))

            shape = temp_page.new_shape()

            # Only line commands ('l') are redrawn; any other item is skipped.
            for item in drawing.get('items', []):
                if item[0] == 'l':
                    start_point, end_point = item[1], item[2]
                    shape.draw_line(start_point, end_point)

            # Stroke in black with a visible line width.
            shape.finish(
                color=(0, 0, 0),
                width=1.0,
            )
            shape.commit()

            # Rasterize and save before the scratch document is closed.
            pix = temp_page.get_pixmap(matrix=mat, alpha=False)
            pix.save(f'{output_dir}/drawing_{i:03d}.png')

        # Progress update
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1}/{len(drawings)} drawings")

print(f"Completed! Saved {len(drawings)} drawings to {output_dir}/")

Processing 235 drawings...
Processed 50/235 drawings
Processed 100/235 drawings
Processed 150/235 drawings
Processed 200/235 drawings
Completed! Saved 235 drawings to Output/


In [None]:
# Display the page index currently bound to extract_num (assigned in earlier cells).
extract_num