In [1]:
import base64
import os

from docling.document_converter import DocumentConverter

def save_text_and_tables(document, output_dir):
    """Write the document's Markdown export to ``text_and_tables.md``.

    Args:
        document: Object exposing ``export_to_markdown()`` that returns a string.
        output_dir: Existing directory in which the Markdown file is created
            (an existing file of the same name is overwritten).

    Returns:
        None. The destination path is reported on stdout.
    """
    rendered = document.export_to_markdown()
    target_path = os.path.join(output_dir, 'text_and_tables.md')

    # UTF-8 is forced so the result does not depend on the platform default.
    with open(target_path, mode='w', encoding='utf-8') as handle:
        handle.write(rendered)

    print(f"Text and tables saved to: {target_path}")

def _collect_picture_items(node, found=None):
    """Recursively gather every dict labelled 'picture' that has an 'image' key.

    Matches are appended in pre-order (a dict is recorded before its values
    are searched), and the search continues inside matched dicts.
    """
    if found is None:
        found = []
    if isinstance(node, dict):
        if node.get('label') == 'picture' and 'image' in node:
            found.append(node)
        for child in node.values():
            _collect_picture_items(child, found)
    elif isinstance(node, list):
        for child in node:
            _collect_picture_items(child, found)
    return found


def _decode_image_content(content):
    """Convert an exported image payload into the raw bytes to write to disk.

    ``bytes`` are returned unchanged. Text is treated as base64, optionally
    wrapped in a ``data:<mime>;base64,<payload>`` URI. Text that is not valid
    base64 (e.g. inline SVG markup) is encoded with ``str.encode()`` as a
    fallback.
    """
    if isinstance(content, bytes):
        return content

    payload = content
    if payload.startswith('data:') and ',' in payload:
        # Drop the "data:<mime>;base64" header; keep only the payload.
        payload = payload.split(',', 1)[1]
    try:
        # Whitespace/newlines are legal in wrapped base64 but rejected by
        # validate=True, so strip them first.
        return base64.b64decode(''.join(payload.split()), validate=True)
    except ValueError:
        # binascii.Error is a ValueError: the text was not base64 after all.
        return content.encode()


def save_images(document, output_dir):
    """Save every picture found in the document's dict export as a file.

    The export is searched recursively for dicts with ``label == 'picture'``
    and an ``'image'`` entry. Each image is written to
    ``image_<n>.<ext>`` inside ``output_dir``, where ``<n>`` is the 1-based
    position among the found pictures and ``<ext>`` is the part of the MIME
    type after the last ``/``.

    The image entry is expected to provide ``mime_type`` and ``content``.
    As a fallback, ``mimetype`` and a ``data:`` ``uri`` are accepted too;
    NOTE(review): this is believed to match how docling-core serialises
    ``ImageRef`` -- confirm against the installed version.

    Text payloads are base64-decoded before writing (previously they were
    written as encoded text, which produced unreadable image files).

    Args:
        document: Object exposing ``export_to_dict()``.
        output_dir: Existing directory that receives the image files.

    Returns:
        None. Progress and per-image errors are reported on stdout.
    """
    images = _collect_picture_items(document.export_to_dict())

    print(f"Found {len(images)} potential images in the document.")

    for index, picture in enumerate(images, start=1):
        print(f"Processing image {index}:")
        info = picture['image']

        # Decide which key carries the payload; a non-dict entry (e.g. None)
        # is reported as malformed instead of raising outside the try block.
        if not isinstance(info, dict):
            payload_key = None
        elif 'content' in info:
            payload_key = 'content'
        elif isinstance(info.get('uri'), str) and info['uri'].startswith('data:'):
            payload_key = 'uri'
        else:
            payload_key = None

        if payload_key is None:
            print(f"  Image {index} doesn't have the expected structure.")
            continue

        # Best effort: a failure on one image is reported and must not stop
        # the remaining images from being saved.
        try:
            mime_type = info['mime_type'] if 'mime_type' in info else info['mimetype']
            file_extension = mime_type.split('/')[-1]
            image_path = os.path.join(output_dir, f'image_{index}.{file_extension}')
            image_content = _decode_image_content(info[payload_key])

            with open(image_path, 'wb') as f:
                f.write(image_content)
            print(f"  Image {index} saved to: {image_path}")
        except Exception as e:
            print(f"  Error saving image {index}: {str(e)}")

    if not images:
        print("No images found in the document.")

# Input PDF and the converter that will parse it
source = "a.pdf"
converter = DocumentConverter()

# Run the conversion
result = converter.convert(source)

# Debugging aid: show the top-level keys of the exported dictionary
doc_dict = result.document.export_to_dict()
print("Top-level keys in exported dictionary:", list(doc_dict))

# Debugging aid: one summary line per top-level entry
print("\nDocument structure:")
for key, value in doc_dict.items():
    if isinstance(value, dict):
        print(f"{key}: Dictionary with keys {list(value)}")
    elif isinstance(value, list):
        print(f"{key}: List with {len(value)} items")
    else:
        print(f"{key}: {type(value)}")

# Make sure the output directory exists before anything is written into it
output_dir = 'pdf_output'
os.makedirs(output_dir, exist_ok=True)

# Write the Markdown export first, then the extracted images
save_text_and_tables(result.document, output_dir)
save_images(result.document, output_dir)

print("PDF parsing and saving complete.")


Top-level keys in exported dictionary: ['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'pages']

Document structure:
schema_name: <class 'str'>
version: <class 'str'>
name: <class 'str'>
origin: Dictionary with keys ['mimetype', 'binary_hash', 'filename']
furniture: Dictionary with keys ['self_ref', 'children', 'name', 'label']
body: Dictionary with keys ['self_ref', 'children', 'name', 'label']
groups: List with 18 items
texts: List with 724 items
pictures: List with 2 items
tables: List with 49 items
key_value_items: List with 0 items
pages: Dictionary with keys ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60',