# Multi-format Conversion with Docling

In [1]:
# Install required libraries
!pip install docling pypdfium2 pyyaml









In [2]:
import json
import yaml
import logging
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Configure root logging at INFO level and grab a logger for this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input documents to convert (Markdown, PDF and PowerPoint samples).
paths = list(
    map(
        Path,
        (
            "README.md",
            "tests/daily-earnings-dashboard-us.pdf",
            "tests/NVIDIA-Corporation-2023-08-02.pptx",
            "tests/AR_2020_WEB2.pdf",
        ),
    )
)

# Destination folder for the exported files; reused if it is already there.
output_dir = Path("parsed")
output_dir.mkdir(exist_ok=True)

# Input formats the converter is allowed to accept.
supported_formats = [
    InputFormat.PDF,
    InputFormat.IMAGE,
    InputFormat.DOCX,
    InputFormat.HTML,
    InputFormat.PPTX,
    InputFormat.ASCIIDOC,
    InputFormat.MD,
]

# Explicit pipeline/backend choices, given for PDF and DOCX only.
pdf_option = PdfFormatOption(
    pipeline_cls=StandardPdfPipeline,
    backend=PyPdfiumDocumentBackend
)
docx_option = WordFormatOption(pipeline_cls=SimplePipeline)

# Build the Docling converter from the pieces above.
converter = DocumentConverter(
    allowed_formats=supported_formats,
    format_options={
        InputFormat.PDF: pdf_option,
        InputFormat.DOCX: docx_option,
    },
)

# Run conversions
results = converter.convert_all(paths)

# Export each result as Markdown, JSON and YAML, named after the input
# file's stem and written into output_dir.
for res in results:
    stem = res.input.file.stem
    logger.info("Converted %s", res.input.file.name)

    # Build the dict form once; the JSON and YAML exports both use it.
    doc_dict = res.document.export_to_dict()
    exports = {
        ".md": res.document.export_to_markdown(),
        ".json": json.dumps(doc_dict, indent=2),
        ".yaml": yaml.safe_dump(doc_dict, sort_keys=False),
    }

    for suffix, text in exports.items():
        # Write UTF-8 explicitly: without it write_text() uses the platform's
        # locale encoding, which can fail on (or mis-encode) non-ASCII text
        # in the converted documents.
        (output_dir / f"{stem}{suffix}").write_text(text, encoding="utf-8")

logger.info("All conversions complete. Check the 'parsed' folder.")

2025-09-24 16:20:00,579 - INFO - detected formats: [<InputFormat.MD: 'md'>]


2025-09-24 16:20:00,580 - INFO - Going to convert document batch...


2025-09-24 16:20:00,580 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e


2025-09-24 16:20:00,770 - INFO - Loading plugin 'docling_defaults'




2025-09-24 16:20:00,771 - INFO - Registered picture descriptions: ['vlm', 'api']


2025-09-24 16:20:00,771 - INFO - Processing document README.md


2025-09-24 16:20:00,812 - INFO - Finished converting document README.md in 0.23 sec.


2025-09-24 16:20:00,813 - INFO - Converted README.md


2025-09-24 16:20:00,834 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]


2025-09-24 16:20:00,837 - INFO - Going to convert document batch...


2025-09-24 16:20:00,837 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347


2025-09-24 16:20:00,844 - INFO - Loading plugin 'docling_defaults'




2025-09-24 16:20:00,847 - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']


2025-09-24 16:20:00,945 - INFO - Accelerator device: 'mps'


2025-09-24 16:20:02,969 - INFO - Accelerator device: 'mps'


2025-09-24 16:20:04,016 - INFO - Accelerator device: 'mps'


2025-09-24 16:20:04,386 - INFO - Processing document daily-earnings-dashboard-us.pdf


2025-09-24 16:20:15,296 - INFO - Finished converting document daily-earnings-dashboard-us.pdf in 14.48 sec.


2025-09-24 16:20:15,296 - INFO - Converted daily-earnings-dashboard-us.pdf


2025-09-24 16:20:15,986 - INFO - detected formats: [<InputFormat.PPTX: 'pptx'>]


2025-09-24 16:20:16,003 - INFO - Going to convert document batch...


2025-09-24 16:20:16,003 - INFO - Processing document NVIDIA-Corporation-2023-08-02.pptx


2025-09-24 16:20:16,581 - INFO - Finished converting document NVIDIA-Corporation-2023-08-02.pptx in 1.29 sec.


2025-09-24 16:20:16,584 - INFO - Converted NVIDIA-Corporation-2023-08-02.pptx


2025-09-24 16:20:18,201 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]


2025-09-24 16:20:18,208 - INFO - Going to convert document batch...


2025-09-24 16:20:18,209 - INFO - Processing document AR_2020_WEB2.pdf


2025-09-24 16:20:25,044 - INFO - Finished converting document AR_2020_WEB2.pdf in 8.46 sec.


2025-09-24 16:20:25,044 - INFO - Converted AR_2020_WEB2.pdf


2025-09-24 16:20:25,182 - INFO - All conversions complete. Check the 'parsed' folder.
