# Multi-format Conversion with Docling

In [2]:
# Install required libraries.
# Use the %pip magic instead of `!pip`: `!pip` runs whichever pip is first on
# the shell PATH, while %pip installs into the environment of the running
# kernel, so the imports in the next cell resolve against these packages.
%pip install docling pypdfium2 pyyaml


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import json
import yaml
import logging
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Logging: show INFO-level records (Docling's own progress messages as well as
# this notebook's) in the cell output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Input documents, given relative to the notebook's working directory.
# The list mixes Markdown, PDF and PPTX inputs.
paths = [
    Path(name)
    for name in (
        "README.md",
        "tests/daily-earnings-dashboard-us.pdf",
        "tests/NVIDIA-Corporation-2023-08-02.pptx",
        "tests/AR_2020_WEB2.pdf",
    )
]

# Destination folder for every exported file; created on first run and
# reused (not an error) if it is already there.
output_dir = Path("parsed")
output_dir.mkdir(exist_ok=True)

# Configure Docling converter

# PDF inputs: standard PDF pipeline, reading pages through the pypdfium2 backend.
pdf_option = PdfFormatOption(
    pipeline_cls=StandardPdfPipeline,
    backend=PyPdfiumDocumentBackend,
)

# DOCX inputs: the simple pipeline.
word_option = WordFormatOption(pipeline_cls=SimplePipeline)

# Input formats the converter is set up to accept.
supported_formats = [
    InputFormat.PDF,
    InputFormat.IMAGE,
    InputFormat.DOCX,
    InputFormat.HTML,
    InputFormat.PPTX,
    InputFormat.ASCIIDOC,
    InputFormat.MD,
]

# Only PDF and DOCX get explicit options here; the remaining formats rely on
# whatever defaults DocumentConverter supplies for them.
converter = DocumentConverter(
    allowed_formats=supported_formats,
    format_options={
        InputFormat.PDF: pdf_option,
        InputFormat.DOCX: word_option,
    },
)

# Run conversions over every input path using the converter configured above.
results = converter.convert_all(paths)

# Export each converted document to `output_dir` as Markdown, JSON and YAML.
# All three files are named after the input file's stem.
# NOTE: inputs that share a stem (e.g. report.pdf and report.docx) would
# overwrite each other's exports.
for res in results:
    stem = res.input.file.stem
    logger.info("Converted %s", res.input.file.name)

    # Markdown
    # UTF-8 is requested explicitly on every write: without it, write_text()
    # falls back to the platform's locale encoding, which can raise
    # UnicodeEncodeError or mangle non-ASCII document text (e.g. on Windows).
    (output_dir / f"{stem}.md").write_text(
        res.document.export_to_markdown(), encoding="utf-8"
    )

    # Build the dict form once; it feeds both the JSON and the YAML export.
    doc_dict = res.document.export_to_dict()

    # JSON
    (output_dir / f"{stem}.json").write_text(
        json.dumps(doc_dict, indent=2), encoding="utf-8"
    )
    # YAML (sort_keys=False keeps the key order of the exported dict)
    (output_dir / f"{stem}.yaml").write_text(
        yaml.safe_dump(doc_dict, sort_keys=False), encoding="utf-8"
    )

# Report the folder actually used rather than a hard-coded name, so the
# message stays correct if `output_dir` is changed.
logger.info("All conversions complete. Check the '%s' folder.", output_dir)

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document README.md
INFO:docling.document_converter:Finished converting document README.md in 0.00 sec.
INFO:__main__:Converted README.md
  from .autonotebook import tqdm as notebook_tqdm
INFO:docling.pipeline.base_pipeline:Processing document daily-earnings-dashboard-us.pdf
INFO:docling.document_converter:Finished converting document daily-earnings-dashboard-us.pdf in 8.69 sec.
INFO:__main__:Converted daily-earnings-dashboard-us.pdf
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document NVIDIA-Corporation-2023-08-02.pptx
INFO:docling.document_converter:Finished converting document NVIDIA-Corporation-2023-08-02.pptx in 1.34 sec.
INFO:__main__:Converted NVIDIA-Corporation-2023-08-02.pptx
INFO:__main__:All conversions complete. Check the 'parsed' folder.
