In [None]:
import time
import logging
from pathlib import Path
from typing import Iterable

from docling.document_converter import PdfFormatOption
from docling.document_converter import DocumentConverter

from docling.datamodel.settings import settings
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.pipeline_options import PdfPipelineOptions

In [None]:
# Logger used by the functions below to report conversion progress and results.
_log = logging.getLogger(__name__)

In [None]:
def export_documents(conv_results: Iterable[ConversionResult], output_dir: Path):
    """Write successfully converted documents to disk and tally the outcomes.

    ``output_dir`` is created if it does not exist yet. For every result whose
    status is ``ConversionStatus.SUCCESS`` two UTF-8 files named after the
    input file's stem are written: ``<stem>.md`` (markdown export) and
    ``<stem>.txt`` (markdown export with ``strict_text=True``). Partially
    converted documents only get their error messages logged; every other
    status is logged as a failure. Neither of those is exported.

    Args:
        conv_results: Conversion results to inspect; iterated exactly once.
        output_dir: Directory that receives the exported files.

    Returns:
        A ``(success_count, partial_success_count, failure_count)`` tuple.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # One (file suffix, export_to_markdown keyword arguments) pair per output file.
    export_variants = (
        (".md", {}),
        (".txt", {"strict_text": True}),
    )

    n_ok = n_partial = n_failed = 0

    for result in conv_results:
        status = result.status

        if status == ConversionStatus.SUCCESS:
            n_ok += 1
            stem = result.input.file.stem
            for suffix, export_kwargs in export_variants:
                target = output_dir / f"{stem}{suffix}"
                with target.open("w", encoding="utf-8") as fp:
                    fp.write(result.document.export_to_markdown(**export_kwargs))
            continue

        if status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {result.input.file} was partially converted with the following errors:"
            )
            for problem in result.errors:
                _log.info(f"\t{problem.error_message}")
            n_partial += 1
            continue

        # Any remaining status counts as a failed conversion.
        _log.info(f"Document {result.input.file} failed to convert.")
        n_failed += 1

    total = n_ok + n_partial + n_failed
    _log.info(
        f"Processed {total} docs, "
        f"of which {n_failed} failed "
        f"and {n_partial} were partially converted."
    )
    return n_ok, n_partial, n_failed

In [None]:
def main(
    input_dir: Path = Path("./data/pdf"),
    output_dir: Path = Path("./data/markdown/raw/bpk"),
) -> None:
    """Convert every PDF found in ``input_dir`` and export the results.

    The PDFs are converted with a docling ``DocumentConverter`` whose PDF
    pipeline has ``do_ocr`` and ``do_table_structure`` switched off, and the
    results are handed to ``export_documents`` which writes them into
    ``output_dir``.

    Args:
        input_dir: Directory scanned (non-recursively) for PDF files. The
            extension is matched case-insensitively.
        output_dir: Directory that receives the exported files.

    Raises:
        RuntimeError: If at least one document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    # Match the extension case-insensitively: glob("*.pdf") is case-sensitive
    # on POSIX file systems and would silently skip files such as
    # "UU_NO_11_2008.PDF". Sorting keeps the processing order reproducible.
    input_doc_paths = sorted(
        path
        for path in input_dir.glob("*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    if not input_doc_paths:
        # Covers both an empty and a missing input directory.
        _log.warning(f"No PDF files found in {input_dir}; nothing to convert.")
        return

    # Keep all inline debug visualizations disabled; set a flag to True to enable it.
    settings.debug.visualize_layout = False
    settings.debug.visualize_ocr = False
    settings.debug.visualize_tables = False
    settings.debug.visualize_cells = False

    # Converter settings: run the PDF pipeline without OCR and table structure.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = False

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    conv_results = doc_converter.convert_all(
        input_doc_paths,
        raises_on_error=False,  # to let conversion run through all and examine results at the end
    )
    # The timer is stopped only after export_documents has consumed
    # conv_results, so the duration also covers conversion work that
    # convert_all defers until iteration (assumed lazy - see docling docs).
    success_count, partial_success_count, failure_count = export_documents(
        conv_results, output_dir=output_dir
    )

    elapsed = time.perf_counter() - start_time

    _log.info(f"Document conversion complete in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )

In [None]:
# Entry point: start the batch conversion when this code runs as the top-level program.
if __name__ == "__main__":
    main()