In [1]:
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption, ExcelFormatOption
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
    AcceleratorDevice,
    AcceleratorOptions,
    RapidOcrOptions,
)
from docling.datamodel.base_models import InputFormat
from docling.backend import pypdfium2_backend
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling.pipeline.simple_pipeline import SimplePipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def scrape_file(file_path, device=None):
    """Convert a document with Docling and return the parsed document.

    A fresh ``DocumentConverter`` is built on every call. It accepts PDF,
    image, DOCX, HTML, PPTX, AsciiDoc, Markdown and XLSX input. PDFs are run
    through an OCR + table-structure pipeline on the pypdfium2 backend; DOCX
    and XLSX are handled by ``SimplePipeline``.

    Args:
        file_path: Source handed to ``DocumentConverter.convert``.
        device: Optional ``AcceleratorDevice`` for the PDF pipeline. When
            omitted, ``AcceleratorDevice.CUDA`` is requested, as before. Pass
            another member (e.g. ``AcceleratorDevice.CPU`` or
            ``AcceleratorDevice.AUTO``) on machines without CUDA.

    Returns:
        The ``.document`` of the conversion result on success. On failure the
        caught exception instance is *returned*, not raised, so callers must
        check ``isinstance(result, Exception)`` before using the result. The
        failure is also logged with its traceback.
    """
    # Function-scope import keeps this cell self-contained.
    import logging

    try:
        ocr_options = RapidOcrOptions()
        pipeline_options = PdfPipelineOptions(do_table_structure=True, do_ocr=True)
        pipeline_options.ocr_options = ocr_options
        pipeline_options.ocr_options.lang = ["en"]
        pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=4,
            device=AcceleratorDevice.CUDA if device is None else device,
        )
        pipeline_options.table_structure_options.do_cell_matching = False
        pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

        # Kept from the original; presumably required by a downstream consumer
        # of the legacy document format -- TODO confirm.
        pipeline_options.create_legacy_output = True

        converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
                InputFormat.XLSX,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=pypdfium2_backend.PyPdfiumDocumentBackend,
                ),
                InputFormat.DOCX: WordFormatOption(
                    pipeline_cls=SimplePipeline
                ),
                InputFormat.XLSX: ExcelFormatOption(
                    pipeline_cls=SimplePipeline
                ),
            }
        )
        doc = converter.convert(source=file_path).document
        print("Processed")
        return doc
    except Exception as e:
        # Preserve the return-the-exception contract, but do not swallow the
        # failure silently: record the traceback so the root cause is visible
        # even if the caller forgets to inspect the result.
        logging.getLogger(__name__).exception("Failed to convert %r", file_path)
        return e

In [3]:
# Absolute, machine-specific path of the PDF handed to scrape_file();
# update it when running this notebook on another machine.
file = r'C:\Github\mindfolder\server\notebooks\test\Builder_ai_Overview.pdf'


In [5]:
resp = scrape_file(file)

# scrape_file() returns (rather than raises) any exception it hits. Surface it
# here so a failed conversion stops the notebook at the real cause instead of
# failing later with an unrelated AttributeError on the exception object.
if isinstance(resp, Exception):
    raise resp

CUDA is not available in the system. Fall back to 'CPU'
CUDA is not available in the system. Fall back to 'CPU'
CUDA is not available in the system. Fall back to 'CPU'


Processed


In [7]:
output = r'C:\Github\mindfolder\server\notebooks\test\test.md'

# Render before opening the target: open(..., 'w') truncates immediately, so a
# failing export would otherwise leave an empty file behind.
markdown_text = resp.export_to_markdown()

# Explicit UTF-8: the platform default encoding (often a legacy code page on
# Windows) can raise UnicodeEncodeError on non-ASCII text from the document.
with open(output, 'w', encoding='utf-8') as f:
    f.write(markdown_text)