<div style="background-color: #ADD8E6; border: 1px solid gray; padding: 3px">
        <h3>Data Preprocessing with Docling</h3>
        <ul>
                <li>Converts the first page of each PDF into a PNG image file, then moves the PDF into a separate folder.</li>
        </ul>
</div>

In [1]:
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
import os
import shutil

# Export the first page of every PDF in `source_dir` as a PNG in `output_dir`,
# then move the processed PDF into `pdf_dir`.
_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Scale factor handed to docling's `images_scale` when rendering page images.
IMAGE_RESOLUTION_SCALE = 2.0
source_dir = "data2"
output_dir = "data2"
pdf_dir = "pdf2"

try:
    files = os.listdir(source_dir)
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(pdf_dir, exist_ok=True)
    # Match the extension case-insensitively so "REPORT.PDF" is picked up too;
    # sort so the processing order (and the log) is reproducible.
    pdf_files = sorted(f for f in files if f.lower().endswith(".pdf"))

    # The pipeline options and the converter do not depend on the file being
    # processed, so build them once instead of once per PDF.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
except Exception:
    # Nothing can be converted without the directories and the converter.
    _log.exception("Error while preparing the conversion")
    pdf_files = []

for input_doc_path in pdf_files:
    source_path = f"{source_dir}/{input_doc_path}"
    _log.info("Converting %s...", source_path)

    # Handle failures per file so one bad PDF does not stop the whole batch.
    try:
        conv_res = doc_converter.convert(source_path)
        doc_filename = conv_res.input.file.stem

        # Only the first page is exported.
        first_page = next(iter(conv_res.document.pages.values()), None)
        if first_page is None or first_page.image is None:
            # Check before opening the output file so no empty PNG is left
            # behind, and keep the PDF where it is so it can be retried.
            _log.warning(
                "No page image produced for %s; leaving it in %s.",
                input_doc_path,
                source_dir,
            )
            continue

        page_image_filename = f"{output_dir}/{doc_filename}.png"
        with open(page_image_filename, "wb") as fp:
            first_page.image.pil_image.save(fp, format="PNG")

        # Move the PDF only after its image was written successfully.
        shutil.move(source_path, pdf_dir)
        _log.info("Conversion of %s complete.", input_doc_path)
    except Exception:
        # Broad on purpose: this is the top-level boundary of the batch job.
        # `exception` records the traceback that the plain message would hide.
        _log.exception("Error while converting %s", source_path)
